Coverage for src/wiktextract/extractor/zh/pronunciation.py: 59%

317 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1import itertools 

2import re 

3from dataclasses import dataclass 

4 

5from wikitextprocessor import ( 

6 HTMLNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..share import set_sound_file_url_fields 

15from .models import Sound, WordEntry 

16from .tags import translate_raw_tags 

17 

18 

def extract_pronunciation_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: WikiNode
) -> None:
    """Collect sounds and categories from a pronunciation section.

    Handles the templates placed directly under the section heading, then
    every list item found anywhere below the heading.  Results are appended
    to ``base_data`` in place.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name == "zh-forms":
            # imported lazily to avoid a circular import with .page
            from .page import process_zh_forms

            process_zh_forms(wxr, base_data, t_node)
        else:
            sounds, cats = process_pron_template(wxr, t_node)
            base_data.sounds.extend(sounds)
            base_data.categories.extend(cats)
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        sounds, cats = process_pron_item_list_item(wxr, list_item)
        base_data.sounds.extend(sounds)
        base_data.categories.extend(cats)

35 

36 

def process_pron_item_list_item(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[list[Sound], list[str]]:
    """Process one pronunciation list item; return (sounds, categories).

    A single ``raw_tags`` list is shared by all templates of the item so
    that an "accent" template can add tags later templates pick up.
    """
    shared_raw_tags: list[str] = []
    all_sounds: list[Sound] = []
    all_categories: list[str] = []
    for t_node in list_item_node.find_child(NodeKind.TEMPLATE):
        sounds, cats = process_pron_template(wxr, t_node, shared_raw_tags)
        all_sounds.extend(sounds)
        all_categories.extend(cats)
    return all_sounds, all_categories

50 

51 

def process_pron_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str] | None = None,
) -> tuple[list[Sound], list[str]]:
    """Dispatch one pronunciation template to its handler.

    Returns a ``(sounds, categories)`` tuple.  ``raw_tags`` is shared
    across the templates of one list item: the "a"/"accent" branch appends
    to it, and later audio/IPA/enPR templates attach it to their sounds.

    Fix: the original signature used a mutable default (``raw_tags=[]``).
    Because the accent branch appends to that list, tags accumulated in
    the shared default object and leaked into every later call that did
    not pass its own list (e.g. the section-level calls in
    extract_pronunciation_section).  A ``None`` sentinel keeps the
    interface backward-compatible while giving each call a fresh list.
    """
    if raw_tags is None:
        raw_tags = []
    template_name = template_node.template_name.lower()
    sounds: list[Sound] = []
    categories: list[str] = []
    if template_name == "zh-pron":
        new_sounds, new_cats = process_zh_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name in ["homophones", "homophone", "hmp"]:
        sounds.extend(process_homophones_template(wxr, template_node))
    elif template_name in ["a", "accent"]:
        # https://zh.wiktionary.org/wiki/Template:Accent
        raw_tags.append(clean_node(wxr, None, template_node).strip("()"))
    elif template_name in ["audio", "音"]:
        sounds.extend(process_audio_template(wxr, template_node, raw_tags))
    elif template_name == "ipa":
        sounds.extend(process_ipa_template(wxr, template_node, raw_tags))
    elif template_name == "enpr":
        sounds.extend(process_enpr_template(wxr, template_node, raw_tags))
    elif template_name == "ja-pron":
        new_sounds, new_cats = extract_ja_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    elif template_name == "th-pron":
        new_sounds, new_cats = extract_th_pron_template(wxr, template_node)
        sounds.extend(new_sounds)
        categories.extend(new_cats)
    return sounds, categories

84 

85 

def process_zh_pron_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Expand a "zh-pron" template and extract its sounds and categories.

    https://zh.wiktionary.org/wiki/Template:Zh-pron
    Nested lists already handled during recursion are tracked in
    ``processed_lists`` so they are not walked twice.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    processed_lists: set[WikiNode] = set()
    sounds: list[Sound] = []
    cats: dict[str, list[str]] = {}
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in processed_lists:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            sounds.extend(
                process_zh_pron_list_item(wxr, list_item, [], processed_lists)
            )
    clean_node(wxr, cats, expanded)
    for sound in sounds:
        translate_raw_tags(sound)
    return sounds, cats.get("categories", [])

106 

107 

def process_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively process one list item of the expanded "zh-pron" template.

    ``raw_tags`` holds tags inherited from parent list items; this function
    works on a copy (``current_tags``) so siblings are not affected.
    Nested lists are added to ``seen_lists`` so the top-level caller skips
    them when it scans all lists recursively.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith("File:"):
                    # audio file link -> a Sound with the current tags
                    filename = link_str.removeprefix("File:")
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # plain link text acts as a raw tag (parens stripped)
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove "幫助"(help) <sup> tag
                    if is_first_small_tag:
                        # first <small> labels everything that follows
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip("：")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # later <small> tags qualify the sound just added
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "同音詞"
                ):
                    # a table directly after a "同音詞" (homophones) label
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )

            elif node.kind == NodeKind.LIST:
                # recurse into child lists, passing down accumulated tags
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        process_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds

175 

176 

def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a "zh-pron" label string into individual raw tags.

    Text without parentheses is split on comma/colon/semicolon variants
    (ASCII and fullwidth) and on "和" ("and") unless it is the final
    character.  Text containing parenthesised groups is split into the
    recursively-processed group contents plus whatever sits outside the
    groups (the outside tags come first in the result).
    """
    raw_tags = []
    if "（" not in raw_tag_text and "(" not in raw_tag_text:
        for raw_tag in re.split(r"，|,|：|、|；|;|和(?!$)", raw_tag_text):
            # "包括" ("including") prefixes a list of dialects; drop it
            raw_tag = raw_tag.strip().removeprefix("包括").strip()
            if raw_tag != "":
                raw_tags.append(raw_tag)
    else:
        # one alternative per parenthesis style (ASCII and fullwidth)
        processed_offsets = []
        for match in re.finditer(r"\([^()]+\)|（[^（）]+）", raw_tag_text):
            processed_offsets.append((match.start(), match.end()))
            # recurse on the group contents without the surrounding parens
            raw_tags.extend(
                split_zh_pron_raw_tag(
                    raw_tag_text[match.start() + 1 : match.end() - 1]
                )
            )
        # stitch together the text that was outside every matched group
        not_processed = ""
        last_end = 0
        for start, end in processed_offsets:
            not_processed += raw_tag_text[last_end:start]
            last_end = end
        not_processed += raw_tag_text[last_end:]
        if not_processed != raw_tag_text:
            # tags found outside the groups go before the group tags
            raw_tags = split_zh_pron_raw_tag(not_processed) + raw_tags
        else:
            # nothing matched (e.g. an unbalanced paren): keep as one tag
            raw_tags.append(not_processed)
    return raw_tags

204 

205 

def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract Sound objects from one pronunciation <span> of "zh-pron".

    The span may contain: the pronunciation text itself, a nested
    "*-Latn" span with a romanization, a <small> tag with qualifier tags
    for the last pronunciation, and an optional trailing
    "[實際讀音：…]" ("actual pronunciation") section.
    """
    sounds = []
    small_tags = []  # qualifier tags from a <small> child; added to last sound
    pron_nodes = []  # children that make up the pronunciation text
    roman = ""
    phonetic_pron = ""
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[實際讀音：":
            # everything after this marker up to "]" is the phonetic reading
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            # spans with class "IPA" carry IPA; others carry romanized forms
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # <small> qualifiers apply only to the last pronunciation
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["實際讀音"],
            )
        )
    return sounds

252 

253 

def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→", and "/" — the slash only when the whole
    string does not start with "/" (so enclosed IPA like "/a/" stays in
    one piece).  Both ASCII and fullwidth parentheses suppress splitting.
    Note: intermediate parts are stripped, the final part is not.
    """
    slash_is_separator = not zh_pron.startswith("/")
    depth = 0  # parenthesis nesting level
    parts: list[str] = []
    buffer: list[str] = []
    for ch in zh_pron:
        is_separator = ch in (",", ";", "→") or (
            ch == "/" and slash_is_separator
        )
        if is_separator and depth == 0 and "".join(buffer).strip():
            parts.append("".join(buffer).strip())
            buffer = []
            continue
        if ch in ("(", "（"):
            depth += 1
        elif ch in (")", "）"):
            depth -= 1
        buffer.append(ch)

    tail = "".join(buffer)
    if tail.strip() != "":
        parts.append(tail)
    return parts

279 

280 

def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from the table after a "同音詞" label.

    Only spans with a ``lang`` attribute and a Chinese script class are
    kept; "Hant"/"Hans" classes are mapped to script tags.
    """
    script_tag_names = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    sounds: list[Sound] = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            if span_class not in ("Hant", "Hans", "Hani"):
                continue
            if span_tag.attrs.get("lang", "") == "":
                continue
            word = clean_node(wxr, None, span_tag)
            if word in ("", "/"):
                continue
            sound = Sound(homophone=word, raw_tags=raw_tags)
            if span_class in script_tag_names:
                sound.tags.append(script_tag_names[span_class])
            sounds.append(sound)
    return sounds

302 

303 

def process_homophones_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Sound]:
    """Collect homophones from a "homophones"/"hmp" template.

    https://zh.wiktionary.org/wiki/Template:homophones
    Words are in consecutive numbered parameters starting at 2.
    """
    sounds: list[Sound] = []
    word_index = 2
    while word_index in template_node.template_parameters:
        homophone = clean_node(
            wxr, None, template_node.template_parameters.get(word_index, "")
        )
        if len(homophone) > 0:
            sounds.append(Sound(homophone=homophone))
        word_index += 1
    return sounds

318 

319 

def process_audio_template(
    wxr: WiktextractContext, template_node: TemplateNode, raw_tags: list[str]
) -> list[Sound]:
    """Build one Sound from an "audio"/"音" template.

    https://zh.wiktionary.org/wiki/Template:Audio
    Parameter 2 is the file name, parameter 3 an optional caption that
    becomes a raw tag; ``raw_tags`` from the surrounding item are added.
    """
    filename = clean_node(
        wxr, None, template_node.template_parameters.get(2, "")
    )
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    caption = clean_node(
        wxr, None, template_node.template_parameters.get(3, "")
    )
    if caption != "":
        sound.raw_tags.append(caption)
    sound.raw_tags.extend(raw_tags)
    return [sound]

336 

337 

def process_ipa_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build Sounds from an "IPA" template.

    https://zh.wiktionary.org/wiki/Template:IPA
    IPA strings are in consecutive numbered parameters starting at 2.
    """
    sounds: list[Sound] = []
    index = 2
    while index in template_node.template_parameters:
        ipa_str = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(ipa=ipa_str, raw_tags=raw_tags))
        index += 1
    return sounds

356 

357 

def process_enpr_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    raw_tags: list[str],
) -> list[Sound]:
    """Build Sounds from an "enPR" template.

    https://zh.wiktionary.org/wiki/Template:enPR
    At most three consecutive numbered parameters (1-3) are read.
    """
    sounds: list[Sound] = []
    for index in (1, 2, 3):
        if index not in template_node.template_parameters:
            break
        enpr_str = clean_node(
            wxr, None, template_node.template_parameters.get(index)
        )
        sounds.append(Sound(enpr=enpr_str, raw_tags=raw_tags))
    return sounds

376 

377 

def extract_ja_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract Japanese pronunciations from an expanded "ja-pron" template.

    Each <li> becomes one Sound; spans are classified by CSS class
    (accent label, IPA, romanization) or ``lang="ja"`` (kana form).
    Returns a (sounds, categories) tuple.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    categories: dict[str, list[str]] = {}
    sounds: list[Sound] = []
    for li_tag in expanded.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            css_class = span_tag.attrs.get("class", "")
            if "usage-label-accent" in css_class:
                accent = clean_node(wxr, None, span_tag).strip("() ")
                if accent != "":
                    sound.raw_tags.append(accent)
            elif "IPA" in css_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in css_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        if sound.ipa == "" and sound.other == "":
            continue  # list item carried no pronunciation
        translate_raw_tags(sound)
        sounds.append(sound)

    clean_node(wxr, categories, expanded)
    return sounds, categories.get("categories", [])

406 

407 

def extract_th_pron_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> tuple[list[Sound], list[str]]:
    """Extract sounds and categories from the expanded "th-pron" template.

    The template expands to a table.  Row headers (<th>) either switch the
    current ``field`` ("ipa", "homophone", "audio") or carry raw tags that
    apply, via ``rowspan``, to the data cells of the rows they span.
    """

    @dataclass
    class TableHeader:
        raw_tags: list[str]
        rowspan: int  # remaining number of rows this header still covers

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cats = {}
    sounds = []
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # drop headers whose rowspan is exhausted; decrement the rest
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(標準泰語) IPA"):
                    # "(standard Thai) IPA" column
                    field = "ipa"
                elif header_str.startswith("同音詞"):
                    # "homophones" column
                    field = "homophone"
                elif header_str == "音頻":
                    # "audio" column
                    field = "audio"
                elif header_str != "":
                    # any other header text is a set of raw tags
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # audio cells contain file links
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                # "[tag, tag]" qualifiers for later spans
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                            elif len(sounds) > 0:
                                # other <small> text is the romanization of
                                # the sound added just before it
                                sounds[-1].roman = node_str
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # a header tag may mark this row as
                                # romanization; store in the right field
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                sounds.append(sound)

    clean_node(wxr, cats, expanded_node)
    return sounds, cats.get("categories", [])