Coverage for src / wiktextract / extractor / th / sound.py: 50%
456 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-02 00:27 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process a pronunciation section: handle templates that are direct
    children of the section first, then templates nested in list items."""
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name == "zh-forms":
            # Imported lazily to avoid a circular import with page.py.
            from .page import extract_zh_forms

            extract_zh_forms(wxr, base_data, template)
        else:
            extract_sound_template(wxr, base_data, template)
    for child_list in level_node.find_child(NodeKind.LIST):
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            for template in item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, template)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Route a pronunciation template to its dedicated extractor.

    Checks run in order, so specific templates (th-pron, zh-pron,
    ko-IPA, …) are matched before the generic "*-IPA"/"*-pron" handler.
    """
    name = t_node.template_name
    lname = name.lower()
    if name in ("ja-pron", "ja-IPA"):
        extract_ja_pron_template(wxr, base_data, t_node)
    elif name == "th-pron":
        extract_th_pron_template(wxr, base_data, t_node)
    elif name == "lo-pron":
        extract_lo_pron_template(wxr, base_data, t_node)
    elif name == "zh-pron":
        extract_zh_pron_template(wxr, base_data, t_node)
    elif lname == "ko-ipa":
        extract_ko_ipa_template(wxr, base_data, t_node)
    elif lname == "ipa" or lname.endswith(("-ipa", "-pron")):
        # Generic fallback for the many language-specific IPA templates.
        extract_ipa_template(wxr, base_data, t_node)
    elif name == "X-SAMPA":
        extract_x_sampa_template(wxr, base_data, t_node)
    elif name == "enPR":
        extract_enpr_template(wxr, base_data, t_node)
    elif name in ("audio", "Audio", "เสียง"):
        extract_audio_template(wxr, base_data, t_node)
    elif name in ("rhymes", "rhyme"):
        extract_rhymes_template(wxr, base_data, t_node)
    elif name in ("homophones", "homophone", "hmp"):
        extract_homophones_template(wxr, base_data, t_node)
    elif name in ("hyphenation", "hyph"):
        extract_hyphenation_template(wxr, base_data, t_node)
    elif name in ("คำอ่านไทย", "คอท"):
        extract_approximate_th_pron(wxr, base_data, t_node)
def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a generic IPA/pron template and harvest pronunciations from
    each of its list items plus any leftover top-level nodes."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    leftovers = []
    for child in expanded_node.children:
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_ipa_list_item(wxr, base_data, list_item)
        else:
            leftovers.append(child)
    if leftovers:
        # Wrap the stray nodes in a synthetic root so they can be fed
        # through the same list-item logic.
        wrapper = WikiNode(NodeKind.ROOT, 0)
        wrapper.children = leftovers
        extract_ipa_list_item(wxr, base_data, wrapper)
    clean_node(wxr, base_data, expanded_node)
def extract_ipa_list_item(
    wxr: WiktextractContext, base_data: WordEntry, list_item: WikiNode
):
    """Collect IPA/romanization sounds and qualifier tags from one item."""
    raw_tags = []
    # Italic text carries location data for Template:vi-ipa.
    for italic in list_item.find_child(NodeKind.ITALIC):
        tag_text = clean_node(wxr, None, italic)
        if tag_text != "":
            raw_tags.append(tag_text)
    for span in list_item.find_html_recursively("span"):
        classes = span.attrs.get("class", "").split()
        if "qualifier-content" in classes or "ib-content" in classes:
            for piece in clean_node(wxr, None, span).split(","):
                piece = piece.strip()
                if piece != "":
                    raw_tags.append(piece)
        elif "IPA" in classes:
            sound = Sound(
                ipa=clean_node(wxr, None, span), raw_tags=raw_tags
            )
            if sound.ipa != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)
        elif "Latn" in classes:
            sound = Sound(
                roman=clean_node(wxr, None, span), raw_tags=raw_tags
            )
            if sound.roman != "":
                translate_raw_tags(sound)
                base_data.sounds.append(sound)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Japanese pronunciations from "ja-pron"/"ja-IPA".

    Each <li> in the expanded template holds one pronunciation: accent
    qualifier spans, the IPA, the romanization and the kana spelling,
    plus links to pitch-accent pattern pages. An optional "a"/"audio"
    parameter names a sound file.
    """
    # Thai labels of Japanese pitch-accent patterns -> canonical tags.
    JA_PRON_ACCENTS = {
        "นากาดากะ": "Nakadaka",
        "เฮบัง": "Heiban",
        "อาตามาดากะ": "Atamadaka",
        "โอดากะ": "Odaka",
    }
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in expanded_node.find_html_recursively("li"):
        sound = Sound()
        for span_tag in li_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "").split()
            if "usage-label-accent" in span_class:
                raw_tag = clean_node(wxr, None, span_tag).strip("() ")
                if raw_tag != "":
                    sound.raw_tags.append(raw_tag)
            elif "IPA" in span_class:
                sound.ipa = clean_node(wxr, None, span_tag)
            elif "Latn" in span_class:
                sound.roman = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") == "ja":
                sound.other = clean_node(wxr, None, span_tag)
        for link_node in li_tag.find_child(NodeKind.LINK):
            link_text = clean_node(wxr, None, link_node)
            if link_text in JA_PRON_ACCENTS:
                sound.tags.append(JA_PRON_ACCENTS[link_text])
        if sound.ipa != "" or sound.other != "":
            translate_raw_tags(sound)
            base_data.sounds.append(sound)
    # Fix: template parameter values are wikitext nodes, not necessarily
    # plain strings, so clean them instead of calling str.strip() on the
    # raw value (matches extract_ko_ipa_template).
    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        base_data.sounds.append(sound)
    clean_node(wxr, base_data, expanded_node)
def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Save the X-SAMPA transcription (first template argument)."""
    param = t_node.template_parameters.get(1, "")
    sound = Sound(ipa=clean_node(wxr, None, param), tags=["X-SAMPA"])
    if len(sound.ipa) > 0:
        base_data.sounds.append(sound)
def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Save an enPR pronunciation (first template argument)."""
    param = t_node.template_parameters.get(1, "")
    sound = Sound(enpr=clean_node(wxr, None, param))
    if len(sound.enpr) > 0:
        base_data.sounds.append(sound)
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an audio file reference from "audio"/"เสียง" templates.

    Argument 2 is the file name, argument 3 an optional caption;
    qualifier ("ib-content") spans in the expansion become raw tags.
    """
    sound = Sound()
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename != "":
        set_sound_file_url_fields(wxr, filename, sound)
        caption = clean_node(
            wxr, None, t_node.template_parameters.get(3, "")
        )
        if caption != "":
            sound.raw_tags.append(caption)
        expanded_node = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        for span_node in expanded_node.find_html_recursively(
            "span", attr_name="class", attr_value="ib-content"
        ):
            sound.raw_tags.extend(
                tag
                for tag in clean_node(wxr, None, span_node).split(",")
                if tag != ""
            )
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
        clean_node(wxr, base_data, t_node)
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    """Extract Thai pronunciation data from the "th-pron" table.

    The expanded template is an HTML table. Header cells (<th>) of each
    row decide how the row's data cells (<td>) are interpreted: IPA,
    homophones, an audio file, or transcription rows whose header text
    supplies raw tags. Headers with a rowspan keep applying to the
    following rows.
    """

    # A <th> that spans several rows; its raw_tags apply to every data
    # cell in the rows it still covers.
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # tags parsed from the header cell text
        rowspan: int  # remaining number of rows this header spans

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"  # Sound attribute the row's cell text goes to
            # Drop headers whose rowspan is exhausted; decrement the rest.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    # "(standard) IPA" row
                    field = "ipa"
                elif header_str.startswith("คำพ้องเสียง"):
                    # "homophones" row
                    field = "homophone"
                elif header_str == "ไฟล์เสียง":
                    # "audio file" row
                    field = "audio"
                elif header_str != "":
                    # Any other header contributes raw tags to this row
                    # and, via rowspan, possibly to following rows.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    # Header text may span lines; tags are ";"-separated.
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # The cell holds file links; the link target (largs[0])
                    # is the file name.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            # "[tag1, tag2]" qualifiers for later spans.
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # If the header tags translated to the
                                # "romanization" tag, store the text in the
                                # "roman" field instead.
                                # NOTE(review): `field` is not reset after
                                # this, so it stays "roman" for any later
                                # spans in the same row — confirm intended.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)
def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    """Extract Lao pronunciation, hyphenation and rhyme data from "lo-pron".

    Each list item of the expanded template starts with a label — a link
    ("สัทอักษรสากล" = IPA) or plain text ending in ":" — that switches the
    ``field`` state controlling how the spans that follow are read.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"  # meaning of the next data span
            raw_tag = ""  # qualifier text applied to IPA spans
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            # NOTE(review): raw_tag is never cleared, so a
                            # qualifier also applies to any later IPA span
                            # in the same list item — confirm intended.
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        span_lang = node.attrs.get("lang", "")
                        if span_lang == "lo" and field == "hyphenation":
                            # Syllables are "-"-separated Lao text.
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    if link_str == "สัทอักษรสากล":
                        # "IPA" link introduces the IPA spans.
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    node = node.strip()
                    if node == "การแบ่งพยางค์:":
                        # "syllabification:" label
                        field = "hyphenation"
                    elif node == "สัมผัส:":
                        # "rhymes:" label
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese pronunciations from an expanded "zh-pron" template."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            # Already handled as a nested child of an earlier list item.
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one zh-pron list item.

    Args:
        wxr: extraction context.
        list_item_node: the list item to process.
        raw_tags: tags inherited from ancestor list items.
        seen_lists: nested LIST nodes already handled here, so the
            caller's recursive scan does not process them a second time.

    Returns:
        Sounds found in this item and its nested lists; their raw tags
        are not yet translated (the caller does that).
    """
    current_tags = raw_tags[:]  # copy so the parent's tags stay untouched
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                if link_str.startswith(("File:", "ไฟล์:")):
                    # Audio file link.
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    # Other links act as tags for this item.
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove <sup> tag
                    if is_first_small_tag:
                        # Leading <small> text labels the whole item.
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the preceding sound.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    # Table following a "homophones" label.
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Recurse into the child list, passing down this item's
                # tags, and mark it so the caller skips it.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Parenthesized groups are recursed into; the text outside the
    parentheses is split first and its tags come before the group tags.
    Plain text is split on commas, colons, semicolons and " and ".
    """
    if "(" not in raw_tag_text:
        tags = []
        for piece in re.split(r",|:|;| and ", raw_tag_text):
            piece = piece.strip().removeprefix("incl. ").strip()
            if piece:
                tags.append(piece)
        return tags
    inner_tags = []
    remainder_parts = []
    cursor = 0
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        remainder_parts.append(raw_tag_text[cursor : match.start()])
        cursor = match.end()
        inner_tags.extend(split_zh_pron_raw_tag(match.group()[1:-1]))
    remainder_parts.append(raw_tag_text[cursor:])
    remainder = "".join(remainder_parts)
    if remainder != raw_tag_text:
        return split_zh_pron_raw_tag(remainder) + inner_tags
    # No balanced group matched (stray "(" only): keep the text whole.
    return [remainder]
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Convert one zh-pron <span> into Sound objects.

    The span may mix pronunciation text with a <small> qualifier, a
    romanization sub-span (lang "*-Latn") and a trailing "[Phonetic: …]"
    section, which becomes an extra Sound tagged "Phonetic".
    """
    sounds = []
    trailing_tags = []
    pron_nodes = []
    roman = ""
    phonetic = ""
    for idx, child in enumerate(span_tag.children):
        if isinstance(child, HTMLNode) and child.tag == "small":
            trailing_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # Everything after this marker is the phonetic reading.
            phonetic = clean_node(
                wxr, None, span_tag.children[idx + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(child)
    is_ipa = "IPA" in span_tag.attrs.get("class", "")
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if is_ipa:
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> qualifier is attached to the last pronunciation.
        sounds[-1].raw_tags.extend(trailing_tags)
    if phonetic != "":
        sounds.append(
            Sound(
                zh_pron=phonetic,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/"; the slash is ignored when the
    whole string starts with "/" (slash-delimited IPA). Text inside
    parentheses is never split. Returns stripped, non-empty fragments.
    """
    split_on_slash = not zh_pron.startswith("/")
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            (c in [",", ";", "→"] or (c == "/" and split_on_slash))
            and depth == 0
            and len(pron.strip()) > 0
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c

    # Fix: strip the trailing fragment like all the others, so the final
    # item no longer carries leading/trailing whitespace.
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophones from the table under zh-pron's homophone row."""
    results = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            cls = span_tag.attrs.get("class", "")
            lang = span_tag.attrs.get("lang", "")
            text = clean_node(wxr, None, span_tag)
            if (
                text in ["", "/"]
                or lang == ""
                or cls not in ["Hant", "Hans", "Hani"]
            ):
                continue
            homophone = Sound(homophone=text, raw_tags=raw_tags)
            # Script class distinguishes traditional/simplified forms.
            if cls == "Hant":
                homophone.tags.append("Traditional-Chinese")
            elif cls == "Hans":
                homophone.tags.append("Simplified-Chinese")
            results.append(homophone)
    return results
def extract_rhymes_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Add each rhyme link in the expanded template as a Sound."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for link in expanded_node.find_child(NodeKind.LINK):
        rhyme_text = clean_node(wxr, base_data, link)
        if len(rhyme_text) > 0:
            base_data.sounds.append(Sound(rhymes=rhyme_text))
def extract_homophones_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract homophones, with romanization and qualifier tags attached
    to the homophone span they follow."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    homophones = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for top_span in expanded_node.find_html(
        "span", attr_name="class", attr_value="homophones"
    ):
        for span_tag in top_span.find_html("span"):
            s_lang = span_tag.attrs.get("lang", "")
            s_classes = span_tag.attrs.get("class", "").split()
            if "tr" in s_classes and len(homophones) > 0:
                # Romanization belongs to the preceding homophone.
                homophones[-1].roman = clean_node(wxr, None, span_tag)
            elif s_lang == lang_code:
                word = clean_node(wxr, None, span_tag)
                if word != "":
                    homophones.append(Sound(homophone=word))
            elif "qualifier-content" in s_classes and len(homophones) > 0:
                qualifier = clean_node(wxr, None, span_tag)
                if qualifier != "":
                    homophones[-1].raw_tags.append(qualifier)
                    translate_raw_tags(homophones[-1])

    base_data.sounds.extend(homophones)
    # Run clean_node over the remaining links with base_data attached.
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_hyphenation_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Parse hyphenation parts ("‧"-separated) from the expansion."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        parts = [p.strip() for p in text.split("‧") if p.strip() != ""]
        if len(parts) > 0:
            base_data.hyphenations.append(Hyphenation(parts=parts))
def extract_ko_ipa_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract Korean pronunciations from a "ko-IPA" template.

    Three passes over the expansion: <ul>/<li> items give phonetic
    hangeul and IPA readings, a table gives romanizations per scheme,
    and an optional "a"/"audio" parameter names a sound file.
    """
    sounds = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    clean_node(wxr, word_entry, expanded_node)
    for ul_node in expanded_node.find_html("ul"):
        for li_node in ul_node.find_html("li"):
            if "ko-pron__ph" in li_node.attrs.get("class", ""):
                # Phonetic hangeul item; variants are "/"-separated and
                # wrapped in square brackets.
                for span_node in li_node.find_html(
                    "span", attr_name="lang", attr_value="ko"
                ):
                    hangeul_str = clean_node(wxr, None, span_node).strip("[]")
                    for hangeul in hangeul_str.split("/"):
                        if hangeul != "":
                            sounds.append(
                                Sound(hangeul=hangeul, tags=["phonetic"])
                            )
            else:
                # IPA item: <i> text carries qualifiers ("สัทอักษรสากล" =
                # "IPA" is a label, not a qualifier, so it is skipped).
                raw_tags = []
                for i_node in li_node.find_html("i"):
                    for raw_tag in clean_node(wxr, None, i_node).split("/"):
                        if raw_tag not in ["", "สัทอักษรสากล"]:
                            raw_tags.append(raw_tag)
                for span_node in li_node.find_html(
                    "span", attr_name="class", attr_value="IPA"
                ):
                    # "~" separates free-variation pronunciations.
                    ipas = clean_node(wxr, None, span_node)
                    for ipa in ipas.split("~"):
                        ipa = ipa.strip()
                        if ipa != "":
                            sound = Sound(ipa=ipa, raw_tags=raw_tags)
                            translate_raw_tags(sound)
                            sounds.append(sound)

    # Romanization table: each row's <th> names the scheme, the <td>
    # holds the romanized form.
    for table in expanded_node.find_html("table"):
        for tr in table.find_html("tr"):
            raw_tag = ""
            for th in tr.find_html("th"):
                raw_tag = clean_node(wxr, None, th)
            for td in tr.find_html("td"):
                roman = clean_node(wxr, None, td)
                if roman != "":
                    sound = Sound(roman=roman)
                    if raw_tag != "":
                        sound.raw_tags.append(raw_tag)
                        translate_raw_tags(sound)
                    sounds.append(sound)

    audio_file = clean_node(
        wxr,
        None,
        t_node.template_parameters.get(
            "a", t_node.template_parameters.get("audio", "")
        ),
    )
    if audio_file != "":
        sound = Sound()
        set_sound_file_url_fields(wxr, audio_file, sound)
        sounds.append(sound)
    word_entry.sounds.extend(sounds)
def extract_approximate_th_pron(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    # https://th.wiktionary.org/wiki/แม่แบบ:คำอ่านไทย
    """Save positional arguments 1-6 as approximate Thai pronunciations."""
    for index in range(1, 7):
        if index not in t_node.template_parameters:
            # Arguments are positional; stop at the first gap.
            break
        text = clean_node(wxr, None, t_node.template_parameters[index])
        if text != "":
            base_data.sounds.append(
                Sound(other=text, raw_tags=["เทียบเสียงภาษาไทยโดยประมาณ"])
            )