Coverage for src/wiktextract/extractor/th/sound.py: 51%
315 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process a pronunciation section.

    Sound templates may appear directly under the section heading or
    nested inside list items; both are forwarded to the same dispatcher.
    """
    templates = list(level_node.find_child(NodeKind.TEMPLATE))
    for lst in level_node.find_child(NodeKind.LIST):
        for item in lst.find_child(NodeKind.LIST_ITEM):
            templates.extend(item.find_child(NodeKind.TEMPLATE))
    for template in templates:
        extract_sound_template(wxr, base_data, template)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch one pronunciation template to its dedicated extractor.

    Unrecognized template names are silently ignored.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
        "ja-pron": extract_ja_pron_template,
        "ja-IPA": extract_ja_pron_template,
        "zh-pron": extract_zh_pron_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, base_data, t_node)
def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an {{IPA}} template and collect its pronunciation spans."""
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    extract_ipa_li_tag(wxr, base_data, expanded)
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, expanded)
def extract_ipa_li_tag(
    wxr: WiktextractContext, base_data: WordEntry, li_tag: HTMLNode
):
    """Collect IPA and romanization spans from an expanded pronunciation node.

    A span with class "qualifier-content" sets a raw tag that is applied
    to the sounds extracted afterwards; spans with class "IPA" fill the
    ``ipa`` field and spans with class "Latn" fill the ``roman`` field.

    The original IPA and Latn branches were line-for-line duplicates
    except for the target field; they are merged here.
    """
    raw_tag = ""
    for span_tag in li_tag.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in span_class:
            raw_tag = clean_node(wxr, None, span_tag)
        elif "IPA" in span_class or "Latn" in span_class:
            # "IPA" wins when both classes are present, matching the
            # original branch order.
            field = "ipa" if "IPA" in span_class else "roman"
            value = clean_node(wxr, None, span_tag)
            if value == "":
                continue
            sound = Sound()
            setattr(sound, field, value)
            if raw_tag != "":
                sound.raw_tags.append(raw_tag)
                translate_raw_tags(sound)
            base_data.sounds.append(sound)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand {{ja-pron}} / {{ja-IPA}} and process each rendered <li>."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li in expanded.find_html_recursively("li"):
        extract_ipa_li_tag(wxr, base_data, li)
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, expanded)
def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract {{X-SAMPA}}: the first positional argument is the
    transcription, stored in ``ipa`` with an "X-SAMPA" tag."""
    transcription = clean_node(
        wxr, None, t_node.template_parameters.get(1, "")
    )
    if transcription == "":
        return
    base_data.sounds.append(Sound(ipa=transcription, tags=["X-SAMPA"]))
def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract {{enPR}}: the first positional argument is the enPR value."""
    enpr_value = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_value == "":
        return
    base_data.sounds.append(Sound(enpr=enpr_value))
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an {{audio}} template.

    The second positional argument is the sound file name; the "a"
    parameter holds comma-separated accent qualifiers.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    accents = clean_node(wxr, None, t_node.template_parameters.get("a", ""))
    sound.raw_tags.extend(
        accent.strip() for accent in accents.split(",") if accent.strip() != ""
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, t_node)
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciations from the expanded {{th-pron}} table.

    Row headers decide what kind of data the row's cells hold (IPA,
    homophones, audio file, or other values tagged with the header text).
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    @dataclass
    class TableHeader:
        raw_tags: list[str]  # tag strings parsed from the header cell
        rowspan: int  # remaining rows this header still applies to

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Drop headers whose rowspan is exhausted; decrement the rest.
            new_headers = []
            for header in row_headers:
                if header.rowspan > 1:
                    header.rowspan -= 1
                    new_headers.append(header)
            row_headers = new_headers
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                # "(มาตรฐาน) สัทอักษรสากล" = "(standard) IPA"
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                # "คำพ้องเสียง" = "homophones"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                # "ไฟล์เสียง" = "audio file"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    # Any other non-empty header contributes raw tags to
                    # this row and, via rowspan, to following rows.
                    rowspan = 1
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    if re.fullmatch(r"\d+", rowspan_str):
                        rowspan = int(rowspan_str)
                    header = TableHeader([], rowspan)
                    for line in header_str.splitlines():
                        for raw_tag in line.strip("{}\n ").split(";"):
                            raw_tag = raw_tag.strip()
                            if raw_tag != "":
                                header.raw_tags.append(raw_tag)
                    row_headers.append(header)

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    # Audio rows hold file links; first link arg is the name.
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    # Each Thai-language span in the cell is one homophone.
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    raw_tags = []
                    for html_node in td_tag.find_child_recursively(
                        NodeKind.HTML
                    ):
                        if html_node.tag == "small":
                            # "<small>[tag, tag]</small>" adds cell-level tags.
                            node_str = clean_node(wxr, None, html_node)
                            if node_str.startswith("[") and node_str.endswith(
                                "]"
                            ):
                                for raw_tag in node_str.strip("[]").split(","):
                                    raw_tag = raw_tag.strip()
                                    if raw_tag != "":
                                        raw_tags.append(raw_tag)
                        elif html_node.tag == "span":
                            node_str = clean_node(wxr, None, html_node)
                            span_lang = html_node.attrs.get("lang", "")
                            span_class = html_node.attrs.get("class", "")
                            if node_str != "" and (
                                span_lang == "th" or span_class in ["IPA", "tr"]
                            ):
                                sound = Sound(raw_tags=raw_tags)
                                for header in row_headers:
                                    sound.raw_tags.extend(header.raw_tags)
                                translate_raw_tags(sound)
                                # If the header tags mapped to the
                                # "romanization" tag, store the value in the
                                # roman field; note this sticks for the rest
                                # of the row's cells.
                                if "romanization" in sound.tags:
                                    field = "roman"
                                setattr(sound, field, node_str)
                                base_data.sounds.append(sound)
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, expanded_node)
def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract pronunciations from the expanded {{lo-pron}} template.

    Walks each list item in document order, switching the current
    ``field`` when a label link or label text is seen, then storing the
    spans that follow according to that field.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for node in list_item.children:
                if isinstance(node, HTMLNode) and node.tag == "span":
                    span_class = node.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        # Qualifier applies to the IPA span that follows.
                        raw_tag = clean_node(wxr, None, node)
                    elif span_class == "IPA":
                        ipa = clean_node(wxr, None, node)
                        if ipa != "":
                            sound = Sound(ipa=ipa)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    else:
                        # Lao-language spans after the hyphenation label
                        # are syllable splits joined by "-".
                        span_lang = node.attrs.get("lang", "")
                        if span_lang == "lo" and field == "hyphenation":
                            span_str = clean_node(wxr, None, node)
                            if span_str != "":
                                base_data.hyphenations.append(
                                    Hyphenation(parts=span_str.split("-"))
                                )
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    link_str = clean_node(wxr, None, node)
                    # "สัทอักษรสากล" = "IPA" label link
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(node, str) and node.strip().endswith(":"):
                    node = node.strip()
                    # "การแบ่งพยางค์:" = "hyphenation:"
                    if node == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    # "สัมผัส:" = "rhymes:"
                    elif node == "สัมผัส:":
                        field = "rhymes"
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand {{zh-pron}} and walk its nested pronunciation lists."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Lists handled recursively inside a list item are recorded here so
    # the top-level scan does not process them a second time.
    visited = set()
    collected = []
    for lst in expanded.find_child_recursively(NodeKind.LIST):
        if lst in visited:
            continue
        for item in lst.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, item, [], visited)
            )
    for snd in collected:
        translate_raw_tags(snd)
    base_data.sounds.extend(collected)
    # Also pick up category links emitted by the template.
    clean_node(wxr, base_data, expanded)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Recursively extract sounds from one {{zh-pron}} list item.

    Args:
        raw_tags: tags inherited from ancestor list items.
        seen_lists: nested lists processed here are added so the caller's
            recursive list scan skips them.

    Returns:
        Sounds found in this item and its child lists.
    """
    current_tags = raw_tags[:]  # copy so the caller's list is not mutated
    sounds = []
    is_first_small_tag = True
    for node in list_item_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                link_str = clean_node(wxr, None, node.largs)
                node_str = clean_node(wxr, None, node)
                # File links are audio samples; other links become tags.
                # "ไฟล์:" is the Thai "File:" namespace prefix.
                if link_str.startswith(("File:", "ไฟล์:")):
                    filename = link_str.removeprefix("File:").removeprefix(
                        "ไฟล์:"
                    )
                    sound_data = Sound(raw_tags=current_tags)
                    set_sound_file_url_fields(wxr, filename, sound_data)
                    sounds.append(sound_data)
                elif node_str != "":
                    current_tags.append(node_str.strip("()"))
            elif isinstance(node, HTMLNode):
                if node.tag == "small":
                    # remove <sup> tag
                    if is_first_small_tag:
                        # Leading <small> text labels this item's
                        # pronunciations; split it into raw tags.
                        raw_tag_text = clean_node(
                            wxr,
                            None,
                            [
                                n
                                for n in node.children
                                if not (
                                    isinstance(n, HTMLNode) and n.tag == "sup"
                                )
                            ],
                        ).rstrip(":")
                        current_tags.extend(split_zh_pron_raw_tag(raw_tag_text))
                    elif len(sounds) > 0:
                        # Later <small> text qualifies the sound just added.
                        sounds[-1].raw_tags.extend(
                            split_zh_pron_raw_tag(clean_node(wxr, None, node))
                        )
                    is_first_small_tag = False
                elif node.tag == "span":
                    sounds.extend(extract_zh_pron_span(wxr, node, current_tags))
                elif (
                    node.tag == "table"
                    and len(current_tags) > 0
                    # "คำพ้องเสียง" = "homophones"
                    and current_tags[-1] == "คำพ้องเสียง"
                ):
                    sounds.extend(
                        extract_zh_pron_homophones_table(
                            wxr, node, current_tags
                        )
                    )
            elif node.kind == NodeKind.LIST:
                # Recurse into the child list, passing down current_tags.
                seen_lists.add(node)
                for next_list_item in node.find_child(NodeKind.LIST_ITEM):
                    sounds.extend(
                        extract_zh_pron_list_item(
                            wxr,
                            next_list_item,
                            current_tags,
                            seen_lists,
                        )
                    )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a {{zh-pron}} qualifier string into individual raw tags.

    Parenthesized groups are split recursively; the text outside them is
    split on commas, colons, semicolons and " and ", with any leading
    "incl. " prefix removed from each piece.
    """
    if "(" not in raw_tag_text:
        pieces = re.split(r",|:|;| and ", raw_tag_text)
        cleaned = (p.strip().removeprefix("incl. ").strip() for p in pieces)
        return [p for p in cleaned if p != ""]

    tags = []
    outside = ""  # text not covered by any parenthesized group
    last_end = 0
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        tags.extend(split_zh_pron_raw_tag(match.group(0)[1:-1]))
        outside += raw_tag_text[last_end : match.start()]
        last_end = match.end()
    outside += raw_tag_text[last_end:]
    if outside != raw_tag_text:
        # Tags from outside the parentheses go first.
        return split_zh_pron_raw_tag(outside) + tags
    # "(" present but no balanced group matched; keep the text as-is.
    tags.append(outside)
    return tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Extract pronunciations from one <span> of expanded {{zh-pron}}.

    Returns one Sound per pronunciation found in the span; spans whose
    class contains "IPA" fill the ``ipa`` field, others fill ``zh_pron``.
    """
    sounds = []
    small_tags = []  # raw tags from a nested <small> element
    pron_nodes = []  # nodes making up the pronunciation text itself
    roman = ""  # romanization from a nested span with a "*-Latn" lang
    phonetic_pron = ""  # text following a "[Phonetic:" marker, if any
    for index, node in enumerate(span_tag.children):
        if isinstance(node, HTMLNode) and node.tag == "small":
            small_tags = split_zh_pron_raw_tag(clean_node(wxr, None, node))
        elif (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "-Latn" in node.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, node).strip("() ")
        elif isinstance(node, str) and node.strip() == "[Phonetic:":
            # Everything after the marker is the phonetic pronunciation;
            # stop scanning once it is consumed.
            phonetic_pron = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(node)
    for zh_pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        zh_pron = zh_pron.strip("[]: ")
        if len(zh_pron) > 0:
            if "IPA" in span_tag.attrs.get("class", ""):
                sounds.append(
                    Sound(ipa=zh_pron, roman=roman, raw_tags=raw_tags)
                )
            else:
                sounds.append(
                    Sound(zh_pron=zh_pron, roman=roman, raw_tags=raw_tags)
                )
    if len(sounds) > 0:
        # The <small> qualifier is attached to the last sound only.
        sounds[-1].raw_tags.extend(small_tags)
    if phonetic_pron != "":
        sounds.append(
            Sound(
                zh_pron=phonetic_pron,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a {{zh-pron}} pronunciation string into individual parts.

    Splits on ",", ";", "→" — and "/" unless the whole string starts
    with "/" (a slash-delimited IPA notation) — when the separator is
    outside parentheses. Each returned part is stripped of surrounding
    whitespace; empty parts are dropped.

    Fix: the trailing part was previously appended unstripped, unlike
    the parts appended inside the loop.
    """
    slash_is_separator = not zh_pron.startswith("/")
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    for c in zh_pron:
        if (
            depth == 0
            and (c in (",", ";", "→") or (c == "/" and slash_is_separator))
            and pron.strip() != ""
        ):
            pron_list.append(pron.strip())
            pron = ""
        else:
            # Separators inside parentheses (and leading separators while
            # the buffer is blank) are kept as literal text.
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a {{zh-pron}} homophones table.

    Only spans that carry a "lang" attribute and a Han-script class
    ("Hant", "Hans" or "Hani") are kept; empty cells and "/" separators
    are skipped.
    """
    script_tag = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    results = []
    for cell in table.find_html_recursively("td"):
        for span in cell.find_html("span"):
            cls = span.attrs.get("class", "")
            lang = span.attrs.get("lang", "")
            text = clean_node(wxr, None, span)
            if (
                text in ("", "/")
                or lang == ""
                or cls not in ("Hant", "Hans", "Hani")
            ):
                continue
            snd = Sound(homophone=text, raw_tags=raw_tags)
            if cls in script_tag:
                snd.tags.append(script_tag[cls])
            results.append(snd)
    return results