Coverage for src/wiktextract/extractor/th/sound.py: 48%
295 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Process a pronunciation section.

    Handles templates placed directly under the section level node as
    well as templates nested inside list items.
    """
    for template in level_node.find_child(NodeKind.TEMPLATE):
        extract_sound_template(wxr, base_data, template)
    for child_list in level_node.find_child(NodeKind.LIST):
        for item in child_list.find_child(NodeKind.LIST_ITEM):
            for template in item.find_child(NodeKind.TEMPLATE):
                extract_sound_template(wxr, base_data, template)
def extract_sound_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Dispatch a pronunciation template to its dedicated extractor.

    Unknown template names are silently ignored.
    """
    handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
        "ja-pron": extract_ja_pron_template,
        "ja-IPA": extract_ja_pron_template,
        "zh-pron": extract_zh_pron_template,
    }
    handler = handlers.get(t_node.template_name)
    if handler is not None:
        handler(wxr, base_data, t_node)
def extract_ipa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand an ``IPA`` template and harvest its pronunciation spans."""
    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    extract_ipa_li_tag(wxr, base_data, parsed)
    # pick up any category links produced by the expansion
    clean_node(wxr, base_data, parsed)
def extract_ipa_li_tag(
    wxr: WiktextractContext, base_data: WordEntry, li_tag: HTMLNode
):
    """Collect IPA and romanization spans from an expanded list item.

    A ``qualifier-content`` span sets a raw tag that is attached to every
    subsequent sound extracted from the same list item.
    """
    raw_tag = ""
    for span_tag in li_tag.find_html_recursively("span"):
        classes = span_tag.attrs.get("class", "").split()
        if "qualifier-content" in classes:
            raw_tag = clean_node(wxr, None, span_tag)
            continue
        # map the span's class to the Sound field it fills
        if "IPA" in classes:
            field = "ipa"
        elif "Latn" in classes:
            field = "roman"
        else:
            continue
        value = clean_node(wxr, None, span_tag)
        if value == "":
            continue
        sound = Sound()
        setattr(sound, field, value)
        if raw_tag != "":
            sound.raw_tags.append(raw_tag)
        translate_raw_tags(sound)
        base_data.sounds.append(sound)
def extract_ja_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Expand a ``ja-pron``/``ja-IPA`` template and process each ``<li>``."""
    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for li_tag in parsed.find_html_recursively("li"):
        extract_ipa_li_tag(wxr, base_data, li_tag)
    clean_node(wxr, base_data, parsed)
def extract_x_sampa_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first argument of an ``X-SAMPA`` template as an IPA value."""
    ipa_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if ipa_text != "":
        base_data.sounds.append(Sound(ipa=ipa_text, tags=["X-SAMPA"]))
def extract_enpr_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the first argument of an ``enPR`` template."""
    enpr_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_text != "":
        base_data.sounds.append(Sound(enpr=enpr_text))
def extract_audio_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract an ``audio`` template.

    Argument 2 is the sound file name; argument "a" is an optional
    comma-separated list of accent qualifiers stored as raw tags.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return
    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    accents = clean_node(wxr, None, t_node.template_parameters.get("a", ""))
    sound.raw_tags.extend(
        part.strip() for part in accents.split(",") if part.strip() != ""
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # pick up category links attached to the template
    clean_node(wxr, base_data, t_node)
@dataclass
class TableHeader:
    """A pronunciation-table header cell."""

    text: str  # cleaned header text, used as a raw tag
    rowspan: int  # remaining number of data rows this header applies to
def extract_th_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract sounds from an expanded ``th-pron`` table.

    https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_tag in expanded_node.find_html("table"):
        active_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # drop headers whose rowspan is exhausted; the survivors
            # consume one more row
            active_headers = [h for h in active_headers if h.rowspan > 1]
            for header in active_headers:
                header.rowspan -= 1
            for th_tag in tr_tag.find_html("th"):
                header_str = clean_node(wxr, None, th_tag)
                if header_str.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                elif header_str.startswith("คำพ้องเสียง"):
                    field = "homophone"
                elif header_str == "ไฟล์เสียง":
                    field = "audio"
                elif header_str != "":
                    rowspan_str = th_tag.attrs.get("rowspan", "1")
                    rowspan = (
                        int(rowspan_str)
                        if re.fullmatch(r"\d+", rowspan_str)
                        else 1
                    )
                    active_headers.append(TableHeader(header_str, rowspan))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename != "":
                            sound = Sound()
                            set_sound_file_url_fields(wxr, filename, sound)
                            base_data.sounds.append(sound)
                elif field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                else:
                    cell_text = clean_node(wxr, None, td_tag)
                    if cell_text != "":
                        sound = Sound()
                        setattr(sound, field, cell_text)
                        sound.raw_tags.extend(h.text for h in active_headers)
                        translate_raw_tags(sound)
                        base_data.sounds.append(sound)

    clean_node(wxr, base_data, expanded_node)
def extract_lo_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract sounds and hyphenation from an expanded ``lo-pron`` list.

    https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            raw_tag = ""
            for child in list_item.children:
                if isinstance(child, HTMLNode) and child.tag == "span":
                    span_class = child.attrs.get("class", "")
                    if "qualifier-content" in span_class:
                        raw_tag = clean_node(wxr, None, child)
                    elif span_class == "IPA":
                        ipa_str = clean_node(wxr, None, child)
                        if ipa_str != "":
                            sound = Sound(ipa=ipa_str)
                            if raw_tag != "":
                                sound.raw_tags.append(raw_tag)
                                translate_raw_tags(sound)
                            base_data.sounds.append(sound)
                    elif (
                        child.attrs.get("lang", "") == "lo"
                        and field == "hyphenation"
                    ):
                        span_str = clean_node(wxr, None, child)
                        if span_str != "":
                            base_data.hyphenations.append(
                                Hyphenation(parts=span_str.split("-"))
                            )
                elif (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                ):
                    link_str = clean_node(wxr, None, child)
                    if link_str == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_str != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_str))
                elif isinstance(child, str) and child.strip().endswith(":"):
                    # a plain-text label switches the field for what follows
                    label = child.strip()
                    if label == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    elif label == "สัมผัส:":
                        field = "rhymes"

    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract the nested pronunciation lists of a ``zh-pron`` template."""
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    visited = set()  # lists already handled via recursion
    collected = []
    for list_node in expanded_node.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                extract_zh_pron_list_item(wxr, list_item, [], visited)
            )
    for sound in collected:
        translate_raw_tags(sound)
    base_data.sounds.extend(collected)
    clean_node(wxr, base_data, expanded_node)
def extract_zh_pron_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    raw_tags: list[str],
    seen_lists: set[WikiNode],
) -> list[Sound]:
    """Walk one zh-pron list item and return its sounds.

    Raw tags accumulate along the way and flow into nested child lists;
    nested lists are recorded in *seen_lists* so the caller skips them.
    """
    current_tags = raw_tags[:]
    sounds = []
    is_first_small_tag = True
    for child in list_item_node.children:
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.LINK:
            target = clean_node(wxr, None, child.largs)
            text = clean_node(wxr, None, child)
            if target.startswith(("File:", "ไฟล์:")):
                filename = target.removeprefix("File:").removeprefix("ไฟล์:")
                audio = Sound(raw_tags=current_tags)
                set_sound_file_url_fields(wxr, filename, audio)
                sounds.append(audio)
            elif text != "":
                current_tags.append(text.strip("()"))
        elif isinstance(child, HTMLNode):
            if child.tag == "small":
                if is_first_small_tag:
                    # drop <sup> footnote markers before reading the tag text
                    no_sup = [
                        n
                        for n in child.children
                        if not (isinstance(n, HTMLNode) and n.tag == "sup")
                    ]
                    tag_text = clean_node(wxr, None, no_sup).rstrip(":")
                    current_tags.extend(split_zh_pron_raw_tag(tag_text))
                elif len(sounds) > 0:
                    # later <small> text qualifies the most recent sound
                    sounds[-1].raw_tags.extend(
                        split_zh_pron_raw_tag(clean_node(wxr, None, child))
                    )
                is_first_small_tag = False
            elif child.tag == "span":
                sounds.extend(extract_zh_pron_span(wxr, child, current_tags))
            elif (
                child.tag == "table"
                and len(current_tags) > 0
                and current_tags[-1] == "คำพ้องเสียง"
            ):
                sounds.extend(
                    extract_zh_pron_homophones_table(wxr, child, current_tags)
                )
        elif child.kind == NodeKind.LIST:
            seen_lists.add(child)
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                sounds.extend(
                    extract_zh_pron_list_item(
                        wxr, sub_item, current_tags, seen_lists
                    )
                )
    return sounds
def split_zh_pron_raw_tag(raw_tag_text: str) -> list[str]:
    """Split a zh-pron qualifier string into individual raw tags.

    Parenthesized groups are split recursively, and the text outside
    them is split on commas, colons, semicolons and " and ". An
    "incl. " prefix on a tag is removed.
    """
    if "(" not in raw_tag_text:
        pieces = re.split(r",|:|;| and ", raw_tag_text)
        return [
            cleaned
            for piece in pieces
            if (cleaned := piece.strip().removeprefix("incl. ").strip()) != ""
        ]
    raw_tags = []
    consumed_spans = []
    for match in re.finditer(r"\([^()]+\)", raw_tag_text):
        consumed_spans.append((match.start(), match.end()))
        inner = raw_tag_text[match.start() + 1 : match.end() - 1]
        raw_tags.extend(split_zh_pron_raw_tag(inner))
    # stitch together whatever lies outside the parenthesized groups
    leftover = ""
    prev_end = 0
    for start, end in consumed_spans:
        leftover += raw_tag_text[prev_end:start]
        prev_end = end
    leftover += raw_tag_text[prev_end:]
    if leftover != raw_tag_text:
        raw_tags = split_zh_pron_raw_tag(leftover) + raw_tags
    else:
        # nothing matched (e.g. an unbalanced "("): keep the text whole
        raw_tags.append(leftover)
    return raw_tags
def extract_zh_pron_span(
    wxr: WiktextractContext, span_tag: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Turn one zh-pron pronunciation ``<span>`` into ``Sound`` objects."""
    sounds = []
    trailing_tags = []  # from a nested <small>, attached to the last sound
    pron_nodes = []
    roman = ""
    phonetic = ""
    for index, child in enumerate(span_tag.children):
        if isinstance(child, HTMLNode) and child.tag == "small":
            trailing_tags = split_zh_pron_raw_tag(clean_node(wxr, None, child))
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "span"
            and "-Latn" in child.attrs.get("lang", "")
        ):
            roman = clean_node(wxr, None, child).strip("() ")
        elif isinstance(child, str) and child.strip() == "[Phonetic:":
            # everything after this marker is a phonetic respelling
            phonetic = clean_node(
                wxr, None, span_tag.children[index + 1 :]
            ).strip("] ")
            break
        else:
            pron_nodes.append(child)

    is_ipa = "IPA" in span_tag.attrs.get("class", "")
    for pron in split_zh_pron(clean_node(wxr, None, pron_nodes)):
        pron = pron.strip("[]: ")
        if len(pron) == 0:
            continue
        if is_ipa:
            sounds.append(Sound(ipa=pron, roman=roman, raw_tags=raw_tags))
        else:
            sounds.append(Sound(zh_pron=pron, roman=roman, raw_tags=raw_tags))
    if len(sounds) > 0:
        sounds[-1].raw_tags.extend(trailing_tags)
    if phonetic != "":
        sounds.append(
            Sound(
                zh_pron=phonetic,
                roman=roman,
                raw_tags=raw_tags + ["Phonetic"],
            )
        )
    return sounds
def split_zh_pron(zh_pron: str) -> list[str]:
    """Split a pronunciation string on separators outside parentheses.

    Separators are ",", ";", "→" and "/" — except that "/" is not a
    separator when the whole string starts with "/" (a single
    /.../-style transcription). Separator characters nested inside
    parentheses are kept verbatim.

    Returns the list of stripped, non-empty fragments.
    """
    depth = 0  # current parenthesis nesting level
    pron_list = []
    pron = ""
    slash_is_literal = zh_pron.startswith("/")
    for c in zh_pron:
        is_separator = c in [",", ";", "→"] or (
            c == "/" and not slash_is_literal
        )
        if is_separator and depth == 0 and len(pron.strip()) > 0:
            pron_list.append(pron.strip())
            pron = ""
        else:
            if c == "(":
                depth += 1
            elif c == ")":
                depth -= 1
            pron += c
    # Bug fix: strip the trailing fragment too — previously only the
    # fragments appended inside the loop were stripped, so the last
    # pronunciation kept surrounding whitespace.
    if pron.strip() != "":
        pron_list.append(pron.strip())
    return pron_list
def extract_zh_pron_homophones_table(
    wxr: WiktextractContext, table: HTMLNode, raw_tags: list[str]
) -> list[Sound]:
    """Collect homophone words from a zh-pron homophone table."""
    sounds = []
    for td_tag in table.find_html_recursively("td"):
        for span_tag in td_tag.find_html("span"):
            span_class = span_tag.attrs.get("class", "")
            span_lang = span_tag.attrs.get("lang", "")
            span_str = clean_node(wxr, None, span_tag)
            # skip empty cells, separators, and non-Han script spans
            if (
                span_str in ["", "/"]
                or span_lang == ""
                or span_class not in ["Hant", "Hans", "Hani"]
            ):
                continue
            sound = Sound(homophone=span_str, raw_tags=raw_tags)
            if span_class == "Hant":
                sound.tags.append("Traditional-Chinese")
            elif span_class == "Hans":
                sound.tags.append("Simplified-Chinese")
            sounds.append(sound)
    return sounds