Coverage for src/wiktextract/extractor/th/sound.py: 74%
141 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..share import set_sound_file_url_fields
15from .models import Hyphenation, Sound, WordEntry
16from .tags import translate_raw_tags
def extract_sound_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract pronunciation data from a sound section into ``base_data``.

    Direct child templates named ``th-pron`` or ``lo-pron`` are handed to
    their dedicated extractors; other templates at this level are ignored.
    Afterwards every list item of every direct child list is passed to
    ``extract_sound_list_item``.
    """
    pron_handlers = {
        "th-pron": extract_th_pron_template,
        "lo-pron": extract_lo_pron_template,
    }
    for template in level_node.find_child(NodeKind.TEMPLATE):
        handler = pron_handlers.get(template.template_name)
        if handler is not None:
            handler(wxr, base_data, template)

    for sound_list in level_node.find_child(NodeKind.LIST):
        for item in sound_list.find_child(NodeKind.LIST_ITEM):
            extract_sound_list_item(wxr, base_data, item)
def extract_sound_list_item(
    wxr: WiktextractContext,
    base_data: WordEntry,
    list_item: WikiNode,
) -> None:
    """Dispatch the templates of one sound list item to their extractors.

    Only direct child templates named ``IPA``, ``X-SAMPA``, ``enPR`` and
    ``audio`` are handled; any other template is skipped.
    """
    template_handlers = {
        "IPA": extract_ipa_template,
        "X-SAMPA": extract_x_sampa_template,
        "enPR": extract_enpr_template,
        "audio": extract_audio_template,
    }
    for template in list_item.find_child(NodeKind.TEMPLATE):
        handler = template_handlers.get(template.template_name)
        if handler is not None:
            handler(wxr, base_data, template)
def extract_ipa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Append the IPA value of an ``IPA`` template to ``base_data.sounds``.

    The value is read from positional parameter 2 of the template. Nothing
    is added when the cleaned value is empty.
    """
    ipa_text = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if ipa_text == "":
        return
    base_data.sounds.append(Sound(ipa=ipa_text))
    # Second pass with ``base_data`` as the target; presumably this records
    # categories emitted by the template — confirm against ``clean_node``.
    # NOTE(review): the original indentation was not recoverable; this call
    # is assumed to run only when an IPA value was found, as in
    # ``extract_audio_template`` — confirm.
    clean_node(wxr, base_data, t_node)
def extract_x_sampa_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Append the value of an ``X-SAMPA`` template to ``base_data.sounds``.

    The cleaned text of positional parameter 1 is stored in the ``ipa``
    field of a new ``Sound`` tagged ``"X-SAMPA"``. Empty values are skipped.
    """
    x_sampa = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if x_sampa == "":
        return
    base_data.sounds.append(Sound(ipa=x_sampa, tags=["X-SAMPA"]))
def extract_enpr_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Append the value of an ``enPR`` template to ``base_data.sounds``.

    The cleaned text of positional parameter 1 is stored in the ``enpr``
    field of a new ``Sound``. Empty values are skipped.
    """
    enpr_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if enpr_text == "":
        return
    base_data.sounds.append(Sound(enpr=enpr_text))
def extract_audio_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Append the audio file of an ``audio`` template to ``base_data.sounds``.

    The file name is read from positional parameter 2; when it is empty the
    template is ignored entirely. The ``a`` parameter is split on commas and
    each non-empty, stripped piece becomes a raw tag of the new ``Sound``
    before ``translate_raw_tags`` is applied.
    """
    filename = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if filename == "":
        return

    sound = Sound()
    set_sound_file_url_fields(wxr, filename, sound)
    accent_text = clean_node(
        wxr, None, t_node.template_parameters.get("a", "")
    )
    sound.raw_tags.extend(
        tag for tag in map(str.strip, accent_text.split(",")) if tag != ""
    )
    translate_raw_tags(sound)
    base_data.sounds.append(sound)
    # Second pass with ``base_data`` as the target; presumably this records
    # categories emitted by the template — confirm against ``clean_node``.
    clean_node(wxr, base_data, t_node)
@dataclass
class TableHeader:
    """A labelled header cell collected while walking a ``th-pron`` table.

    Instances are mutable on purpose: ``extract_th_pron_template``
    decrements ``rowspan`` in place as it advances through the table rows.
    """

    # Cleaned header text; later copied into ``Sound.raw_tags``.
    text: str
    # Value of the cell's ``rowspan`` attribute (1 when absent or invalid).
    rowspan: int
def extract_th_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract sounds from the HTML table produced by ``th-pron``.

    The template is expanded and re-parsed, then every ``<tr>`` of every
    ``<table>`` is walked. ``<th>`` cells either select the field that the
    row's ``<td>`` cells fill (IPA, homophone, audio) or, for any other
    non-empty text, become row headers whose text is attached as raw tags
    to the sounds of the rows they span.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-pron
    template_text = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_text, expand_all=True)
    for table_tag in expanded_node.find_html("table"):
        row_headers = []
        for tr_tag in table_tag.find_html("tr"):
            field = "other"
            # Keep only the headers that still span this row and use up one
            # row of their remaining span.
            row_headers = [h for h in row_headers if h.rowspan > 1]
            for carried in row_headers:
                carried.rowspan -= 1

            for th_tag in tr_tag.find_html("th"):
                header_text = clean_node(wxr, None, th_tag)
                if header_text.startswith("(มาตรฐาน) สัทอักษรสากล"):
                    field = "ipa"
                elif header_text.startswith("คำพ้องเสียง"):
                    field = "homophone"
                elif header_text == "ไฟล์เสียง":
                    field = "audio"
                elif header_text != "":
                    span_attr = th_tag.attrs.get("rowspan", "1")
                    # Fall back to a single row when the attribute is not a
                    # plain run of digits.
                    span = (
                        int(span_attr)
                        if re.fullmatch(r"\d+", span_attr)
                        else 1
                    )
                    row_headers.append(TableHeader(header_text, span))

            for td_tag in tr_tag.find_html("td"):
                if field == "audio":
                    for link_node in td_tag.find_child(NodeKind.LINK):
                        filename = clean_node(wxr, None, link_node.largs[0])
                        if filename == "":
                            continue
                        audio = Sound()
                        set_sound_file_url_fields(wxr, filename, audio)
                        base_data.sounds.append(audio)
                    continue

                if field == "homophone":
                    for span_tag in td_tag.find_html_recursively(
                        "span", attr_name="lang", attr_value="th"
                    ):
                        word = clean_node(wxr, None, span_tag)
                        if word != "":
                            base_data.sounds.append(Sound(homophone=word))
                    continue

                cell_text = clean_node(wxr, None, td_tag)
                if cell_text == "":
                    continue
                sound = Sound()
                setattr(sound, field, cell_text)
                sound.raw_tags.extend(h.text for h in row_headers)
                translate_raw_tags(sound)
                base_data.sounds.append(sound)

    # Second pass with ``base_data`` as the target; presumably this records
    # categories emitted by the template — confirm against ``clean_node``.
    clean_node(wxr, base_data, expanded_node)
def extract_lo_pron_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract sounds and hyphenations from an expanded ``lo-pron`` template.

    The template is expanded and re-parsed, then the direct children of
    each list item are scanned in order. Text labels ending in ``:`` and
    the IPA link switch the current field; ``<span>`` elements supply
    qualifiers, IPA values and hyphenation parts; links seen while the
    field is ``rhymes`` are stored as rhymes.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:lo-pron
    template_text = wxr.wtp.node_to_wikitext(t_node)
    expanded_node = wxr.wtp.parse(template_text, expand_all=True)
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            field = "other"
            qualifier = ""
            for child in list_item.children:
                if isinstance(child, HTMLNode) and child.tag == "span":
                    css_class = child.attrs.get("class", "")
                    if "qualifier-content" in css_class:
                        # Remembered and attached to later IPA spans of
                        # this list item.
                        qualifier = clean_node(wxr, None, child)
                    elif css_class == "IPA":
                        ipa = clean_node(wxr, None, child)
                        if ipa == "":
                            continue
                        sound = Sound(ipa=ipa)
                        if qualifier != "":
                            sound.raw_tags.append(qualifier)
                            translate_raw_tags(sound)
                        base_data.sounds.append(sound)
                    elif (
                        child.attrs.get("lang", "") == "lo"
                        and field == "hyphenation"
                    ):
                        syllables = clean_node(wxr, None, child)
                        if syllables != "":
                            base_data.hyphenations.append(
                                Hyphenation(parts=syllables.split("-"))
                            )
                elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
                    link_text = clean_node(wxr, None, child)
                    if link_text == "สัทอักษรสากล":
                        field = "ipa"
                    elif link_text != "" and field == "rhymes":
                        base_data.sounds.append(Sound(rhymes=link_text))
                elif isinstance(child, str):
                    # Both recognised labels end with ":", so an exact match
                    # on the stripped text is sufficient.
                    label = child.strip()
                    if label == "การแบ่งพยางค์:":
                        field = "hyphenation"
                    elif label == "สัมผัส:":
                        field = "rhymes"

    # Second pass with ``base_data`` as the target; presumably this records
    # categories emitted by the template — confirm against ``clean_node``.
    clean_node(wxr, base_data, expanded_node)