Coverage for src/wiktextract/extractor/th/pos.py: 75%
204 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from .example import extract_example_list_item
14from .models import AltForm, Classifier, Form, Sense, WordEntry
15from .section_titles import POS_DATA
16from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Create the word entry of a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data``; it receives the
    section title, the POS and the optional extra tags stored in
    ``POS_DATA[pos_title]``.  ``base_data.pos`` is updated too.  Items of every
    list whose marker starts and ends with ``#`` are parsed as glosses, and
    the templates placed before the first such list are parsed as headword
    templates.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Index (in `level_node.children`) of the first gloss list that has at
    # least one item; stays at the children count if there is none.
    first_gloss_list = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for gloss_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, gloss_item)
            first_gloss_list = min(first_gloss_list, child_index)

    # Only nodes in front of the gloss list belong to the headword line.
    for head_node in level_node.children[:first_gloss_list]:
        if not isinstance(head_node, TemplateNode):
            continue
        if head_node.template_name == "th-noun":
            extract_th_noun_template(wxr, entry, head_node)
        elif head_node.template_name in ("th-verb", "th-adj"):
            extract_th_verb_adj_template(wxr, entry, head_node)
        else:
            extract_headword_line_template(wxr, entry, head_node)
# Template names whose senses are marked as "alt-of" by
# `extract_form_of_template()`.
# NOTE(review): the previous comment only said "redirect" - presumably these
# are redirect names of the alternative-form templates; confirm.
ALT_OF_TEMPLATES = frozenset({"altform", "alt form", "alt sp", "altsp"})
# Template names that are handled as form-of templates although they don't
# end with " of".
FORM_OF_TEMPLATES = frozenset({"อักษรย่อ", "คำย่อ"})
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract one gloss list item, its examples and its nested glosses.

    The new sense starts as a deep copy of ``parent_sense`` (or as an empty
    ``Sense`` for a top-level item).  Label, classifier, form-of and
    ``zh-mw`` templates are passed to their own extractors; all other nodes
    except child lists make up the gloss text.

    When the item contains a form-of template,
    ``extract_form_of_template()`` is responsible for adding senses to
    ``word_entry`` and the remaining gloss nodes are ignored.  Otherwise the
    sense is added here, but only if its gloss text is not empty.

    Child lists whose marker starts with ``#`` and ends with ``:`` or ``*``
    are parsed as examples of this sense; child lists ending with ``#`` are
    parsed recursively with this sense as their parent.
    """
    sense = (
        parent_sense.model_copy(deep=True)
        if parent_sense is not None
        else Sense()
    )
    gloss_nodes = []
    has_form_of_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            name = node.template_name
            if name in ("label", "lb", "lbl"):
                extract_label_template(wxr, sense, node)
            elif name == "cls":
                extract_cls_template(wxr, sense, node)
            elif (
                name.endswith(" of")
                or name.startswith("alternate ")
                or name in ALT_OF_TEMPLATES
                or name in FORM_OF_TEMPLATES
            ):
                extract_form_of_template(wxr, word_entry, sense, node)
                has_form_of_template = True
            elif name == "zh-mw":
                extract_zh_mw_template(wxr, node, sense)
            else:
                # any other template is part of the gloss text
                gloss_nodes.append(node)
        elif not (isinstance(node, WikiNode) and node.kind == NodeKind.LIST):
            gloss_nodes.append(node)

    if has_form_of_template:
        # `sense` may already be in `word_entry.senses` (appended by
        # `extract_form_of_template()`); raw tags collected from label
        # templates were previously never translated on this path.
        translate_raw_tags(sense)
    else:
        gloss_str = clean_node(wxr, sense, gloss_nodes)
        if gloss_str != "":
            sense.glosses.append(gloss_str)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        marker = child_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith((":", "*")):
            for e_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, e_list_item)
        elif marker.endswith("#"):
            for child_list_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(
                    wxr, word_entry, child_list_item, sense
                )
def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the labels of a label template to ``sense.raw_tags``.

    The template is expanded and the text of every ``span`` tag with class
    ``ib-content`` is split on commas and on " หรือ "; each non-empty
    piece becomes one raw tag.  The whole expanded template is finally
    cleaned with ``sense`` as the target object.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:label
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    content_spans = expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    )
    for content_span in content_spans:
        label_text = clean_node(wxr, None, content_span)
        pieces = (
            piece.strip() for piece in re.split(r",| หรือ ", label_text)
        )
        sense.raw_tags.extend(piece for piece in pieces if piece != "")
    clean_node(wxr, sense, expanded)
def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Add the classifiers listed in a ``cls`` template to ``sense``.

    Positional arguments are read starting at the second one and reading
    stops at the first missing position.  Arguments that are empty after
    cleaning are skipped.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:cls
    params = t_node.template_parameters
    position = 2
    while position in params:
        classifier_text = clean_node(wxr, None, params[position])
        if classifier_text != "":
            sense.classifiers.append(Classifier(classifier=classifier_text))
        position += 1
    clean_node(wxr, sense, t_node)
def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add the classifiers shown by a ``th-noun`` headword template.

    Every non-empty ``b`` tag of the expanded template is stored as one
    classifier of ``word_entry``.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    bold_texts = (
        clean_node(wxr, None, bold_tag)
        for bold_tag in expanded.find_html_recursively("b")
    )
    word_entry.classifiers.extend(
        Classifier(classifier=text) for text in bold_texts if text != ""
    )

    clean_node(wxr, word_entry, expanded)
def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Add the forms shown by a ``th-verb`` or ``th-adj`` headword template.

    Every non-empty ``b`` tag of the expanded template is stored as a form of
    ``word_entry``, tagged "abstract-noun" for ``th-verb`` and
    "noun-from-adj" for any other template name.
    """
    # https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    # https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    # The tag depends only on the template, so pick it once.
    form_tag = (
        "abstract-noun"
        if t_node.template_name == "th-verb"
        else "noun-from-adj"
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for b_tag in expanded_node.find_html_recursively("b"):
        form_str = clean_node(wxr, None, b_tag)
        if form_str != "":
            word_entry.forms.append(Form(form=form_str, tags=[form_tag]))

    clean_node(wxr, word_entry, expanded_node)
def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Store the list items of a note section in ``word_entry.notes``.

    Nested lists are left out of each item's text and empty notes are
    skipped.
    """
    note_items = (
        list_item
        for list_node in level_node.find_child(NodeKind.LIST)
        for list_item in list_node.find_child(NodeKind.LIST_ITEM)
    )
    for list_item in note_items:
        # everything in the item except its child lists
        own_nodes = list(list_item.invert_find_child(NodeKind.LIST))
        note_str = clean_node(wxr, word_entry, own_nodes)
        if note_str != "":
            word_entry.notes.append(note_str)
def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Create senses from a form-of or alternative-form template.

    If the expanded template has no list, its whole text becomes the gloss of
    ``first_sense``.  Otherwise the text in front of the first top-level list
    becomes the gloss of ``first_sense`` and each list item produces a new
    sense whose glosses are that leading text followed by the item text.
    Senses with an empty gloss are dropped.

    The first ``i`` tag gives the linked word and the first ``span`` tag
    whose class contains "mention-tr" gives its romanization.  If a word was
    found, it is added to ``alt_of`` or ``form_of`` of every sense together
    with the "alt-of" or "form-of" tag.  All senses, ``first_sense``
    included when it got a gloss, are appended to ``word_entry.senses``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    senses = []
    if not expanded.contain_node(NodeKind.LIST):
        gloss = clean_node(wxr, first_sense, expanded)
        if gloss != "":
            first_sense.glosses.append(gloss)
            senses.append(first_sense)
    else:
        seen_list = False
        lead_gloss = ""
        for position, child in enumerate(expanded.children):
            if not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
                continue
            if not seen_list:
                # text before the first list is shared by all list items
                seen_list = True
                lead_gloss = clean_node(
                    wxr, first_sense, expanded.children[:position]
                )
                if lead_gloss != "":
                    first_sense.glosses.append(lead_gloss)
                    senses.append(first_sense)
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                item_sense = Sense()
                if lead_gloss != "":
                    item_sense.glosses.append(lead_gloss)
                item_gloss = clean_node(wxr, item_sense, list_item.children)
                if item_gloss != "":
                    item_sense.glosses.append(item_gloss)
                    senses.append(item_sense)

    form = AltForm(word="")
    first_italic = next(iter(expanded.find_html_recursively("i")), None)
    if first_italic is not None:
        form.word = clean_node(wxr, None, first_italic)
    roman_span = next(
        (
            span_tag
            for span_tag in expanded.find_html_recursively("span")
            if "mention-tr" in span_tag.attrs.get("class", "")
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    template_name = t_node.template_name
    is_alt_of = (
        template_name.startswith(("alternative ", "alternate "))
        or template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        if is_alt_of:
            link_attr, link_tag = "alt_of", "alt-of"
        else:
            link_attr, link_tag = "form_of", "form-of"
        for sense in senses:
            getattr(sense, link_attr).append(form)
            if link_tag not in sense.tags:
                sense.tags.append(link_tag)
    word_entry.senses.extend(senses)
def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Store the list items of a usage note section in ``word_entry.notes``.

    The complete content of each list item is cleaned and empty results are
    skipped.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        cleaned_notes = (
            clean_node(wxr, None, list_item.children)
            for list_item in list_node.find_child(NodeKind.LIST_ITEM)
        )
        word_entry.notes.extend(
            note_str for note_str in cleaned_notes if note_str != ""
        )
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Add the classifiers of a ``zh-mw`` template to ``sense``.

    ``span`` tags with class "Hani", "Hant" or "Hans" hold classifier words
    ("Hant" and "Hans" also add a script tag).  Words joined by a "/" span
    are collected in the same pending group; a word that doesn't follow a
    "/" first moves the pending group to ``sense.classifiers``.  The
    ``title`` attribute of any other span is added as a raw tag to every
    classifier of the pending group.  Raw tags of all classifiers of the
    sense are translated at the end.
    """
    # Chinese inline classifier template
    # copied from zh edition code
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    pending = []
    previous_word = ""
    for span_tag in expanded.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ("Hani", "Hant", "Hans"):
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                if pending and previous_word != "/":
                    # previous group is complete
                    sense.classifiers.extend(pending)
                    pending.clear()
                new_classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    new_classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    new_classifier.tags.append("Simplified-Chinese")
                pending.append(new_classifier)
            previous_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if raw_tag != "":
                for pending_classifier in pending:
                    pending_classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(pending)
    for sense_classifier in sense.classifiers:
        translate_raw_tags(sense_classifier)
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms and tags from a generic headword-line template.

    Inside each ``span`` tag with class "headword-line" of the expanded
    template:

    - ``strong`` tags with class "headword" become "canonical" forms, unless
      the text is empty or equal to the page title;
    - ``span`` tags with class "headword-tr" become "transliteration" forms;
    - ``abbr`` tags inside ``span`` tags with class "gender" become raw tags
      of ``word_entry``;
    - each direct child ``b`` tag becomes a form that carries the text of the
      closest preceding direct child ``i`` tag as its raw tag.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for main_span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        for strong_tag in main_span_tag.find_html(
            "strong", attr_name="class", attr_value="headword"
        ):
            strong_str = clean_node(wxr, None, strong_tag)
            if strong_str not in ["", wxr.wtp.title]:
                word_entry.forms.append(
                    Form(form=strong_str, tags=["canonical"])
                )
        for roman_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="headword-tr"
        ):
            roman = clean_node(wxr, None, roman_span)
            if roman != "":
                word_entry.forms.append(
                    Form(form=roman, tags=["transliteration"])
                )
        for gender_span in main_span_tag.find_html(
            "span", attr_name="class", attr_value="gender"
        ):
            for abbr_tag in gender_span.find_html("abbr"):
                gender_str = clean_node(wxr, None, abbr_tag)
                # skip empty text like every other extractor in this module
                if gender_str != "":
                    word_entry.raw_tags.append(gender_str)
        # An `i` tag labels the `b` tags that follow it.
        form_raw_tag = ""
        for html_tag in main_span_tag.find_child(NodeKind.HTML):
            if html_tag.tag == "i":
                form_raw_tag = clean_node(wxr, None, html_tag)
            elif html_tag.tag == "b":
                form_str = clean_node(wxr, None, html_tag)
                if form_str != "":
                    form = Form(form=form_str)
                    if form_raw_tag != "":
                        form.raw_tags.append(form_raw_tag)
                        translate_raw_tags(form)
                    word_entry.forms.append(form)

    clean_node(wxr, word_entry, expanded_node)
    translate_raw_tags(word_entry)