Coverage for src/wiktextract/extractor/th/pos.py: 77%
240 statements
« prev ^ index » next — coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import itertools
2import re
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..ruby import extract_ruby
15from .example import extract_example_list_item
16from .models import AltForm, Classifier, Form, Sense, WordEntry
17from .section_titles import POS_DATA
18from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
) -> None:
    """Parse one part-of-speech section into a new WordEntry.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled
    with the POS name/tags, its gloss lists, and headword-line data.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Track where the first gloss list sits so that everything before it
    # can be treated as headword-line material below.
    first_gloss_list = len(level_node.children)
    for child_index, list_node in level_node.find_child(NodeKind.LIST, True):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            if list_node.sarg.startswith("#") and list_node.sarg.endswith("#"):
                extract_gloss_list_item(wxr, entry, list_item)
                if child_index < first_gloss_list:
                    first_gloss_list = child_index

    # Headword-line templates appear before the first gloss list.
    for node in level_node.children[:first_gloss_list]:
        if not isinstance(node, TemplateNode):
            continue
        if node.template_name == "th-noun":
            extract_th_noun_template(wxr, entry, node)
        elif node.template_name in ("th-verb", "th-adj"):
            extract_th_verb_adj_template(wxr, entry, node)
        else:
            extract_headword_line_template(wxr, entry, node)
# Template names (including redirects) that mark a sense as an
# "alternative form of" / "form of" definition.
ALT_OF_TEMPLATES = frozenset({"altform", "alt form", "alt sp", "altsp"})
FORM_OF_TEMPLATES = frozenset({"อักษรย่อ", "คำย่อ"})
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Parse a "#" list item into a Sense and recurse into child lists.

    ``parent_sense`` carries tags/glosses down from an enclosing gloss
    item; when present, the new sense starts as a deep copy of it.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    gloss_nodes = []
    found_form_of = False
    label_templates = ("label", "lb", "lbl", "qualifier", "q", "qf", "qual")
    for child in list_item.children:
        if isinstance(child, TemplateNode) and child.template_name in label_templates:
            extract_label_template(wxr, sense, child)
        elif isinstance(child, TemplateNode) and child.template_name == "cls":
            extract_cls_template(wxr, sense, child)
        elif isinstance(child, TemplateNode) and (
            child.template_name.endswith(" of")
            or child.template_name.startswith("alternate ")
            or child.template_name in ALT_OF_TEMPLATES
            or child.template_name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, word_entry, sense, child)
            found_form_of = True
        elif isinstance(child, TemplateNode) and child.template_name == "zh-mw":
            extract_zh_mw_template(wxr, child, sense)
        elif not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            # Everything except child lists contributes to the gloss text.
            gloss_nodes.append(child)

    # Form-of templates add their own senses, so skip the plain gloss then.
    if not found_form_of:
        gloss_text = clean_node(wxr, sense, gloss_nodes)
        if gloss_text != "":
            sense.glosses.append(gloss_text)
            translate_raw_tags(sense)
            word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        sarg = child_list.sarg
        if sarg.startswith("#") and sarg.endswith((":", "*")):
            # "#:" / "#*" sublists hold usage examples for this sense.
            for example_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, example_item)
        elif sarg.startswith("#") and sarg.endswith("#"):
            # "##" sublists hold nested glosses.
            for nested_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, nested_item, sense)
def extract_label_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Collect raw tags from a label/qualifier template.

    https://th.wiktionary.org/wiki/แม่แบบ:label
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded.find_html_recursively(
        "span", attr_name="class", attr_value="ib-content"
    ):
        # Labels are separated by commas or the Thai word "หรือ" ("or").
        for piece in re.split(r",| หรือ ", clean_node(wxr, None, span_tag)):
            piece = piece.strip()
            if piece != "":
                sense.raw_tags.append(piece)
    # Pick up any categories the expanded template adds.
    clean_node(wxr, sense, expanded)
def extract_cls_template(
    wxr: WiktextractContext,
    sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Collect classifiers from a "cls" template.

    https://th.wiktionary.org/wiki/แม่แบบ:cls
    """
    # Classifier words are passed as numbered arguments starting at 2.
    for arg_index in itertools.count(2):
        if arg_index not in t_node.template_parameters:
            break
        value = clean_node(wxr, None, t_node.template_parameters[arg_index])
        if value != "":
            sense.classifiers.append(Classifier(classifier=value))
    # Pick up any categories the template adds.
    clean_node(wxr, sense, t_node)
def extract_th_noun_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract classifiers from the "th-noun" headword template.

    https://th.wiktionary.org/wiki/แม่แบบ:th-noun
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The template renders its classifier words in bold.
    for bold_tag in expanded.find_html_recursively("b"):
        classifier_str = clean_node(wxr, None, bold_tag)
        if classifier_str != "":
            word_entry.classifiers.append(Classifier(classifier=classifier_str))

    clean_node(wxr, word_entry, expanded)
def extract_th_verb_adj_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
) -> None:
    """Extract derived-noun forms from "th-verb" / "th-adj" headword templates.

    The templates render the abstract noun (for verbs) or the noun derived
    from the adjective in bold; each bold string becomes a tagged Form.

    https://th.wiktionary.org/wiki/แม่แบบ:th-verb
    https://th.wiktionary.org/wiki/แม่แบบ:th-adj
    """
    # NOTE: the original comment pointed at แม่แบบ:th-noun, which is the
    # wrong template page for this function; fixed to th-verb above.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    tag = "abstract-noun" if t_node.template_name == "th-verb" else "noun-from-adj"
    for b_tag in expanded_node.find_html_recursively("b"):
        form_str = clean_node(wxr, None, b_tag)
        if form_str != "":
            word_entry.forms.append(Form(form=form_str, tags=[tag]))

    clean_node(wxr, word_entry, expanded_node)
def extract_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Add each top-level list item of a note section to ``word_entry.notes``.

    Child lists inside an item are excluded from the note text; categories
    found while cleaning are recorded on ``word_entry``.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            item_nodes = list(
                list_item.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            )
            note_text = clean_node(wxr, word_entry, item_nodes)
            if note_text != "":
                word_entry.notes.append(note_text)
def extract_form_of_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    first_sense: Sense,
    t_node: TemplateNode,
) -> None:
    """Parse a "form of" / "alternative form of" template into senses.

    The expanded template may contain a gloss list: the text before the
    first list becomes the gloss of ``first_sense`` and each list item
    becomes an additional sense. Without a list, the whole expansion is
    the gloss of ``first_sense``. The target word (italic) and its
    romanization (span.mention-tr) are attached as alt_of/form_of.
    """
    form = AltForm(word="")
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    senses = []
    if expanded.contain_node(NodeKind.LIST):
        first_list_idx = len(expanded.children)
        shared_gloss = ""
        for idx, child in enumerate(expanded.children):
            if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                if idx < first_list_idx:
                    # Text before the first list is a gloss shared by all
                    # list-item senses and by first_sense itself.
                    first_list_idx = idx
                    shared_gloss = clean_node(
                        wxr, first_sense, expanded.children[:idx]
                    )
                    if shared_gloss != "":
                        first_sense.glosses.append(shared_gloss)
                        senses.append(first_sense)
                for list_item in child.find_child(NodeKind.LIST_ITEM):
                    sense = Sense()
                    if shared_gloss != "":
                        sense.glosses.append(shared_gloss)
                    gloss = clean_node(wxr, sense, list_item.children)
                    if gloss != "":
                        sense.glosses.append(gloss)
                        senses.append(sense)
    else:
        gloss = clean_node(wxr, first_sense, expanded)
        if gloss != "":
            first_sense.glosses.append(gloss)
            senses.append(first_sense)

    # First italic node holds the linked (target) word.
    for i_tag in expanded.find_html_recursively("i"):
        form.word = clean_node(wxr, None, i_tag)
        break
    for span_tag in expanded.find_html_recursively("span"):
        if "mention-tr" in span_tag.attrs.get("class", ""):
            form.roman = clean_node(wxr, None, span_tag)
            break

    is_alt_of = (
        t_node.template_name.startswith(("alternative ", "alternate "))
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if form.word != "":
        for sense in senses:
            if is_alt_of:
                sense.alt_of.append(form)
            else:
                sense.form_of.append(form)
            if is_alt_of and "alt-of" not in sense.tags:
                sense.tags.append("alt-of")
            if not is_alt_of and "form-of" not in sense.tags:
                sense.tags.append("form-of")
    word_entry.senses.extend(senses)
def extract_usage_note_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Add each list item of a usage-note section to ``word_entry.notes``."""
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note_text = clean_node(wxr, None, list_item.children)
            if note_text != "":
                word_entry.notes.append(note_text)
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
) -> None:
    """Parse the Chinese inline classifier (measure word) template "zh-mw".

    Copied from the zh-edition extractor. Traditional/simplified variant
    pairs are rendered as "word/word"; the "/" separator keeps the pending
    classifiers grouped so that a shared title tag applies to the pair.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    pending = []  # classifiers awaiting a possible raw tag from a title attr
    last_word = ""
    for span_tag in expanded.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")
                # A new word not preceded by "/" starts a new group; flush
                # the previous group to the sense first.
                if len(pending) > 0 and last_word != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in pending:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(pending)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
def extract_headword_line_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Parse a generic headword-line template into forms and raw tags.

    Handles the expanded "span.headword-line" markup: the bold headword,
    romanizations, gender abbreviations, inline labels, historical kana
    (Japanese), and comma-separated bold variant forms whose italic
    prefixes act as raw tags.
    """
    forms = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for headword_span in expanded.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        # Raw tags collected from <i> nodes, applied to following <b> forms.
        pending_i_tags = []
        for child in headword_span.find_child(NodeKind.HTML):
            class_names = child.attrs.get("class", "").split()
            if child.tag == "strong" and "headword" in class_names:
                ruby, no_ruby = extract_ruby(wxr, child)
                headword = clean_node(wxr, None, no_ruby)
                # Only record when it differs from the page title or has ruby.
                if headword not in ["", wxr.wtp.title] or len(ruby) > 0:
                    forms.append(
                        Form(form=headword, tags=["canonical"], ruby=ruby)
                    )
            elif child.tag == "span":
                if "headword-tr" in class_names or "tr" in class_names:
                    roman = clean_node(wxr, None, child)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        # Romanization of the immediately preceding form.
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in class_names:
                    for abbr_tag in child.find_html("abbr"):
                        gender_tag = clean_node(wxr, None, abbr_tag)
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender_tag)
                        else:
                            word_entry.raw_tags.append(gender_tag)
                elif "ib-content" in class_names:
                    raw_tag = clean_node(wxr, None, child)
                    if raw_tag != "":
                        word_entry.raw_tags.append(raw_tag)
            elif child.tag == "sup" and word_entry.lang_code == "ja":
                forms.append(extract_historical_kana(wxr, child))
            elif child.tag == "i":
                # Unconsumed tags from a previous <i> belong to the entry.
                if len(pending_i_tags) > 0:
                    word_entry.raw_tags.extend(pending_i_tags)
                    pending_i_tags.clear()
                for i_child in child.children:
                    raw_tag = (
                        clean_node(wxr, None, i_child)
                        .removeprefix("^†")
                        .strip()
                    )
                    if raw_tag != "":
                        pending_i_tags.append(raw_tag)
            elif child.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, child)
                for form_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=form_str, ruby=ruby)
                    if pending_i_tags == ["หรือ"]:
                        # "หรือ" ("or"): this form shares the previous
                        # form's raw tags instead of getting a new one.
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(pending_i_tags)
                    forms.append(form)
                pending_i_tags.clear()

        if len(pending_i_tags) > 0:
            word_entry.raw_tags.extend(pending_i_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)
def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    """Build an "archaic" Form from a historical-kana <sup> node.

    The kana spelling is in a <strong> child and its romanization in a
    <span class="tr"> child.
    """
    kana_form = Form(form="", tags=["archaic"])
    for strong_node in sup_node.find_html("strong"):
        kana_form.form = clean_node(wxr, None, strong_node)
    for span_node in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        kana_form.roman = clean_node(wxr, None, span_node)
    return kana_form