Coverage for src / wiktextract / extractor / vi / pos.py: 58%
220 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..ruby import extract_ruby
15from .example import extract_example_list_item
16from .models import AltForm, Classifier, Form, Sense, WordEntry
17from .section_titles import POS_DATA
18from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Create a word entry for one part-of-speech section.

    Copies ``base_data`` into ``page_data``, resolves the POS code and tags
    from ``POS_DATA``, extracts the numbered ("#") gloss lists, and finally
    processes headword templates that appear before the first gloss list.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Remember where the first gloss list sits; headword templates are only
    # searched for in the nodes that precede it.
    first_gloss_index = len(level_node.children)
    for node_index, lst in level_node.find_child(NodeKind.LIST, True):
        is_gloss_list = lst.sarg.startswith("#") and lst.sarg.endswith("#")
        for item in lst.find_child(NodeKind.LIST_ITEM):
            if is_gloss_list:
                extract_gloss_list_item(wxr, entry, item)
                first_gloss_index = min(first_gloss_index, node_index)

    for node in level_node.children[:first_gloss_index]:
        if isinstance(node, TemplateNode):
            extract_headword_template(wxr, entry, node)
# Template names (including redirects) that mark a definition as an
# alternative form of another word vs. an inflected/shortened form of it.
ALT_OF_TEMPLATES = frozenset({"altform", "alt form", "vi-alt sp", "vie-alt sp"})
FORM_OF_TEMPLATES = frozenset({"số nhiều của", "short for"})
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Parse one gloss list item into a ``Sense`` and recurse into sub-lists.

    Child "#…#" lists become nested senses inheriting this sense's data;
    child "#…:" / "#…*" lists are example lists attached to this sense.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    # Examples are never inherited from the parent sense.
    sense.examples.clear()
    gloss_parts = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ("nhãn", "label", "def-lb", "context"):
                extract_label_template(wxr, sense, child)
            elif name == "term":
                extract_term_template(wxr, sense, child)
            elif (
                name.endswith((" of", "-of"))
                or name in ALT_OF_TEMPLATES
                or name in FORM_OF_TEMPLATES
            ):
                # form-of templates contribute structured data AND gloss text
                extract_form_of_template(wxr, sense, child)
                gloss_parts.append(child)
            elif name == "@":
                extract_at_template(wxr, sense, child)
            elif name in ("zho-mw", "zh-mw"):
                extract_zh_mw_template(wxr, child, sense)
            else:
                gloss_parts.append(child)
        elif not (isinstance(child, WikiNode) and child.kind == NodeKind.LIST):
            gloss_parts.append(child)
    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        sarg = sub_list.sarg
        if sarg.startswith("#") and sarg.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif sarg.startswith("#") and sarg.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, sub_item)
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect comma-separated label texts from a label template as raw tags."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span in expanded.find_html_recursively("span"):
        if "label-content" in span.attrs.get("class", "").split():
            label_text = clean_node(wxr, None, span)
            sense.raw_tags.extend(
                tag for tag in map(str.strip, label_text.split(",")) if tag != ""
            )
    # Also pick up any category links the expansion produced.
    clean_node(wxr, sense, expanded)
def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Harvest italic label text from the "term" template as raw tags."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for italic in expanded.find_child(NodeKind.ITALIC):
        label = clean_node(wxr, None, italic)
        if label != "":
            sense.raw_tags.append(label)
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Record the alt-of / form-of link produced by a form-of style template."""
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    target = AltForm(word="")
    for i_tag in expanded.find_html_recursively("i"):
        # The first italic element holds the linked word.
        target.word = clean_node(wxr, None, i_tag)
        break
    for span in expanded.find_html_recursively("span"):
        if "mention-tr" in span.attrs.get("class", "").split():
            target.roman = clean_node(wxr, None, span)
            break
    is_alt_of = (
        "alternative" in t_node.template_name
        or t_node.template_name in ALT_OF_TEMPLATES
    )
    if target.word != "":
        if is_alt_of:
            sense.alt_of.append(target)
            sense.tags.append("alt-of")
        else:
            sense.form_of.append(target)
            sense.tags.append("form-of")
def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Extract raw tags from the obsolete "@" label template."""
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for italic in expanded.find_html("i"):
        # Labels may be separated by either commas or semicolons.
        for piece in re.split(r",|;", clean_node(wxr, None, italic)):
            piece = piece.strip()
            if piece != "":
                sense.raw_tags.append(piece)
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Collect usage notes from list items, or from plain section text."""
    found_list = False
    for lst in level_node.find_child(NodeKind.LIST):
        found_list = True
        for item in lst.find_child(NodeKind.LIST_ITEM):
            text = clean_node(wxr, None, item.children)
            if text != "":
                word_entry.notes.append(text)
    if not found_list:
        # No bullet list: treat everything before the next subsection
        # as a single note.
        text = clean_node(
            wxr,
            None,
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            ),
        )
        if text != "":
            word_entry.notes.append(text)
def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract canonical forms, romanizations, genders and other forms
    from an expanded headword-line template.

    Falls back to scanning italic/"form-of" spans (e.g. Template:eng-noun)
    when the expansion contains no ``headword-line`` span.
    """
    forms = []
    found_headword_span = False
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for head_span in expanded.find_html(
        "span", attr_name="class", attr_value="headword-line"
    ):
        found_headword_span = True
        # Raw tags collected from <i> elements; they qualify the <b> forms
        # that follow them.
        italic_tags = []
        for tag_node in head_span.find_child(NodeKind.HTML):
            classes = tag_node.attrs.get("class", "").split()
            if tag_node.tag == "strong" and "headword" in classes:
                ruby, no_ruby = extract_ruby(wxr, tag_node)
                head_text = clean_node(wxr, None, no_ruby)
                # Only keep a canonical form that differs from the page
                # title, or one that carries ruby annotations.
                if head_text not in ("", wxr.wtp.title) or len(ruby) > 0:
                    forms.append(
                        Form(form=head_text, tags=["canonical"], ruby=ruby)
                    )
            elif tag_node.tag == "span":
                if "headword-tr" in classes or "tr" in classes:
                    roman = clean_node(wxr, None, tag_node)
                    if (
                        len(forms) > 0
                        and "canonical" not in forms[-1].tags
                        and "romanization" not in forms[-1].tags
                    ):
                        forms[-1].roman = roman
                    elif roman != "":
                        forms.append(Form(form=roman, tags=["romanization"]))
                elif "gender" in classes:
                    for abbr in tag_node.find_html("abbr"):
                        gender = abbr.attrs.get(
                            "title", clean_node(wxr, None, abbr)
                        )
                        if (
                            len(forms) > 0
                            and "canonical" not in forms[-1].tags
                            and "romanization" not in forms[-1].tags
                        ):
                            forms[-1].raw_tags.append(gender)
                        else:
                            word_entry.raw_tags.append(gender)
                elif "ib-content" in classes:
                    label = clean_node(wxr, None, tag_node)
                    if label != "":
                        word_entry.raw_tags.append(label)
            elif tag_node.tag == "sup" and word_entry.lang_code == "ja":
                forms.append(extract_historical_kana(wxr, tag_node))
            elif tag_node.tag == "i":
                if len(italic_tags) > 0:
                    # A new run of italics starts; flush the previous one
                    # to the word entry.
                    word_entry.raw_tags.extend(italic_tags)
                    italic_tags.clear()
                for part in tag_node.children:
                    label = (
                        clean_node(wxr, None, part)
                        .removeprefix("^†")
                        .strip()
                    )
                    if label != "":
                        italic_tags.append(label)
            elif tag_node.tag == "b":
                ruby, no_ruby = extract_ruby(wxr, tag_node)
                for word_str in filter(
                    None,
                    map(str.strip, clean_node(wxr, None, no_ruby).split(",")),
                ):
                    form = Form(form=word_str, ruby=ruby)
                    if italic_tags == ["hoặc"]:
                        # "hoặc" ("or"): this form shares the previous
                        # form's tags rather than getting its own.
                        if len(forms) > 0:
                            form.raw_tags.extend(forms[-1].raw_tags)
                    else:
                        form.raw_tags.extend(italic_tags)
                    forms.append(form)
                italic_tags.clear()

        if len(italic_tags) > 0:
            word_entry.raw_tags.extend(italic_tags)
    for form in forms:
        translate_raw_tags(form)
    word_entry.forms.extend(forms)
    clean_node(wxr, word_entry, expanded)
    translate_raw_tags(word_entry)

    if not found_headword_span:
        # Template:eng-noun
        pending_tag = ""
        for node in expanded.find_child_recursively(
            NodeKind.ITALIC | NodeKind.HTML
        ):
            if node.kind == NodeKind.ITALIC:
                pending_tag = clean_node(wxr, None, node)
            elif (
                isinstance(node, HTMLNode)
                and node.tag == "span"
                and "form-of" in node.attrs.get("class", "").split()
            ):
                form = Form(form=clean_node(wxr, None, node))
                if pending_tag != "":
                    form.raw_tags.append(pending_tag)
                    translate_raw_tags(form)
                    pending_tag = ""
                if form.form != "":
                    word_entry.forms.append(form)
def extract_historical_kana(
    wxr: WiktextractContext, sup_node: HTMLNode
) -> Form:
    """Build an "archaic" Form from a historical-kana <sup> element."""
    kana_form = Form(form="", tags=["archaic"])
    for strong in sup_node.find_html("strong"):
        kana_form.form = clean_node(wxr, None, strong)
    for tr_span in sup_node.find_html(
        "span", attr_name="class", attr_value="tr"
    ):
        kana_form.roman = clean_node(wxr, None, tr_span)
    return kana_form
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
):
    """Extract Chinese inline classifiers (measure words) for a sense."""
    # https://vi.wiktionary.org/wiki/Bản_mẫu:zho-mw
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Pending classifiers that may still receive raw tags from a following
    # title-bearing span; "/" joins traditional/simplified variants of the
    # same classifier into one group.
    pending = []
    last_word = ""
    for span in expanded.find_html_recursively("span"):
        span_class = span.attrs.get("class", "")
        if span_class in ("Hani", "Hant", "Hans"):
            word = clean_node(wxr, None, span)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")
                # A new word not joined by "/" closes the previous group.
                if len(pending) > 0 and last_word != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(classifier)
            last_word = word
        elif "title" in span.attrs:
            label = clean_node(wxr, None, span.attrs["title"])
            if len(label) > 0:
                for classifier in pending:
                    classifier.raw_tags.append(label)
    sense.classifiers.extend(pending)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)