Coverage for src/wiktextract/extractor/vi/pos.py: 54%
157 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .example import extract_example_list_item
15from .models import AltForm, Classifier, Form, Sense, WordEntry
16from .section_titles import POS_DATA
17from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Create a word entry for one part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and given the
    section title plus the ``pos`` (and optional ``tags``) registered for
    that title in ``POS_DATA``.  ``base_data.pos`` is updated as well.
    Every ``#``-only list in the section is read as a gloss list, and each
    template located before the first non-empty gloss list is handed to
    ``extract_headword_template``.

    Raises ``KeyError`` when ``pos_title`` is not a key of ``POS_DATA``.
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Child index of the earliest gloss list that has at least one item;
    # stays at len(children) when the section has no such list.
    first_gloss_index = len(level_node.children)
    for position, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, item)
            first_gloss_index = min(first_gloss_index, position)

    # Only templates that precede the glosses are treated as headword lines.
    for head_node in level_node.children[:first_gloss_index]:
        if isinstance(head_node, TemplateNode):
            extract_headword_template(wxr, entry, head_node)
# redirect
# Template names that extract_gloss_list_item routes to
# extract_form_of_template in addition to names ending in " of" / "-of".
# Names in ALT_OF_TEMPLATES are recorded as "alt-of"; the rest as "form-of".
# NOTE(review): "redirect" presumably means these names are redirect aliases
# of the canonical templates - confirm on vi.wiktionary.org.
ALT_OF_TEMPLATES = frozenset(
    {"altform", "alt form", "vi-alt sp", "vie-alt sp"}
)
FORM_OF_TEMPLATES = frozenset({"số nhiều của", "short for"})
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Turn one gloss list item (and its nested lists) into senses.

    The new sense starts as a deep copy of ``parent_sense`` (minus its
    examples) when one is given, otherwise as an empty ``Sense``.  Known
    templates in the item are dispatched to their extractors; everything
    else except nested lists becomes gloss text.  The sense is appended to
    ``word_entry.senses`` only when that gloss text is non-empty.

    Nested ``#...#`` lists are processed recursively as sub-senses, and
    nested ``#...:`` / ``#...*`` lists are passed to
    ``extract_example_list_item`` as examples of this sense.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    # Examples belong to the parent only; do not inherit them.
    sense.examples.clear()

    gloss_parts = []
    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            is_nested_list = (
                isinstance(child, WikiNode) and child.kind == NodeKind.LIST
            )
            # Nested lists are handled after the gloss text is built.
            if not is_nested_list:
                gloss_parts.append(child)
            continue

        name = child.template_name
        if name in ("nhãn", "label", "def-lb", "context"):
            extract_label_template(wxr, sense, child)
        elif name == "term":
            extract_term_template(wxr, sense, child)
        elif (
            name.endswith((" of", "-of"))
            or name in ALT_OF_TEMPLATES
            or name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, sense, child)
            # Form-of templates also contribute to the gloss text.
            gloss_parts.append(child)
        elif name == "@":
            extract_at_template(wxr, sense, child)
        elif name in ("zho-mw", "zh-mw"):
            extract_zh_mw_template(wxr, child, sense)
        else:
            gloss_parts.append(child)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    if gloss_text:
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for sub_list in list_item.find_child(NodeKind.LIST):
        marker = sub_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, sub_item, sense)
        elif marker.endswith((":", "*")):
            for sub_item in sub_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, sub_item)
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the comma-separated labels of a label template to ``sense.raw_tags``.

    The template is expanded, and the text of every ``<span>`` carrying the
    ``label-content`` class is split on commas; each non-empty, stripped
    piece becomes one raw tag.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded.find_html_recursively("span"):
        if "label-content" not in span.attrs.get("class", "").split():
            continue
        for piece in clean_node(wxr, None, span).split(","):
            if (label := piece.strip()) != "":
                sense.raw_tags.append(label)
    # Result is discarded; the call is made for whatever clean_node records
    # on `sense` (presumably categories - confirm against clean_node).
    clean_node(wxr, sense, expanded)
def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add each top-level italic run of an expanded ``term`` template as a raw tag.

    Empty italic text is skipped.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for italic in expanded.find_child(NodeKind.ITALIC):
        if (tag_text := clean_node(wxr, None, italic)) != "":
            sense.raw_tags.append(tag_text)
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Record the target word of a form-of / alt-of template on ``sense``.

    The word is the text of the first ``<i>`` element in the expanded
    template, and the romanization is the text of the first ``<span>`` with
    the ``mention-tr`` class.  Nothing is recorded when the word is empty.
    Templates whose name contains ``"alternative"`` or is listed in
    ``ALT_OF_TEMPLATES`` go to ``sense.alt_of`` with the ``alt-of`` tag;
    all others go to ``sense.form_of`` with the ``form-of`` tag.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    form = AltForm(word="")

    first_italic = next(iter(expanded.find_html_recursively("i")), None)
    if first_italic is not None:
        form.word = clean_node(wxr, None, first_italic)

    roman_span = next(
        (
            span
            for span in expanded.find_html_recursively("span")
            if "mention-tr" in span.attrs.get("class", "").split()
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    if form.word == "":
        return

    name = t_node.template_name
    if "alternative" in name or name in ALT_OF_TEMPLATES:
        sense.alt_of.append(form)
        sense.tags.append("alt-of")
    else:
        sense.form_of.append(form)
        sense.tags.append("form-of")
def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the labels of an ``@`` template to ``sense.raw_tags``.

    The text of each top-level ``<i>`` element of the expanded template is
    split on commas and semicolons; every non-empty, stripped piece becomes
    one raw tag.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for italic in expanded.find_html("i"):
        pieces = re.split(r",|;", clean_node(wxr, None, italic))
        for piece in pieces:
            if (label := piece.strip()) != "":
                sense.raw_tags.append(label)
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Append the notes of a note section to ``word_entry.notes``.

    When the section has at least one list, every list item yields one
    note.  Otherwise the whole section content, excluding nodes matched by
    ``LEVEL_KIND_FLAGS``, is cleaned into a single note.  Empty notes are
    dropped.
    """
    list_nodes = list(level_node.find_child(NodeKind.LIST))
    if list_nodes:
        note_sources = [
            item.children
            for list_node in list_nodes
            for item in list_node.find_child(NodeKind.LIST_ITEM)
        ]
    else:
        # No list at all: treat the section body as one free-text note.
        note_sources = [
            list(
                level_node.invert_find_child(
                    LEVEL_KIND_FLAGS, include_empty_str=True
                )
            )
        ]

    for source in note_sources:
        note = clean_node(wxr, None, source)
        if note != "":
            word_entry.notes.append(note)
def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Collect inflected forms from an expanded headword template.

    Italic nodes and HTML nodes are visited in document order.  The text of
    an italic node is remembered as the label for the next
    ``<span class="form-of">``; that span's text becomes a ``Form`` added to
    ``word_entry.forms`` (if non-empty), carrying the remembered label as a
    raw tag.  The label is consumed by the first form that uses it.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    pending_tag = ""
    for node in expanded.find_child_recursively(
        NodeKind.ITALIC | NodeKind.HTML
    ):
        if node.kind == NodeKind.ITALIC:
            pending_tag = clean_node(wxr, None, node)
            continue
        if not isinstance(node, HTMLNode) or node.tag != "span":
            continue
        if "form-of" not in node.attrs.get("class", "").split():
            continue

        form = Form(form=clean_node(wxr, None, node))
        if pending_tag != "":
            form.raw_tags.append(pending_tag)
            translate_raw_tags(form)
            pending_tag = ""
        if form.form != "":
            word_entry.forms.append(form)

    # Results are discarded; the calls are made for whatever clean_node
    # records on `word_entry` (presumably categories - confirm).
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)
def extract_zh_mw_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: Sense
):
    """Extract Chinese classifiers (measure words) into ``sense.classifiers``.

    Spans whose ``class`` attribute is exactly ``Hani``, ``Hant`` or
    ``Hans`` hold classifier text; ``Hant`` / ``Hans`` add the
    ``Traditional-Chinese`` / ``Simplified-Chinese`` tag.  Classifiers
    separated by a ``/`` span stay in one pending group.  A later span with
    a ``title`` attribute adds that title as a raw tag to every classifier
    still in the pending group.  After all spans are read, raw tags of every
    classifier on the sense are translated.
    """
    # Chinese inline classifier template
    # NOTE(review): the original comment linked to
    # https://zh.wiktionary.org/wiki/Bản_mẫu:zho-mw ; the "Bản_mẫu" namespace
    # suggests the intended page is on vi.wiktionary.org - confirm.
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    pending = []
    previous_word = ""
    for span in expanded.find_html_recursively("span"):
        css_class = span.attrs.get("class", "")
        if css_class in ("Hani", "Hant", "Hans"):
            word = clean_node(wxr, None, span)
            if word != "/":
                classifier = Classifier(classifier=word)
                if css_class in script_tags:
                    classifier.tags.append(script_tags[css_class])

                # A new classifier not preceded by "/" starts a new group,
                # so flush the previous group to the sense first.
                if pending and previous_word != "/":
                    sense.classifiers.extend(pending)
                    pending.clear()
                pending.append(classifier)
            previous_word = word
        elif "title" in span.attrs:
            title_text = clean_node(wxr, None, span.attrs["title"])
            if title_text:
                for classifier in pending:
                    classifier.raw_tags.append(title_text)

    sense.classifiers.extend(pending)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    # Results are discarded; the calls are made for whatever clean_node
    # records on `sense` (presumably categories - confirm).
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)