Coverage for src/wiktextract/extractor/vi/pos.py: 67%
126 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from .example import extract_example_list_item
15from .models import AltForm, Form, Sense, WordEntry
16from .section_titles import POS_DATA
17from .tags import translate_raw_tags
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    pos_title: str,
):
    """Open a new word entry for a part-of-speech section and populate it.

    A deep copy of ``base_data`` is appended to ``page_data`` and receives the
    section title plus the ``pos`` value and optional ``tags`` stored under
    ``pos_title`` in ``POS_DATA``.  ``base_data.pos`` is overwritten with the
    same ``pos`` value.

    Every item of a child list whose marker starts and ends with ``#`` is
    passed to ``extract_gloss_list_item``.  Template nodes located before the
    first such list that yielded an item are passed to
    ``extract_headword_template``.

    Raises:
        KeyError: ``pos_title`` is not a key of ``POS_DATA`` (the copy has
            already been appended to ``page_data`` at that point).
    """
    entry = base_data.model_copy(deep=True)
    page_data.append(entry)
    entry.pos_title = pos_title
    pos_info = POS_DATA[pos_title]
    entry.pos = pos_info["pos"]
    base_data.pos = pos_info["pos"]
    entry.tags.extend(pos_info.get("tags", []))

    # Child index of the earliest gloss list; defaults to "past the end" so
    # that, without any gloss list, every child counts as headword material.
    first_gloss_list = len(level_node.children)
    for position, list_node in level_node.find_child(NodeKind.LIST, True):
        marker = list_node.sarg
        if not (marker.startswith("#") and marker.endswith("#")):
            continue
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_gloss_list_item(wxr, entry, list_item)
            # Only lists that actually produced an item move the boundary.
            first_gloss_list = min(first_gloss_list, position)

    for child in level_node.children[:first_gloss_list]:
        if isinstance(child, TemplateNode):
            extract_headword_template(wxr, entry, child)
# redirect
# Template names that extract_form_of_template() records under "alt-of".
ALT_OF_TEMPLATES = frozenset(
    {"altform", "alt form", "vi-alt sp", "vie-alt sp"}
)
# Template names that extract_gloss_list_item() sends to
# extract_form_of_template() even though they do not end in " of" / "-of".
FORM_OF_TEMPLATES = frozenset({"số nhiều của", "short for"})
def extract_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    parent_sense: Sense | None = None,
):
    """Build a sense from one gloss list item and recurse into its sub-lists.

    The sense starts as a deep copy of ``parent_sense`` (with its examples
    removed) or as an empty ``Sense``.  Label, ``term`` and ``@`` templates
    only contribute data to the sense; form-of style templates contribute data
    and also stay in the gloss text; nested lists are left out of the gloss
    text.  The sense is appended to ``word_entry.senses`` only when the
    cleaned gloss text is not empty.

    Nested ``#…#`` lists are processed recursively with this sense as parent,
    and nested ``#…:`` / ``#…*`` lists are passed to
    ``extract_example_list_item``.
    """
    if parent_sense is None:
        sense = Sense()
    else:
        sense = parent_sense.model_copy(deep=True)
    # Examples belong to the parent sense only, not to its sub-senses.
    sense.examples.clear()

    gloss_nodes = []
    for node in list_item.children:
        if not isinstance(node, TemplateNode):
            is_list = isinstance(node, WikiNode) and node.kind == NodeKind.LIST
            if not is_list:
                gloss_nodes.append(node)
            continue

        name = node.template_name
        if name in ["nhãn", "label", "def-lb", "context"]:
            extract_label_template(wxr, sense, node)
        elif name == "term":
            extract_term_template(wxr, sense, node)
        elif (
            name.endswith((" of", "-of"))
            or name in ALT_OF_TEMPLATES
            or name in FORM_OF_TEMPLATES
        ):
            extract_form_of_template(wxr, sense, node)
            # The template output is also part of the gloss text.
            gloss_nodes.append(node)
        elif name == "@":
            extract_at_template(wxr, sense, node)
        else:
            gloss_nodes.append(node)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if gloss_text != "":
        sense.glosses.append(gloss_text)
        translate_raw_tags(sense)
        word_entry.senses.append(sense)

    for child_list in list_item.find_child(NodeKind.LIST):
        marker = child_list.sarg
        if not marker.startswith("#"):
            continue
        if marker.endswith("#"):
            for child_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_gloss_list_item(wxr, word_entry, child_item, sense)
        elif marker.endswith((":", "*")):
            for child_item in child_list.find_child(NodeKind.LIST_ITEM):
                extract_example_list_item(wxr, word_entry, sense, child_item)
def extract_label_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect raw tags from an expanded label template.

    The template is expanded and every ``<span>`` whose class list contains
    ``label-content`` is cleaned to text; that text is split on commas and
    each non-empty, stripped piece is appended to ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:nhãn
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for span in expanded.find_html_recursively("span"):
        if "label-content" not in span.attrs.get("class", "").split():
            continue
        label_text = clean_node(wxr, None, span)
        pieces = (piece.strip() for piece in label_text.split(","))
        sense.raw_tags.extend(piece for piece in pieces if piece != "")
    # Result is discarded; the call is made with ``sense`` as the target,
    # presumably so clean_node can record category data on it -- confirm.
    clean_node(wxr, sense, expanded)
def extract_term_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Append the text of each direct italic child of the expanded template
    to ``sense.raw_tags``, skipping italics that clean to an empty string.
    """
    # https://vi.wiktionary.org/wiki/Bản_mẫu:term
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    italic_texts = (
        clean_node(wxr, None, italic)
        for italic in expanded.find_child(NodeKind.ITALIC)
    )
    sense.raw_tags.extend(text for text in italic_texts if text != "")
def extract_form_of_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Record the word a form-of / alt-of template points at.

    The target word is the cleaned text of the first ``<i>`` element in the
    expanded template; the romanization is taken from the first ``<span>``
    whose class list contains ``mention-tr``.  When a word was found, the
    form is appended to ``sense.alt_of`` with the ``alt-of`` tag if the
    template name contains "alternative" or is in ``ALT_OF_TEMPLATES``;
    otherwise it goes to ``sense.form_of`` with the ``form-of`` tag.
    Nothing is recorded when no word was found.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:Bản_mẫu_dạng_từ
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    form = AltForm(word="")

    first_italic = next(iter(expanded.find_html_recursively("i")), None)
    if first_italic is not None:
        form.word = clean_node(wxr, None, first_italic)

    roman_span = next(
        (
            span
            for span in expanded.find_html_recursively("span")
            if "mention-tr" in span.attrs.get("class", "").split()
        ),
        None,
    )
    if roman_span is not None:
        form.roman = clean_node(wxr, None, roman_span)

    if form.word == "":
        return

    name = t_node.template_name
    if "alternative" in name or name in ALT_OF_TEMPLATES:
        sense.alt_of.append(form)
        sense.tags.append("alt-of")
    else:
        sense.form_of.append(form)
        sense.tags.append("form-of")
def extract_at_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Collect raw tags from the ``<i>`` elements of an expanded ``@`` template.

    The text of each ``<i>`` element found by ``find_html`` is split on commas
    and semicolons; every non-empty, stripped piece is appended to
    ``sense.raw_tags``.
    """
    # https://vi.wiktionary.org/wiki/Thể_loại:@
    # obsolete template
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for i_tag in expanded.find_html("i"):
        italic_text = clean_node(wxr, None, i_tag)
        pieces = (piece.strip() for piece in re.split(r",|;", italic_text))
        sense.raw_tags.extend(piece for piece in pieces if piece != "")
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
    """Add the notes of a section to ``word_entry.notes``.

    When the section has at least one direct child list, each list item
    becomes one note.  Otherwise everything in the section that is not
    matched by ``LEVEL_KIND_FLAGS`` is cleaned into a single note.  Empty
    strings are never stored.
    """
    list_nodes = list(level_node.find_child(NodeKind.LIST))
    if list_nodes:
        for list_node in list_nodes:
            for item in list_node.find_child(NodeKind.LIST_ITEM):
                item_note = clean_node(wxr, None, item.children)
                if item_note != "":
                    word_entry.notes.append(item_note)
        return

    # No list at all: treat the non-heading content as one note.
    body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    section_note = clean_node(wxr, None, body_nodes)
    if section_note != "":
        word_entry.notes.append(section_note)
def extract_headword_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract inflected forms from an expanded headword-line template.

    The expanded tree is walked recursively over italic and HTML nodes.  The
    text of an italic node is remembered as a pending raw tag; the next
    ``<span>`` whose class list contains ``form-of`` becomes a ``Form`` that
    consumes that pending tag.  Forms with non-empty text are appended to
    ``word_entry.forms``.  Finally each direct link child of the expansion is
    passed through ``clean_node`` with ``word_entry`` as the target.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    pending_tag = ""
    wanted_kinds = NodeKind.ITALIC | NodeKind.HTML
    for node in expanded.find_child_recursively(wanted_kinds):
        if node.kind == NodeKind.ITALIC:
            pending_tag = clean_node(wxr, None, node)
            continue

        is_form_span = (
            isinstance(node, HTMLNode)
            and node.tag == "span"
            and "form-of" in node.attrs.get("class", "").split()
        )
        if not is_form_span:
            continue

        form = Form(form=clean_node(wxr, None, node))
        if pending_tag != "":
            form.raw_tags.append(pending_tag)
            translate_raw_tags(form)
            # A tag applies to a single form only.
            pending_tag = ""
        if form.form != "":
            word_entry.forms.append(form)

    # Results are discarded; presumably this lets clean_node record category
    # data from the links on the word entry -- confirm.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, word_entry, link)