Coverage for src/wiktextract/extractor/vi/page.py: 38%
165 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from .descendant import extract_descendant_section
17from .etymology import extract_etymology_section
18from .linkage import extract_alt_form_section, extract_linkage_section
19from .models import Form, Sense, WordEntry
20from .pos import extract_note_section, extract_pos_section
21from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
22from .sound import extract_homophone_section, extract_sound_section
23from .tags import translate_raw_tags
24from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section of a vi.wiktionary page to the matching extractor.

    The cleaned section title selects the extractor; trailing digits
    (e.g. "Từ nguyên 1") are stripped before matching.
    """
    title = clean_node(wxr, None, level_node.largs)
    title = title.rstrip(string.digits + string.whitespace)
    if title in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title)
        if len(page_data[-1].senses) == 0 and title in LINKAGE_SECTIONS:
            # A gloss-less POS section whose title doubles as a linkage
            # title is really a linkage section: drop the empty entry
            # and reparse the node as linkage data.
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_SECTIONS[title],
            )
    elif title in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif title == "Cách phát âm":  # pronunciation
        extract_sound_section(wxr, base_data, level_node)
    elif title == "Từ đồng âm":  # homophones
        extract_homophone_section(wxr, base_data, level_node)
    elif title == "Từ nguyên":  # etymology
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            # Child sections follow: deep-copy so this etymology only
            # applies to entries created under this section.
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title == "Cách viết khác":  # alternative spellings
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif title in ["Ghi chú sử dụng", "Chú ý"]:  # usage notes
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data if len(page_data) > 0 else [base_data],
            level_node,
            LINKAGE_SECTIONS[title],
        )
    elif title == "Hậu duệ":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title not in ["Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"]:
        wxr.wtp.debug(f"Unknown title: {title}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for sub_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, sub_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one vi.wiktionary page into a list of word-entry dicts.

    Page layout: https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ
    """
    # Ignore thesaurus, rhyme, corpus and reconstruction namespaces.
    if page_title.startswith(
        ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    ):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        cats: dict[str, Any] = {}
        lang_name = clean_node(wxr, cats, lang_level.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        for template in lang_level.find_content(NodeKind.TEMPLATE):
            if template.template_name == "langname":
                # Explicit code argument wins over name lookup.
                lang_code = clean_node(
                    wxr, None, template.template_parameters.get(1, "")
                )
        capture_codes = wxr.config.capture_language_codes
        if capture_codes is not None and lang_code not in capture_codes:
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = cats.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, lang_level)
        for template in lang_level.find_child(NodeKind.TEMPLATE):
            t_name = template.template_name
            if t_name in ["zho-forms", "zh-forms"]:
                extract_zh_forms_template(wxr, base_data, template)
            elif t_name in ["zh-see", "zho-see"]:
                base_data.redirects.append(
                    clean_node(
                        wxr, None, template.template_parameters.get(1, "")
                    )
                )
                clean_node(wxr, base_data, template)  # collect categories
            elif t_name in ["ja-see", "jpn-see", "ja-see-kango"]:
                # Every positional argument is a redirect target.
                for key, value in template.template_parameters.items():
                    if isinstance(key, int):
                        base_data.redirects.append(
                            clean_node(wxr, None, value)
                        )
                clean_node(wxr, base_data, template)  # collect categories
        if len(base_data.redirects) > 0:
            # Redirect-only pages still produce an entry.
            page_data.append(base_data)
        for sub_level in lang_level.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, sub_level)

    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect category links/templates that are direct children of a section
    and attach them to the entries of the current language."""
    found: dict[str, Any] = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if child.kind == NodeKind.LINK:
            clean_node(wxr, found, child)
        elif child.template_name in [
            "topics",
            "C",
            "topic",
            "catlangname",
            "cln",
        ]:
            clean_node(wxr, found, child)
    new_cats = found.get("categories", [])
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        # No entry for this language yet: keep them on the shared base data.
        base_data.categories.extend(new_cats)
    else:
        # Otherwise apply them to every entry of the current language.
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                entry.categories.extend(new_cats)
def extract_zh_forms_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese variant forms from an expanded {{zh-forms}} table."""
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            header_had_span = False
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, header_had_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                elif not header_had_span:
                    # Header cells containing spans already yield the
                    # forms themselves; skip the data cells in that row.
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, header_text, header_tags
                    )
    # Category links emitted by the expanded template.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse one header cell of a {{zh-forms}} table row.

    Returns the header text, the raw tags split from it, and whether the
    cell contained any <span> (in which case the forms come from here and
    the row's data cells are skipped by the caller).
    """
    has_span = False
    first_span_index = len(header_cell.children)
    for index, _span in header_cell.find_html("span", with_index=True):
        if index < first_span_index:
            first_span_index = index
        has_span = True
    # Header text is everything before the first span.
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    # "X và Y" ("X and Y") lists several tags at once.
    row_header_tags = [
        part.strip() for part in row_header.split(" và ") if part.strip() != ""
    ]
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        word_nodes = []
        sup_title = ""
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # <sup><span title="..."> carries an annotation used
                # as an extra raw tag.
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                word_nodes.append(child)
        if span_lang in ["zh-Hant", "zh-Hans"]:
            for word in clean_node(wxr, None, word_nodes).split("/"):
                if word not in [base_data.word, ""]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if sup_title != "":
                        form.raw_tags.append(sup_title)
                    translate_raw_tags(form)
                    base_data.forms.append(form)
    return row_header, row_header_tags, has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse one data cell of a {{zh-forms}} table row into word forms.

    `row_header` is currently unused but kept for call-site symmetry with
    the header-cell extractor.
    """
    for outer_span in cell.find_html("span"):
        cell_forms: list[Form] = []
        for inner_span in outer_span.find_html("span"):
            span_lang = inner_span.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans", "zh"]:
                word = clean_node(wxr, None, inner_span)
                if word not in ["", "/", base_data.word]:
                    form = Form(form=word, raw_tags=row_header_tags)
                    if span_lang == "zh-Hant":
                        form.tags.append("Traditional-Chinese")
                    elif span_lang == "zh-Hans":
                        form.tags.append("Simplified-Chinese")
                    translate_raw_tags(form)
                    cell_forms.append(form)
            elif "font-size:80%" in inner_span.attrs.get("style", ""):
                # Small-print span carries an extra raw tag that applies
                # to every form already collected from this cell.
                extra_tag = clean_node(wxr, None, inner_span)
                if extra_tag != "":
                    for form in cell_forms:
                        form.raw_tags.append(extra_tag)
                        translate_raw_tags(form)
        base_data.forms.extend(cell_forms)