Coverage for src / wiktextract / extractor / vi / page.py: 40%
167 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 HTMLNode,
8 LevelNode,
9 NodeKind,
10 TemplateNode,
11 WikiNode,
12)
14from ...page import clean_node
15from ...wxr_context import WiktextractContext
16from .descendant import extract_descendant_section
17from .etymology import extract_etymology_section, extract_ja_kanjitab_template
18from .linkage import extract_alt_form_section, extract_linkage_section
19from .models import Form, Sense, WordEntry
20from .pos import extract_note_section, extract_pos_section
21from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
22from .sound import extract_homophone_section, extract_sound_section
23from .tags import translate_raw_tags
24from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a language entry to the matching extractor.

    The section title (trailing digits/whitespace stripped) selects the
    handler; unrecognized titles outside the known ignore list are logged.
    Child level nodes are processed recursively at the end.
    """
    title = clean_node(wxr, None, level_node.largs)
    title = title.rstrip(string.digits + string.whitespace)
    if title in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title)
        # A section whose title doubles as a linkage title and that yielded
        # no senses is really a linkage list: drop the empty POS entry and
        # re-parse the node as linkage data.
        if len(page_data[-1].senses) == 0 and title in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                LINKAGE_SECTIONS[title],
            )
    elif title in TRANSLATION_SECTIONS:
        extract_translation_section(
            wxr, page_data[-1] if len(page_data) else base_data, level_node
        )
    elif title == "Cách phát âm":  # pronunciation → sound extractor
        extract_sound_section(wxr, base_data, level_node)
    elif title == "Từ đồng âm":  # homophones
        extract_homophone_section(wxr, base_data, level_node)
    elif title == "Từ nguyên":  # etymology
        # When the etymology section has its own subsections, work on a
        # deep copy so later subsections recurse with the copied entry.
        if level_node.contain_node(LEVEL_KIND_FLAGS):
            base_data = base_data.model_copy(deep=True)
        extract_etymology_section(wxr, base_data, level_node)
    elif title == "Cách viết khác":  # alternative forms
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif title in ("Ghi chú sử dụng", "Chú ý"):  # usage notes
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            page_data if len(page_data) > 0 else [base_data],
            level_node,
            LINKAGE_SECTIONS[title],
        )
    elif title == "Hậu duệ":  # descendants
        extract_descendant_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif title not in ("Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"):
        wxr.wtp.debug(f"Unknown title: {title}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Vietnamese Wiktionary page into word-entry dicts.

    Page layout: https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ
    """
    # Skip thesaurus, rhyme, corpus and reconstruction namespace pages.
    if page_title.startswith(
        ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    ):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        cats = {}
        lang_name = clean_node(wxr, cats, lang_level.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        # A "langname" template in the heading overrides the mapped code.
        for template in lang_level.find_content(NodeKind.TEMPLATE):
            if template.template_name == "langname":
                lang_code = clean_node(
                    wxr, None, template.template_parameters.get(1, "")
                )
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = cats.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, lang_level)
        # Handle CJK variant/redirect templates placed directly under the
        # language heading, before any POS sections.
        for template in lang_level.find_child(NodeKind.TEMPLATE):
            t_name = template.template_name
            if t_name in ("zho-forms", "zh-forms"):
                extract_zh_forms_template(wxr, base_data, template)
            elif t_name in ("zh-see", "zho-see"):
                base_data.redirects.append(
                    clean_node(
                        wxr, None, template.template_parameters.get(1, "")
                    )
                )
                clean_node(wxr, base_data, template)
            elif t_name in ("ja-see", "jpn-see", "ja-see-kango"):
                for key, value in template.template_parameters.items():
                    if isinstance(key, int):
                        base_data.redirects.append(
                            clean_node(wxr, None, value)
                        )
                clean_node(wxr, base_data, template)
            elif t_name.endswith("-kanjitab") or t_name == "ja-kt":
                extract_ja_kanjitab_template(wxr, template, base_data)
        # Pure redirect entries have no sections but still produce output.
        if len(base_data.redirects) > 0:
            page_data.append(base_data)
        for section_level in lang_level.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_level)

    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect category links/templates in a section and attach them.

    Categories go to ``base_data`` when no entry for the current language
    exists yet, otherwise to every entry sharing the last entry's language.
    """
    found = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if child.kind == NodeKind.LINK:
            clean_node(wxr, found, child)
        elif child.template_name in (
            "topics",
            "C",
            "topic",
            "catlangname",
            "cln",
        ):
            clean_node(wxr, found, child)

    new_cats = found.get("categories", [])
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        base_data.categories.extend(new_cats)
    else:
        current_code = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == current_code:
                entry.categories.extend(new_cats)
def extract_zh_forms_template(
    wxr: WiktextractContext, base_data: WordEntry, t_node: TemplateNode
):
    """Extract Chinese form variants from an expanded {{zh-forms}} table."""
    base_data.literal_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded.find_child(NodeKind.TABLE):
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            header_text = ""
            header_tags: list[str] = []
            # When the header cell itself carries <span> form markup, the
            # data cells of that row are skipped.
            skip_data_cells = False
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    header_text, header_tags, skip_data_cells = (
                        extract_zh_forms_header_cell(
                            wxr, base_data, cell_node
                        )
                    )
                elif not skip_data_cells:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell_node, header_text, header_tags
                    )
    # Top-level links in the expansion contribute categories.
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordEntry, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Parse one header cell of a zh-forms table.

    Returns a tuple ``(header_text, header_tags, has_span)``: the header
    text before the first <span>, the raw tags split from it on " và ",
    and whether the cell contains <span> markup (in which case the forms
    are taken from this cell and the row's data cells are skipped).
    """
    has_span = False
    cutoff = len(header_cell.children)
    for span_index, _ in header_cell.find_html("span", with_index=True):
        has_span = True
        if span_index < cutoff:
            cutoff = span_index
    header_text = clean_node(wxr, None, header_cell.children[:cutoff])
    header_tags = [
        part.strip()
        for part in header_text.split(" và ")
        if part.strip() != ""
    ]
    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        sup_title = ""
        word_nodes = []
        for child in span_tag.children:
            # A <sup><span title="..."> annotation becomes an extra tag.
            if isinstance(child, HTMLNode) and child.tag == "sup":
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                word_nodes.append(child)
        if span_lang not in ("zh-Hant", "zh-Hans"):
            continue
        for word in clean_node(wxr, None, word_nodes).split("/"):
            if word == "" or word == base_data.word:
                continue
            form = Form(form=word, raw_tags=header_tags)
            if sup_title != "":
                form.raw_tags.append(sup_title)
            translate_raw_tags(form)
            base_data.forms.append(form)
    return header_text, header_tags, has_span
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordEntry,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
):
    """Parse one data cell of a zh-forms table into ``Form`` objects.

    Inner spans with a Chinese ``lang`` attribute yield forms tagged by
    script; small-font spans add a raw tag to the forms collected so far
    in the same outer span.
    """
    for outer_span in cell.find_html("span"):
        collected: list[Form] = []
        for inner_span in outer_span.find_html("span"):
            lang_attr = inner_span.attrs.get("lang", "")
            if lang_attr in ("zh-Hant", "zh-Hans", "zh"):
                text = clean_node(wxr, None, inner_span)
                # Skip separators, empties and the headword itself.
                if text not in ("", "/", base_data.word):
                    new_form = Form(form=text, raw_tags=row_header_tags)
                    if lang_attr == "zh-Hant":
                        new_form.tags.append("Traditional-Chinese")
                    elif lang_attr == "zh-Hans":
                        new_form.tags.append("Simplified-Chinese")
                    translate_raw_tags(new_form)
                    collected.append(new_form)
            elif "font-size:80%" in inner_span.attrs.get("style", ""):
                note = clean_node(wxr, None, inner_span)
                if note != "":
                    for collected_form in collected:
                        collected_form.raw_tags.append(note)
                        translate_raw_tags(collected_form)
        base_data.forms.extend(collected)