Coverage for src/wiktextract/extractor/vi/page.py: 79%
73 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from typing import Any
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .etymology import extract_etymology_section
9from .linkage import extract_alt_form_section, extract_linkage_section
10from .models import Sense, WordEntry
11from .pos import extract_note_section, extract_pos_section
12from .section_titles import LINKAGE_SECTIONS, POS_DATA, TRANSLATION_SECTIONS
13from .sound import extract_sound_section
14from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section heading to its extractor, then descend.

    The heading text is cleaned with ``clean_node`` and compared, in order,
    against the section-title tables and a handful of literal titles. A
    title that matches nothing and is not in the ignore list is reported
    through ``wxr.wtp.debug``. Afterwards the section's categories are
    collected with ``extract_section_cats`` and every child heading is
    parsed recursively.

    Args:
        wxr: Extraction context passed through to every extractor.
        page_data: Entries collected so far for the page.
        base_data: Language-level entry used when ``page_data`` is empty.
        level_node: The heading node being processed.
    """
    subtitle = clean_node(wxr, None, level_node.largs)

    def run_linkage() -> None:
        # Linkage data goes to the collected entries, or to the base entry
        # when nothing has been collected yet.
        targets = page_data if page_data else [base_data]
        extract_linkage_section(
            wxr, targets, level_node, LINKAGE_SECTIONS[subtitle]
        )

    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        # A title listed both as POS and as linkage that produced no senses
        # is re-read as a linkage section: the last entry is dropped first.
        # NOTE(review): relies on page_data being non-empty here - confirm
        # against extract_pos_section.
        if not page_data[-1].senses and subtitle in LINKAGE_SECTIONS:
            page_data.pop()
            run_linkage()
    elif subtitle in TRANSLATION_SECTIONS:
        target = page_data[-1] if page_data else base_data
        extract_translation_section(wxr, target, level_node)
    elif subtitle == "Cách phát âm":
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "Từ nguyên":
        extract_etymology_section(wxr, base_data, level_node)
    elif subtitle == "Cách viết khác":
        extract_alt_form_section(wxr, base_data, page_data, level_node)
    elif subtitle == "Ghi chú sử dụng":
        target = page_data[-1] if page_data else base_data
        extract_note_section(wxr, target, level_node)
    elif subtitle in LINKAGE_SECTIONS:
        run_linkage()
    elif subtitle not in ("Tham khảo", "Cách ra dấu", "Đọc thêm", "Xem thêm"):
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="vi/page/22")

    extract_section_cats(wxr, base_data, page_data, level_node)
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Vietnamese Wiktionary page into a list of entry dicts.

    Each level-2 heading is treated as a language section. A base
    ``WordEntry`` is built for it and its child headings are handed to
    ``parse_section``. Entries that end up without senses receive a
    placeholder sense tagged ``no-gloss``.

    Args:
        wxr: Extraction context providing the parser and configuration.
        page_title: Title of the page.
        page_text: Wikitext of the page.

    Returns:
        The collected entries dumped with ``model_dump(exclude_defaults=True)``;
        an empty list for pages whose title has an ignored prefix.
    """
    # Page layout reference:
    # https://vi.wiktionary.org/wiki/Wiktionary:Sơ_đồ_mục_từ

    # ignore thesaurus, rhyme, quote, reconstruct pages
    ignored_prefixes = ("Kho từ vựng:", "Vần:", "Kho ngữ liệu:", "Từ tái tạo:")
    if page_title.startswith(ignored_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        categories = {}
        lang_name = clean_node(wxr, categories, level2_node.largs) or "unknown"
        lang_code = name_to_code(lang_name, "vi") or "unknown"
        # A "langname" template in the heading overrides the looked-up code
        # with its first argument.
        for t_node in level2_node.find_content(NodeKind.TEMPLATE):
            if t_node.template_name == "langname":
                code_arg = t_node.template_parameters.get(1, "")
                lang_code = clean_node(wxr, None, code_arg)

        # Skip languages excluded by the configured capture list.
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        base_data.categories = categories.get("categories", [])
        extract_section_cats(wxr, base_data, page_data, level2_node)
        for child_level in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, child_level)

    # Every emitted entry must carry at least one sense.
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_section_cats(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect categories from a section's direct children and attach them.

    Link nodes and the category templates listed below are passed to
    ``clean_node`` together with a scratch dict; whatever ends up under its
    ``"categories"`` key is then appended either to ``base_data`` or to the
    already collected entries of the same language.

    Args:
        wxr: Extraction context passed to ``clean_node``.
        base_data: Language-level entry for the section being processed.
        page_data: Entries collected so far for the page.
        level_node: Heading node whose direct children are scanned.
    """
    category_templates = ("topics", "C", "topic", "catlangname", "cln")
    cats = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        # Links are always cleaned; templates only when they are one of the
        # category templates (the name is not read for link nodes).
        if (
            node.kind == NodeKind.LINK
            or node.template_name in category_templates
        ):
            clean_node(wxr, cats, node)

    found = cats.get("categories", [])

    # No entry yet for this language: keep the categories on the base entry.
    if not page_data or page_data[-1].lang_code != base_data.lang_code:
        base_data.categories.extend(found)
        return

    # Otherwise add them to every entry sharing the last entry's language.
    current_code = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_code:
            entry.categories.extend(found)