Coverage for src/wiktextract/extractor/ms/page.py: 74% (104 statements)
coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
import string
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .linkage import extract_form_section, extract_linkage_section
from .models import Sense, WordEntry
from .pos import extract_pos_section
from .section_titles import FORM_SECTIONS, LINKAGE_SECTIONS, POS_DATA
from .sound import extract_sound_section
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
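    """Dispatch a subsection to the matching extractor based on its title."""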
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
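    # Drop trailing section numbering (digits or Roman numerals),
    # e.g. "Etimologi 1" or "Takrifan II".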
    title_text = title_text.rstrip(string.digits + string.whitespace + "IVX")
    lower_title = title_text.lower()
    if lower_title in POS_DATA:
        old_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        if len(page_data) == old_data_len and lower_title in LINKAGE_SECTIONS:
            extract_linkage_section(wxr, page_data, base_data, level_node)
    elif lower_title == "etimologi":
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif lower_title in FORM_SECTIONS:
        extract_form_section(
            wxr,
            page_data[-1] if len(page_data) > 0 else base_data,
            level_node,
            FORM_SECTIONS[lower_title],
        )
    elif lower_title == "tesaurus" or lower_title in LINKAGE_SECTIONS:
        extract_linkage_section(wxr, page_data, base_data, level_node)
    elif lower_title == "terjemahan":
        extract_translation_section(wxr, page_data, base_data, level_node)
    elif lower_title == "sebutan":
        extract_sound_section(wxr, page_data, base_data, level_node)
    elif lower_title in ["nota penggunaan", "penggunaan"]:
        extract_note_section(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, level_node
        )
    elif lower_title not in [
        "pautan luar",
        "rujukan",
        "bacaan lanjut",
        "lihat juga",
    ]:
        wxr.wtp.debug(f"Unknown section: {title_text}", sortid="ms/page/44")
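
    # Recurse into any child subsections.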
    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
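    # Bare links (typically category links) and topic templates only
    # contribute categories; clean_node attaches them to the most recent
    # entry, or to base_data if no entry exists yet.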
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(
            wxr, page_data[-1] if len(page_data) > 0 else base_data, link_node
        )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ["topik", "C", "topics"]:
            clean_node(
                wxr, page_data[-1] if len(page_data) > 0 else base_data, t_node
            )


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
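    """Parse one Malay Wiktionary page into a list of word entry dicts."""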
    # Page format
    # https://ms.wiktionary.org/wiki/Wikikamus:Memulakan_laman_baru#Format_laman
    if page_title.startswith(("Portal:", "Reconstruction:")):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
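
    # Each level-2 section covers one language.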
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        pre_data_len = len(page_data)
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_code = (
            name_to_code(lang_name.removeprefix("Bahasa "), "ms") or "unknown"
        )
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for next_level_node in level2_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, next_level_node)
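        # No POS subsection added an entry for this language, so keep
        # the base entry rather than dropping the language.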
        if len(page_data) == pre_data_len:
            page_data.append(base_data.model_copy(deep=True))

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]


def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
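    """Collect etymology texts and attach them to the matching entries."""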
    cats = {}
    e_nodes = []
    e_texts = []
    for node in level_node.children:
        if isinstance(node, LevelNode):
            break
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                e_text = clean_node(wxr, cats, list_item.children)
                if e_text != "":
                    e_texts.append(e_text)
        else:
            e_nodes.append(node)
    if len(e_nodes) > 0:
        e_text = clean_node(wxr, cats, e_nodes)
        if e_text != "":
            e_texts.append(e_text)
    if len(e_texts) == 0:
        return
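    # Decide where the texts belong: before any POS section they go on
    # base_data; a level-3 etymology applies to every entry of the current
    # language; otherwise only to the most recent entry.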
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        base_data.etymology_texts = e_texts
        base_data.categories.extend(cats.get("categories", []))
    elif level_node.kind == NodeKind.LEVEL3:
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                data.etymology_texts = e_texts
                data.categories.extend(cats.get("categories", []))
    else:
        page_data[-1].etymology_texts = e_texts
        page_data[-1].categories.extend(cats.get("categories", []))


def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
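    """Extract usage notes from list items, or from the whole section text."""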
    has_list = False
    for list_node in level_node.find_child(NodeKind.LIST):
        has_list = True
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            note = clean_node(wxr, None, list_item.children)
            if note != "":
                word_entry.notes.append(note)
    if not has_list:
        note = clean_node(wxr, None, level_node.children)
        if note != "":
            word_entry.notes.append(note)