Coverage for src/wiktextract/extractor/ms/page.py: 65%
90 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .linkage import extract_form_section, extract_linkage_section
10from .models import Sense, WordEntry
11from .pos import extract_pos_section
12from .section_titles import FORM_SECTIONS, LINKAGE_SECTIONS, POS_DATA
13from .sound import extract_sound_section
14from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one wikitext section to the matching extractor by title.

    Recurses into child level nodes, then harvests category links and
    topic templates that sit directly under this section.
    """
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    # Drop trailing numbering such as "Etimologi 2" or Roman numerals.
    title_text = title_text.rstrip(string.digits + string.whitespace + "IVX")
    lower_title = title_text.lower()
    if lower_title in POS_DATA:
        old_data_len = len(page_data)
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
        # Some titles double as linkage sections; only treat them as such
        # when no POS entry came out of the section.
        if len(page_data) == old_data_len and lower_title in LINKAGE_SECTIONS:
            extract_linkage_section(wxr, page_data, base_data, level_node)
    elif lower_title == "etimologi":
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif lower_title in FORM_SECTIONS:
        form_target = page_data[-1] if len(page_data) > 0 else base_data
        extract_form_section(
            wxr, form_target, level_node, FORM_SECTIONS[lower_title]
        )
    elif lower_title == "tesaurus" or lower_title in LINKAGE_SECTIONS:
        extract_linkage_section(wxr, page_data, base_data, level_node)
    elif lower_title == "terjemahan":
        extract_translation_section(wxr, page_data, base_data, level_node)
    elif lower_title == "sebutan":
        extract_sound_section(wxr, page_data, base_data, level_node)
    elif lower_title in ("nota penggunaan", "penggunaan"):
        note_target = page_data[-1] if len(page_data) > 0 else base_data
        extract_note_section(wxr, note_target, level_node)
    elif lower_title not in (
        "pautan luar",
        "rujukan",
        "bacaan lanjut",
        "lihat juga",
    ):
        wxr.wtp.debug(f"Unknown section: {title_text}", sortid="ms/page/44")

    for next_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level)
    # After recursion, attach categories from bare links / topic templates
    # to the most recent entry (or base_data if none exists yet).
    cat_target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, cat_target, link_node)
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if t_node.template_name in ("topik", "C", "topics"):
            clean_node(wxr, cat_target, t_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a whole ms.wiktionary page into word-entry dicts.

    Page format:
    https://ms.wiktionary.org/wiki/Wikikamus:Memulakan_laman_baru#Format_laman
    """
    if page_title.startswith(("Portal:", "Reconstruction:")):
        return []
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    # Each level-2 heading starts a new language section.
    for lang_section in tree.find_child(NodeKind.LEVEL2):
        entries_before = len(page_data)
        lang_name = clean_node(wxr, None, lang_section.largs)
        code = name_to_code(lang_name.removeprefix("Bahasa "), "ms")
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=code if code != "" else "unknown",
            lang=lang_name,
            pos="unknown",
        )
        for child_level in lang_section.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, child_level)
        if len(page_data) == entries_before:
            # No POS section yielded an entry; keep the bare language entry.
            page_data.append(base_data.model_copy(deep=True))
    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_etymology_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Attach etymology text (and its categories) to the relevant entries."""
    cat_data: dict[str, Any] = {}
    etymology = clean_node(
        wxr, cat_data, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    )
    if etymology == "":
        return
    categories = cat_data.get("categories", [])
    if len(page_data) == 0 or page_data[-1].lang_code != base_data.lang_code:
        # No entry exists yet for this language; stash the etymology on
        # base_data so later POS sections inherit it.
        base_data.etymology_text = etymology
        base_data.categories.extend(categories)
    elif level_node.kind == NodeKind.LEVEL3:
        # A level-3 etymology applies to every entry of the current language.
        current_lang = page_data[-1].lang_code
        for entry in page_data:
            if entry.lang_code == current_lang:
                entry.etymology_text = etymology
                entry.categories.extend(categories)
    else:
        # Deeper-nested etymology belongs only to the latest entry.
        page_data[-1].etymology_text = etymology
        page_data[-1].categories.extend(categories)
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect usage notes: one per list item, or the whole section text."""
    list_nodes = list(level_node.find_child(NodeKind.LIST))
    for list_node in list_nodes:
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            text = clean_node(wxr, None, item.children)
            if text != "":
                word_entry.notes.append(text)
    if not list_nodes:
        # No list structure: treat the entire section body as one note.
        text = clean_node(wxr, None, level_node.children)
        if text != "":
            word_entry.notes.append(text)