Coverage for src/wiktextract/extractor/id/page.py: 85%
55 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import string
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .linkage import extract_linkage_section
11from .models import Sense, WordEntry
12from .pos import extract_pos_section, extract_usage_section
13from .section_titles import LINKAGE_SECTIONS, POS_DATA
14from .sound import extract_sound_section
15from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one section heading to its extractor, then walk its children.

    The heading text is cleaned and stripped of trailing digits and
    whitespace before it is compared against the known section titles.
    After the dispatch, nested level nodes are handled recursively, and the
    direct LINK children and ``*-cat`` templates of this section are passed
    through ``clean_node`` together with the current entry (presumably so
    that category data ends up on that entry -- confirm against
    ``clean_node``).
    """

    def latest_entry() -> WordEntry:
        # Re-evaluated on every call: ``page_data`` may gain entries while
        # this section (or one of its subsections) is being processed.
        return page_data[-1] if page_data else base_data

    sound_titles = (
        "Pelafalan",
        "Ejaan",
        "Pengucapan",
        "Suara",
        "Pemenggalan kata",
    )
    usage_titles = ("Penggunaan", "Catatan penggunaan", "Catatan")
    # Recognised headings that are skipped without a debug message.
    silent_titles = (
        "Bacaan lebih lanjut",
        "Referensi",
        "Pranala luar",
        "Rujukan",
        "Acuan",
        "Bacaan lanjutan",
    )

    raw_title = clean_node(wxr, None, level_node.largs)
    title_text = raw_title.rstrip(string.digits + string.whitespace)
    wxr.wtp.start_subsection(title_text)

    if title_text in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, title_text)
    elif title_text == "Etimologi":
        extract_etymology_section(wxr, latest_entry(), level_node)
    elif title_text == "Terjemahan":
        extract_translation_section(wxr, latest_entry(), level_node)
    elif title_text in sound_titles:
        extract_sound_section(wxr, latest_entry(), level_node)
    elif title_text in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr,
            latest_entry(),
            level_node,
            LINKAGE_SECTIONS[title_text],
        )
    elif title_text in usage_titles:
        extract_usage_section(wxr, latest_entry(), level_node)
    elif title_text not in silent_titles:
        wxr.wtp.debug(f"Unknown section: {title_text}", sortid="id/page/47")

    # Subsections go through the same dispatch.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, latest_entry(), link)

    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not template.template_name.endswith("-cat"):
            continue
        clean_node(wxr, latest_entry(), template)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Pages whose title starts with one of the skipped prefixes yield an empty
    list. Otherwise every level-2 node is treated as a language section: a
    base ``WordEntry`` is built from its heading and each child level node
    is handed to ``parse_section``. Entries that end up without any sense
    receive a single ``no-gloss`` sense before being dumped.
    """
    # page layout
    # https://id.wiktionary.org/wiki/Wikikamus:Penjelasan_tataletak_entri
    # https://id.wiktionary.org/wiki/Wikikamus:Format_Kamus
    skipped_prefixes = ("Portal:", "Rekonstruksi:", "Thesaurus:", "WK:")
    if page_title.startswith(skipped_prefixes):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        # Filled in by clean_node; only the "categories" key is read below.
        heading_data = {}
        lang_name = clean_node(wxr, heading_data, lang_node.largs)
        if not lang_name:
            lang_name = "unknown"

        # Headings may carry a "bahasa " prefix; drop it before the lookup.
        lookup_name = lang_name.lower().removeprefix("bahasa ")
        lang_code = name_to_code(lookup_name, "id")
        if not lang_code:
            lang_code = "unknown"

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=heading_data.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, page_data, base_data, section_node)

    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in page_data]