Coverage for src/wiktextract/extractor/ku/page.py: 74%
71 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-09 14:03 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-09 14:03 +0000
1import string
2from typing import Any
4from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .descendant import extract_descendant_section
9from .etymology import extract_etymology_section
10from .example import extract_example_section
11from .linkage import extract_linkage_section
12from .models import Sense, WordEntry
13from .pos import extract_pos_section
14from .section_titles import LINKAGE_SECTIONS, LINKAGE_TAGS, POS_DATA
15from .sound import extract_sound_section
16from .translation import extract_translation_section, is_translation_page
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one heading node to its extractor, then recurse into sub-levels.

    The heading text is cleaned and stripped of trailing digits and
    whitespace before it is matched against the known section titles.

    Args:
        wxr: Extraction context; also used to record the current subsection
            and to emit debug messages for unrecognised titles.
        page_data: Entries collected so far for the page; may be mutated by
            the extractors and by the empty-POS cleanup below.
        base_data: Data shared by the current language section. It is used as
            the target when ``page_data`` is empty, and always for the
            "Bilêvkirin" section.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs).rstrip(
        string.digits + string.whitespace
    )
    wxr.wtp.start_subsection(heading)

    if heading in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, heading)
        # A title that is both a POS and a linkage title, and that produced
        # no senses, is re-read as a linkage section: the entry just added is
        # dropped and the target is whatever entry is last after the pop.
        if len(page_data[-1].senses) == 0 and heading in LINKAGE_SECTIONS:
            page_data.pop()
            extract_linkage_section(
                wxr,
                page_data[-1] if page_data else base_data,
                level_node,
                LINKAGE_SECTIONS[heading],
                LINKAGE_TAGS.get(heading, []),
            )
    else:
        # Non-POS sections attach to the most recent entry, or to the shared
        # base data when no entry has been created yet.
        target = page_data[-1] if page_data else base_data
        if heading == "Etîmolojî":
            extract_etymology_section(wxr, target, level_node)
        elif heading in ("Werger", "Bi zaravayên din"):
            extra_tags = ["dialectal"] if heading == "Bi zaravayên din" else []
            extract_translation_section(
                wxr, target, level_node, tags=extra_tags
            )
        elif heading in ("Bi alfabeyên din", "Herwiha", "Bide ber"):
            # These titles are checked before LINKAGE_SECTIONS and are
            # extracted with an empty linkage type.
            extract_linkage_section(wxr, target, level_node, "")
        elif heading in LINKAGE_SECTIONS:
            extract_linkage_section(
                wxr,
                target,
                level_node,
                LINKAGE_SECTIONS[heading],
                LINKAGE_TAGS.get(heading, []),
            )
        elif heading == "Bilêvkirin":
            # Sound data always goes to base_data, never to the last entry.
            extract_sound_section(wxr, base_data, level_node)
        elif heading in ("Ji wêjeyê", "Ji wêjeya klasîk"):
            extract_example_section(wxr, target, level_node)
        elif heading == "Bikaranîn":
            extract_note_section(wxr, target, level_node)
        elif heading == "Dûnde":
            extract_descendant_section(wxr, target, level_node)
        elif heading != "Çavkanî":
            wxr.wtp.debug(f"Unknown title: {heading}")

    # Nested headings are processed regardless of which branch ran above.
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page into a list of serialised word entries.

    Args:
        wxr: Extraction context used for parsing and for section tracking.
        page_title: Title of the page; pages for which
            ``is_translation_page`` is true yield an empty list.
        page_text: Wikitext of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dict per collected entry.
    """
    # Page layout references:
    # https://ku.wiktionary.org/wiki/Wîkîferheng:Normalkirina_gotaran
    # https://ku.wiktionary.org/wiki/Alîkarî:Formata_nivîsînê
    if is_translation_page(page_title):
        return []

    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    entries: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        category_data = {}
        lang_name = clean_node(wxr, category_data, lang_node.largs)

        # The language code is read from the first positional argument of the
        # templates in the heading; the last non-empty value wins.
        lang_code = "unknown"
        for template in lang_node.find_content(NodeKind.TEMPLATE):
            candidate = clean_node(
                wxr, None, template.template_parameters.get(1, "")
            )
            if candidate != "":
                lang_code = candidate

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=category_data.get("categories", []),
        )
        for section_node in lang_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    # Entries that ended up with no senses get a placeholder sense.
    for entry in entries:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Append the cleaned text of each list item in the section to the notes.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``notes`` list receives the non-empty texts.
        level_node: Section node whose direct list children are scanned.
    """
    for bullet_list in level_node.find_child(NodeKind.LIST):
        for item in bullet_list.find_child(NodeKind.LIST_ITEM):
            text = clean_node(wxr, None, item.children)
            if text == "":
                # Items that clean down to nothing are skipped.
                continue
            word_entry.notes.append(text)