Coverage for src / wiktextract / extractor / cs / page.py: 85%
68 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .declension import extract_declension_section
10from .etymology import extract_etymology_section
11from .linkage import extract_alt_form_section, extract_linkage_section
12from .models import Sense, WordEntry
13from .pos import (
14 extract_note_section,
15 extract_pos_section,
16 extract_sense_section,
17)
18from .section_titles import LINKAGE_SECTIONS, POS_DATA
19from .sound import (
20 extract_homophone_section,
21 extract_hyphenation_section,
22 extract_sound_section,
23 extract_transcript_section,
24)
25from .translation import extract_translation_section
def _dispatch_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    subtitle: str,
) -> None:
    """Call the extractor that matches an already-normalized section title.

    The checks run in a fixed order and the first match wins. Titles that
    match nothing are reported through ``wxr.wtp.debug`` unless they are one
    of the explicitly ignored headings.
    """

    def latest_or_base() -> WordEntry:
        # Last entry collected so far, or ``base_data`` when the list is
        # still empty.
        # NOTE(review): ``page_data`` appears to be shared by every language
        # section of a page, so this may return an entry that belongs to a
        # previous language -- confirm that this is intended.
        return page_data[-1] if page_data else base_data

    if subtitle in POS_DATA:
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        return

    # A sense section is only handled when there is an entry to attach to;
    # otherwise the title falls through to the "unknown title" report below.
    if subtitle == "význam" and page_data:
        extract_sense_section(wxr, page_data[-1], level_node)
        return

    if subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
        return

    if subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
        return

    if subtitle == "etymologie":
        # Level-3 etymology sections always go to ``base_data``; deeper ones
        # go to the latest entry when one exists.
        if level_node.kind == NodeKind.LEVEL3:
            etymology_target = base_data
        else:
            etymology_target = latest_or_base()
        extract_etymology_section(wxr, etymology_target, level_node)
        return

    if subtitle in ("varianty", "varianta zápisu", "varianty zápisu"):
        # Only a level-4 section whose latest entry has the same language
        # name as ``base_data`` is attached to that entry.
        if (
            level_node.kind == NodeKind.LEVEL4
            and page_data
            and base_data.lang == page_data[-1].lang
        ):
            alt_form_target = page_data[-1]
        else:
            alt_form_target = base_data
        extract_alt_form_section(wxr, alt_form_target, level_node)
        return

    if subtitle == "překlady":
        extract_translation_section(wxr, latest_or_base(), level_node)
        return

    if subtitle in LINKAGE_SECTIONS:
        extract_linkage_section(
            wxr, latest_or_base(), level_node, LINKAGE_SECTIONS[subtitle]
        )
        return

    is_inflection_title = subtitle in (
        "stupňování",
        "časování",
    ) or subtitle.startswith("skloňování")
    if is_inflection_title:
        extract_declension_section(
            wxr, latest_or_base(), level_node, subtitle
        )
        return

    if subtitle == "homofony":
        extract_homophone_section(wxr, base_data, level_node)
        return

    if subtitle == "přepis":
        extract_transcript_section(wxr, latest_or_base(), level_node)
        return

    if subtitle == "poznámka k užití":
        extract_note_section(wxr, latest_or_base(), level_node)
        return

    # These headings are skipped silently; anything else is reported.
    if subtitle not in ("externí odkazy", "poznámky", "reference"):
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Process one heading node and, recursively, its nested headings.

    The heading text is normalized (parenthesized numbers removed, only the
    part before the first "/" kept) and used to pick an extractor. Child
    heading nodes are then parsed recursively, and finally every direct link
    child is passed to ``clean_node`` together with the latest entry.

    Args:
        wxr: Extraction context, passed through to every extractor.
        page_data: Entries collected so far for the page.
        base_data: Data for the current language section; used when no
            entry is available or the section applies to the whole language.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    # Strip numbering such as "(1)" from the heading text.
    subtitle = re.sub(r"\(\d+\)", "", heading).strip()
    if "/" in subtitle:
        # Keep only the text before the first slash.
        subtitle = subtitle.partition("/")[0].strip()

    _dispatch_section(wxr, page_data, base_data, level_node, subtitle)

    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)

    # The target entry is re-evaluated for every link, after the nested
    # sections above may have added entries.
    # NOTE(review): presumably this records category links on the entry --
    # confirm against ``clean_node``.
    for link_node in level_node.find_child(NodeKind.LINK):
        link_target = page_data[-1] if page_data else base_data
        clean_node(wxr, link_target, link_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its entries as plain dictionaries.

    Each level-2 heading is treated as a language section. Its nested
    headings are handed to ``parse_section``, which fills the shared entry
    list. Entries that end up without any sense receive a placeholder sense
    tagged ``no-gloss``.

    Args:
        wxr: Extraction context providing the parser and configuration.
        page_title: Title of the page being parsed.
        page_text: Wikitext source of the page.

    Returns:
        One ``model_dump(exclude_defaults=True)`` dictionary per entry.
    """
    # page layout
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []
    skipped_headings = ("poznámky", "externí odkazy")

    for language_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, language_node.largs) or "unknown"
        if lang_name in skipped_headings:
            # These level-2 headings are not treated as language sections.
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            # The configuration restricts extraction to other languages.
            continue

        wxr.wtp.start_section(lang_name)
        language_base = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, language_base, section_node)

    # Make sure every entry carries at least one sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]