Coverage for src/wiktextract/extractor/cs/page.py: 86%
66 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .declension import extract_declension_section
10from .etymology import extract_etymology_section
11from .linkage import extract_alt_form_section, extract_linkage_section
12from .models import Sense, WordEntry
13from .pos import (
14 extract_note_section,
15 extract_pos_section,
16 extract_sense_section,
17)
18from .section_titles import LINKAGE_SECTIONS, POS_DATA
19from .sound import (
20 extract_homophone_section,
21 extract_hyphenation_section,
22 extract_sound_section,
23 extract_transcript_section,
24)
25from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Route one heading node to the extractor that matches its title.

    The heading text is cleaned, any "(<digits>)" marker is removed, and the
    result is compared against the known section titles.  After the section
    itself is handled, every child heading node is processed recursively and
    `clean_node` is run over the direct child link nodes.

    Args:
        wxr: Extraction context; also used for debug messages.
        page_data: Entries collected so far for the page; passed to the
            part-of-speech extractor and to the recursive calls.
        base_data: Entry holding the data of the current language section;
            used as the target when `page_data` is empty and for the
            sections that are always stored at language level.
        level_node: Heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    subtitle = re.sub(r"\(\d+\)", "", heading).strip()

    has_entry = len(page_data) > 0
    # Most sections attach to the newest entry and fall back to the
    # language-level data when no entry exists yet.
    # NOTE(review): only the variants branch verifies that the newest entry
    # has the same `lang` as `base_data`; confirm the other branches cannot
    # reach an entry created by an earlier language section.
    latest_entry = page_data[-1] if has_entry else base_data

    if subtitle in POS_DATA and level_node.contain_node(LEVEL_KIND_FLAGS):
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
    elif has_entry and subtitle == "význam":
        extract_sense_section(wxr, latest_entry, level_node)
    elif subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
    elif subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
    elif subtitle == "etymologie":
        # A level-3 etymology heading is always stored on the base data.
        if level_node.kind == NodeKind.LEVEL3:
            etymology_target = base_data
        else:
            etymology_target = latest_entry
        extract_etymology_section(wxr, etymology_target, level_node)
    elif subtitle in ("varianty", "varianta zápisu", "varianty zápisu"):
        # Only a level-4 heading whose newest entry shares the language of
        # the base data is stored on that entry.
        use_latest = (
            level_node.kind == NodeKind.LEVEL4
            and has_entry
            and base_data.lang == latest_entry.lang
        )
        alt_form_target = latest_entry if use_latest else base_data
        extract_alt_form_section(wxr, alt_form_target, level_node)
    elif subtitle == "překlady":
        extract_translation_section(wxr, latest_entry, level_node)
    elif subtitle in LINKAGE_SECTIONS:
        linkage_type = LINKAGE_SECTIONS[subtitle]
        extract_linkage_section(wxr, latest_entry, level_node, linkage_type)
    elif subtitle in ("stupňování", "časování") or subtitle.startswith(
        "skloňování"
    ):
        extract_declension_section(wxr, latest_entry, level_node, subtitle)
    elif subtitle == "homofony":
        extract_homophone_section(wxr, base_data, level_node)
    elif subtitle == "přepis":
        extract_transcript_section(wxr, latest_entry, level_node)
    elif subtitle == "poznámka k užití":
        extract_note_section(wxr, latest_entry, level_node)
    elif subtitle not in ("externí odkazy", "poznámky", "reference"):
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # Recomputed here because the calls above may have appended entries.
    # Presumably this records data (e.g. categories) from the links on the
    # entry - TODO confirm against clean_node.
    link_target = page_data[-1] if len(page_data) > 0 else base_data
    for link_node in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_target, link_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page into a list of dumped word entries.

    Page layout reference:
    https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla

    Each level-2 heading is treated as a language section.  Headings titled
    "poznámky" or "externí odkazy" are skipped, as are languages whose code
    is not in `wxr.config.capture_language_codes` when that setting is not
    None.  Entries that end up with no senses receive a single sense tagged
    "no-gloss".

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One dict per entry, produced by `model_dump(exclude_defaults=True)`.
    """
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries: list[WordEntry] = []

    for language_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, language_node.largs) or "unknown"
        if lang_name in ("poznámky", "externí odkazy"):
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    # Make sure every entry carries at least one sense.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]