Coverage for src/wiktextract/extractor/cs/page.py: 86%
51 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
2from typing import Any
4from mediawiki_langcodes import name_to_code
5from wikitextprocessor.parser import LEVEL_KIND_FLAGS, LevelNode, NodeKind
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .linkage import extract_alt_form_section
10from .models import Sense, WordEntry
11from .pos import extract_pos_section, extract_sense_section
12from .section_titles import POS_DATA
13from .sound import extract_hyphenation_section, extract_sound_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
):
    """Process one heading node and then everything nested beneath it.

    The heading text is cleaned, stripped of parenthesised numeric markers
    such as ``(1)``, and handed to the matching section extractor.  Nested
    level nodes are then parsed recursively, and finally every direct link
    child of ``level_node`` is passed through ``clean_node``.

    Args:
        wxr: Extraction context (also used for debug messages).
        page_data: Word entries collected so far for the page.
        base_data: Entry holding the data shared within the current language.
        level_node: The heading node to process.
    """
    heading = clean_node(wxr, None, level_node.largs)
    heading = re.sub(r"\(\d+\)", "", heading).strip()
    _handle_section_title(wxr, page_data, base_data, level_node, heading)

    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level)

    # Links directly under this heading are cleaned against the most recent
    # entry, or against the shared base data when no entry exists yet
    # (presumably so clean_node can record link side effects -- confirm).
    link_target = page_data[-1] if page_data else base_data
    for link in level_node.find_child(NodeKind.LINK):
        clean_node(wxr, link_target, link)


def _handle_section_title(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    subtitle: str,
) -> None:
    """Run the extractor that corresponds to an already-normalised title.

    Checks are made in a fixed order and the first match wins.  A title that
    matches nothing (other than "externí odkazy") is reported through the
    debug log.
    """
    # A title listed in POS_DATA only counts as a part-of-speech section
    # when the node actually contains nested level nodes.
    if subtitle in POS_DATA and level_node.contain_node(LEVEL_KIND_FLAGS):
        extract_pos_section(wxr, page_data, base_data, level_node, subtitle)
        return
    # Senses attach to the latest entry, so at least one entry must exist;
    # otherwise the title falls through to the debug message below.
    if subtitle == "význam" and page_data:
        extract_sense_section(wxr, page_data[-1], level_node)
        return
    if subtitle == "výslovnost":
        extract_sound_section(wxr, base_data, level_node)
        return
    if subtitle == "dělení":
        extract_hyphenation_section(wxr, base_data, level_node)
        return
    if subtitle == "etymologie":
        # Only the non-heading children make up the etymology text.
        body_nodes = list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
        base_data.etymology_text = clean_node(wxr, base_data, body_nodes)
        return
    if subtitle == "varianty":
        extract_alt_form_section(wxr, base_data, level_node)
        return
    if subtitle != "externí odkazy":
        wxr.wtp.debug(f"Unknown title: {subtitle}", sortid="cs/page/27")
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Czech Wiktionary page into a list of entry dictionaries.

    Every level-2 heading is treated as a language section, except the
    headings "poznámky" and "externí odkazy", which are skipped.  Sections
    whose language code is not among the configured capture codes are
    skipped as well.  Entries that end up without any sense receive a
    single placeholder sense tagged "no-gloss".

    Args:
        wxr: Extraction context.
        page_title: Title of the page being parsed.
        page_text: Wikitext of the page.

    Returns:
        One dictionary per collected entry, dumped with default values
        excluded.
    """
    # Page layout reference:
    # https://cs.wiktionary.org/wiki/Wikislovník:Formát_hesla
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    entries = []
    skipped_headings = ("poznámky", "externí odkazy")

    for language_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, language_node.largs) or "unknown"
        if lang_name in skipped_headings:
            continue

        lang_code = name_to_code(lang_name, "cs") or "unknown"
        wanted_codes = wxr.config.capture_language_codes
        # A value of None means no language filter is applied.
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        # Template entry for this language; it is passed to every section
        # parsed below.
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
        )
        for section_node in language_node.find_child(LEVEL_KIND_FLAGS):
            parse_section(wxr, entries, base_data, section_node)

    # Make sure every entry carries at least one sense before dumping.
    for entry in entries:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))

    return [entry.model_dump(exclude_defaults=True) for entry in entries]