Coverage for src/wiktextract/extractor/pl/page.py: 79%
94 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import itertools
2import re
3from typing import Any
5from wikitextprocessor import LevelNode, NodeKind, TemplateNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .example import extract_example_section
11from .form import FORM_SECTIONS, extract_form_section
12from .inflection import extract_inflection_section
13from .linkage import LINKAGE_TYPES, extract_linkage_section
14from .models import Sense, WordEntry
15from .note import extract_note_section
16from .pos import extract_pos_section
17from .sound import extract_morphology_section, extract_sound_section
18from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Dispatch one section of a language entry to its extractor.

    The section is identified by the cleaned text of its heading
    (`level_node.largs`).  `parse_page` calls this for every level-3
    node below a language heading.

    Sections guarded by a `wxr.config.capture_*` flag are only extracted
    when that flag is truthy.  A guarded section whose flag is off falls
    through the rest of the chain and, unless a later branch matches, is
    reported through the final "Unknown section" debug message.

    :param wxr: extraction context; supplies the config flags and the
        `wtp` processor used for sub-section tracking and debug output.
    :param page_data: entries collected so far for the page; passed on to
        the extractors.
    :param base_data: entry created for the current language section;
        its `lang_code` is forwarded to several extractors.
    :param level_node: the section node to process.
    """
    # title templates
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    config = wxr.config
    if title_text == "wymowa" and config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # Linkage sections are controlled by the linkage flag; the inflection
    # flag only governs the "odmiana" branch below.
    elif title_text in LINKAGE_TYPES and config.capture_linkages:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text in ["uwagi", "składnia"]:
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text in FORM_SECTIONS:
        extract_form_section(
            wxr, page_data, base_data, level_node, FORM_SECTIONS[title_text]
        )
    elif title_text == "morfologia":
        extract_morphology_section(wxr, base_data, level_node)
    elif title_text not in ["źródła", "klucz"]:
        # "źródła" and "klucz" are deliberately skipped without a message
        wxr.wtp.debug(
            f"Unknown section: {title_text}",
            sortid="extractor/pl/page/parse_section/63",
        )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Every level-2 node of the parsed page is treated as a language
    section.  The language is taken from the first template that follows
    a text fragment ending in "(" inside the heading: the template is
    expanded, the `id` attribute of its first `span` tag becomes the
    language code and its cleaned text becomes the language name.  Both
    stay "unknown" when no such template is found.

    Sections whose language code is not in
    `wxr.config.capture_language_codes` are skipped (a value of `None`
    disables this filter).  Entries that end up without any sense get a
    single `Sense` tagged "no-gloss".

    :param wxr: extraction context.
    :param page_title: title of the page, passed to `wtp.start_page`.
    :param page_text: wikitext of the page.
    :return: one `model_dump(exclude_defaults=True)` dict per entry.
    """
    # page layout
    # https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []

    for level2_node in root.find_child(NodeKind.LEVEL2):
        lang_code, lang_name = "unknown", "unknown"
        lang_title_cats = {}
        seen_open_paren = False

        for title_part in itertools.chain.from_iterable(level2_node.largs):
            if isinstance(title_part, str):
                # remember that the heading text opened a parenthesis
                if title_part.strip().endswith("("):
                    seen_open_paren = True
                continue
            if not (seen_open_paren and isinstance(title_part, TemplateNode)):
                continue

            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(title_part),
                expand_all=True,
            )
            # only the first span of the expanded template is consulted
            first_span = next(iter(expanded.find_html("span")), None)
            if first_span is not None:
                lang_code = first_span.attrs.get("id", "")
            lang_name = clean_node(wxr, lang_title_cats, expanded)
            break

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=lang_title_cats.get("categories", []),
        )
        for level3_node in level2_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, level3_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Tell whether `sense_index` refers to the given entry or sense.

    `word_entry` may be an object with a `senses` list (its first sense
    is used) or a `Sense` itself.

    * A single dotted index such as "1.1" must equal the sense's
      `sense_index` exactly.
    * Anything else (comma-separated lists, ranges, undotted values) is
      matched on the POS section number, i.e. the digits before the
      first "." of the sense's own index: a dotted part like "1.2"
      matches when its leading number is the same, and a range part like
      "1-3" matches when the POS number lies inside the inclusive range.

    :param sense_index: index text, e.g. "1.1", "1.1, 1.3" or "1-3".
    :param word_entry: entry (or sense) to compare against.
    :return: `True` on a match; `False` otherwise, including when the
        entry has no senses or the sense index has no numeric prefix.
    """
    if hasattr(word_entry, "senses"):
        if len(word_entry.senses) == 0:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # neither an entry nor a sense: nothing to compare against
        return False

    # find exact match for index like "1.1"
    is_single_index = (
        "." in sense_index
        and "," not in sense_index
        and "-" not in sense_index
    )
    if is_single_index:
        return sense_index == sense.sense_index

    # The POS number must come from the sense's own index.  Slicing it
    # with the position of "." in the *argument* breaks as soon as the
    # two numbers differ in width or the argument has no "." at all.
    pos_index_str = sense.sense_index.partition(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if "." in part_of_index:
            if pos_index_str == part_of_index.partition(".")[0]:
                return True
        # ASCII digits only: `\d` also accepts characters int() rejects
        elif re.fullmatch(r"[0-9]+-[0-9]+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False