Coverage for src / wiktextract / extractor / pl / page.py: 81%
99 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-12 08:09 +0000
1import itertools
2import re
3from typing import Any
5from wikitextprocessor import LevelNode, NodeKind, TemplateNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .etymology import extract_etymology_section
10from .example import extract_example_section
11from .form import FORM_SECTIONS, extract_form_section
12from .inflection import extract_inflection_section
13from .linkage import LINKAGE_TYPES, extract_linkage_section
14from .models import Sense, WordEntry
15from .note import extract_note_section
16from .pos import extract_pos_section
17from .sound import extract_morphology_section, extract_sound_section
18from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Route one entry section to the extractor that handles its title.

    The cleaned section title is registered through
    `wxr.wtp.start_subsection` and then compared against the known Polish
    section titles. Some extractors only run when the corresponding
    `wxr.config.capture_*` flag is set.

    Titles that no branch recognises are reported through `wxr.wtp.debug`.
    "źródła", "klucz" and sections that are known but switched off by
    configuration are skipped without a message.
    """
    # title templates
    # https://pl.wiktionary.org/wiki/Kategoria:Szablony_szablonów_haseł
    title_text = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title_text)
    if title_text == "wymowa" and wxr.config.capture_pronunciation:
        extract_sound_section(wxr, base_data, level_node)
    elif title_text == "znaczenia":
        extract_pos_section(wxr, page_data, base_data, level_node)
    elif title_text == "przykłady":
        extract_example_section(wxr, page_data, base_data, level_node)
    elif title_text == "etymologia" and wxr.config.capture_etymologies:
        extract_etymology_section(wxr, page_data, base_data, level_node)
    elif title_text == "tłumaczenia" and wxr.config.capture_translations:
        extract_translation_section(
            wxr, page_data, level_node, base_data.lang_code
        )
    # NOTE(review): linkage sections are gated by `capture_inflections`;
    # confirm this is intended rather than a dedicated linkage flag.
    elif title_text in LINKAGE_TYPES and wxr.config.capture_inflections:
        extract_linkage_section(
            wxr,
            page_data,
            level_node,
            LINKAGE_TYPES[title_text],
            base_data.lang_code,
        )
    elif title_text in ["uwagi", "składnia"]:
        extract_note_section(wxr, page_data, base_data, level_node)
    elif title_text == "odmiana" and wxr.config.capture_inflections:
        extract_inflection_section(
            wxr, page_data, base_data.lang_code, level_node
        )
    elif title_text in FORM_SECTIONS:
        extract_form_section(
            wxr, page_data, base_data, level_node, FORM_SECTIONS[title_text]
        )
    elif title_text == "morfologia":
        extract_morphology_section(wxr, base_data, level_node)
    else:
        # A config-gated title can only reach this point when its capture
        # flag is off; it is a known section, so reporting it as "unknown"
        # would be misleading.
        disabled_by_config = (
            title_text in ("wymowa", "etymologia", "tłumaczenia", "odmiana")
            or title_text in LINKAGE_TYPES
        )
        if title_text not in ["źródła", "klucz"] and not disabled_by_config:
            wxr.wtp.debug(
                f"Unknown section: {title_text}",
                sortid="extractor/pl/page/parse_section/63",
            )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-2 node is treated as a language section. The language is
    read from the first template that follows an opening parenthesis in the
    level-2 title: the `id` attribute of the first `span` in the expanded
    template becomes the language code, and the cleaned expansion becomes
    the language name. Both stay "unknown" when no such template is found.
    Sections whose code is not in `wxr.config.capture_language_codes`
    (when that is not None) are skipped.

    After all sections are parsed, entries without senses get a "no-gloss"
    placeholder sense, and each entry keeps only the sounds that have an
    empty `sense_index` or one accepted by `match_sense_index`.
    """
    # page layout
    # https://pl.wiktionary.org/wiki/Wikisłownik:Zasady_tworzenia_haseł
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text, pre_expand=True)
    page_data: list[WordEntry] = []
    for lang_node in root.find_child(NodeKind.LEVEL2):
        lang_code = lang_name = "unknown"
        title_categories = {}
        seen_open_paren = False
        for title_part in itertools.chain.from_iterable(lang_node.largs):
            if isinstance(title_part, str):
                # Only a text chunk ending in "(" arms the template lookup.
                if title_part.strip().endswith("("):
                    seen_open_paren = True
                continue
            if not (seen_open_paren and isinstance(title_part, TemplateNode)):
                continue
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(title_part), expand_all=True
            )
            first_span = next(iter(expanded.find_html("span")), None)
            if first_span is not None:
                lang_code = first_span.attrs.get("id", "")
            lang_name = clean_node(wxr, title_categories, expanded)
            break

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            word=wxr.wtp.title,
            lang_code=lang_code,
            lang=lang_name,
            pos="unknown",
            categories=title_categories.get("categories", []),
        )
        for section_node in lang_node.find_child(NodeKind.LEVEL3):
            parse_section(wxr, page_data, base_data, section_node)

    for entry in page_data:
        # The placeholder sense is added before the sounds are filtered, so
        # the filter always sees at least one sense.
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        entry.sounds = [
            sound
            for sound in entry.sounds
            if sound.sense_index == ""
            or match_sense_index(sound.sense_index, entry)
        ]
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def match_sense_index(sense_index: str, word_entry: WordEntry) -> bool:
    """Return whether `sense_index` refers to the POS section of an entry.

    `word_entry` is compared through its first sense (or through itself
    when a `Sense` is passed). The POS section number is the part of that
    sense's `sense_index` before the first ".".

    `sense_index` is interpreted as follows:

    * a single index containing "." and no "," or "-" (like "1.1") must be
      equal to the sense's own index;
    * otherwise it is split on ",", and a match is found when a part
      containing "." has the same number before the "." as the sense, or
      when a part of the form "N-M" is a range that includes the sense's
      POS section number.

    Returns False when the entry has no senses, when `word_entry` is
    neither an entry with `senses` nor a `Sense`, or when the sense's POS
    section number is not made of digits.
    """
    if hasattr(word_entry, "senses"):
        if len(word_entry.senses) == 0:
            return False
        sense = word_entry.senses[0]
    elif isinstance(word_entry, Sense):
        sense = word_entry
    else:
        # Nothing to compare against; previously this left `sense` unbound.
        return False

    # find exact match for index like "1.1"
    is_single_index = (
        "." in sense_index
        and "," not in sense_index
        and "-" not in sense_index
    )
    if is_single_index:
        return sense_index == sense.sense_index

    # Take the prefix from the sense's own index. Slicing it at the dot
    # position of `sense_index` gives a wrong prefix whenever the two
    # strings differ in shape (e.g. "1-3" against "2.1").
    pos_index_str = sense.sense_index.partition(".")[0]
    if not pos_index_str.isdigit():
        return False
    pos_section_index = int(pos_index_str)

    for part_of_index in sense_index.split(","):
        part_of_index = part_of_index.strip()
        if "." in part_of_index:
            if pos_index_str == part_of_index.partition(".")[0]:
                return True
        elif re.fullmatch(r"\d+-\d+", part_of_index):
            start_str, end_str = part_of_index.split("-")
            if int(start_str) <= pos_section_index <= int(end_str):
                return True

    return False