Coverage for src/wiktextract/extractor/pl/pos.py: 78%
119 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import AltForm, Sense, WordEntry
8from .tags import TAGS, translate_raw_tags
# All POS categories
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_wg_języków
# Polish POS
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_języka_polskiego
#
# Maps the Polish text found in a POS header line to the data stored on the
# word entry: a mandatory "pos" value and an optional list of "tags".
POS_DATA: dict[str, dict] = {
    "czasownik": {"pos": "verb"},
    "czasownika": {"pos": "verb"},
    # Szablon:szwedzki czasownik frazowy
    "czasownik frazowy (partikelverb)": {"pos": "verb", "tags": ["phrase"]},
    "fraza": {"pos": "phrase"},
    "klasyfikator": {"pos": "classifier"},
    "liczebnik": {"pos": "num"},
    "liczebnikowa": {"pos": "num"},
    "międzyrostek": {"pos": "interfix", "tags": ["morpheme"]},
    "morfem": {"pos": "unknown", "tags": ["morpheme"]},
    "określnik": {"pos": "det"},
    "partykuła": {"pos": "particle"},
    "partykułowa": {"pos": "particle"},
    # Szablon:phrasal verb
    "phrasal verb (czasownik frazowy)": {"pos": "verb", "tags": ["phrase"]},
    "przedimek": {"pos": "article"},
    "przedrostek": {"pos": "prefix", "tags": ["morpheme"]},
    "przedrostkowy": {"pos": "prefix", "tags": ["morpheme"]},
    "przyimek": {"pos": "prep"},
    "przyimkowa": {"pos": "prep_phrase"},
    "przymiotnik": {"pos": "adj"},
    "przymiotnikowym": {"pos": "adj"},
    "przymiotnikowa": {"pos": "adj_phrase"},
    "przyrostek": {"pos": "suffix", "tags": ["morpheme"]},
    "przyrostkowy": {"pos": "suffix", "tags": ["morpheme"]},
    "przysłówek": {"pos": "adv"},
    "przysłówkowa": {"pos": "adv_phrase"},
    "pytajny": {"pos": "pron", "tags": ["interrogative"]},  # "zaimek pytajny"
    "rodzajnik": {"pos": "article", "tags": ["gendered"]},
    "rzeczownik": {"pos": "noun"},
    "rzeczownikowa": {"pos": "noun"},
    "skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]},
    "skrót": {"pos": "abbrev", "tags": ["abbreviation"]},
    "spójnik": {"pos": "conj"},
    "symbol": {"pos": "symbol"},
    "wrostek": {"pos": "infix", "tags": ["morpheme"]},
    "wykrzyknik": {"pos": "intj"},
    "wykrzyknika": {"pos": "intj"},
    "wykrzyknikowa": {"pos": "intj"},
    "zaimka": {"pos": "pron"},
    "zaimek": {"pos": "pron"},
    "zaimkowy": {"pos": "pron"},
    "znak interpunkcyjny": {"pos": "punct", "tags": ["punctuation"]},
    "dopełniacz saksoński": {"pos": "unknown"},
    "forma ściągnięta": {
        "pos": "contraction",
        "tags": ["contraction", "form-of"],
    },
    "słowotwórczy": {"pos": "unknown", "tags": ["morpheme"]},
    "liczebnik porządkowy": {"pos": "adj", "tags": ["ordinal"]},
    "liczebnik główny": {"pos": "adj", "tags": ["cardinal"]},
    "litera": {"pos": "character", "tags": ["letter"]},
    "związek frazeologiczny": {"pos": "phrase", "tags": ["idiomatic"]},
    "związek wyrazów": {"pos": "unknown"},
    "sentencja łacińska": {"pos": "unknown"},
    "imiesłów": {"pos": "verb", "tags": ["participle"]},
    "postpozycja": {"pos": "postp"},
    "zwrot": {"pos": "phrase"},
    "słowo pomocnicze": {"pos": "unknown"},
    "wyrażenie": {"pos": "phrase"},
    "czasownik frazowy": {"pos": "verb", "tags": ["phrasal"]},
    "zaimek osobowy": {"pos": "pron", "tags": ["person"]},
    "zaimek pytajny": {"pos": "pron", "tags": ["interrogative"]},
    "zbitka": {"pos": "unknown"},
    "nazwa własna": {"pos": "name"},
    "rzeczownik odczasownikowy": {
        "pos": "verb",
        "tags": ["participle", "gerund"],
    },
}
# Category:Proverb Templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
#
# POS data selected when the cleaned text of a header template merely
# *starts with* one of these keys (checked with ``str.startswith``).
POS_PREFIXES: dict[str, dict] = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}
# Words of a POS header line that are neither a POS nor worth keeping as a
# raw tag.
IGNORE_POS_LINE_TEXT = frozenset({"rodzaj", "lub"})
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract POS header lines and gloss lists from a section.

    Only the italic and list children of ``level_node`` are visited.

    * An italic node is handed to ``process_pos_line_italic_node`` when the
      previously handled node was a list (or nothing has been handled yet);
      an italic node that directly follows another italic node is skipped.
    * Every item of a list node is parsed with ``process_gloss_list_item``
      into the last entry of ``page_data``. If no POS line has been
      recognised so far, a deep copy of ``base_data`` is appended first to
      receive the glosses.
    """
    found_pos = False
    previous_was_list = True
    for child in level_node.find_child(NodeKind.ITALIC | NodeKind.LIST):
        if child.kind == NodeKind.ITALIC:
            if not previous_was_list:
                continue
            if process_pos_line_italic_node(wxr, page_data, base_data, child):
                found_pos = True
            previous_was_list = False
        elif child.kind == NodeKind.LIST:
            if not found_pos:
                page_data.append(base_data.model_copy(deep=True))
            for item in child.find_child(NodeKind.LIST_ITEM):
                process_gloss_list_item(wxr, page_data[-1], item)
            previous_was_list = True
def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> bool:
    """Parse one italic POS header line into a new word entry.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled in
    from the children of ``italic_node``:

    * template children are cleaned to text and looked up in ``POS_DATA``
      (``forma ...`` templates use only the part before the first ", " and
      mark the entry as "form-of"), then in ``POS_PREFIXES``, then word by
      word in ``POS_DATA``; unmatched text becomes a raw tag unless it is
      listed in ``IGNORE_POS_LINE_TEXT``;
    * string children are looked up as a whole, then per comma-separated
      part, then per word; leftovers become raw tags.

    Returns ``True`` when a POS was recognised. Otherwise the entry that was
    appended is removed again and ``False`` is returned.
    """
    has_pos = False
    page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            child_text = clean_node(wxr, entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                pos_text = child_text.split(", ")[0]
                if pos_text in POS_DATA:
                    update_pos_data(entry, pos_text, POS_DATA[pos_text])
                    has_pos = True
                # Some POS_DATA entries (e.g. "forma ściągnięta") already
                # contribute "form-of"; do not record the tag twice.
                if "form-of" not in entry.tags:
                    entry.tags.append("form-of")
            elif child_text in POS_DATA:
                update_pos_data(entry, child_text, POS_DATA[child_text])
                has_pos = True
            else:
                for prefix, pos_data in POS_PREFIXES.items():
                    if child_text.startswith(prefix):
                        update_pos_data(entry, child_text, pos_data)
                        has_pos = True
                        break
                if not has_pos:
                    for text in child_text.split():
                        if text in POS_DATA:
                            update_pos_data(entry, text, POS_DATA[text])
                            has_pos = True
                            break
                if not has_pos and child_text not in IGNORE_POS_LINE_TEXT:
                    entry.raw_tags.append(child_text)
        elif isinstance(child, str):
            if child.strip() in POS_DATA:
                child = child.strip()
                update_pos_data(entry, child, POS_DATA[child])
                has_pos = True
            else:
                for text in child.strip(", ").split(","):
                    text = text.strip()
                    if text in POS_DATA:
                        update_pos_data(entry, text, POS_DATA[text])
                        has_pos = True
                    elif text in TAGS:
                        entry.raw_tags.append(text)
                    else:
                        for t in text.split():
                            if t in POS_DATA:
                                update_pos_data(entry, t, POS_DATA[t])
                                has_pos = True
                            elif t not in IGNORE_POS_LINE_TEXT:
                                entry.raw_tags.append(t)
    translate_raw_tags(entry)
    if not has_pos:
        # Nothing recognisable on this line: discard the entry added above.
        page_data.pop()
    return has_pos
def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Store POS information on ``word_entry``.

    ``pos_data`` is a ``POS_DATA``/``POS_PREFIXES`` style mapping with a
    mandatory ``"pos"`` key and an optional ``"tags"`` list. ``pos_text`` is
    the original header text and is saved as ``word_entry.pos_text``.

    Tags are appended in order, skipping any tag the entry already has, so
    calling this more than once for the same entry does not duplicate tags.
    """
    word_entry.pos = pos_data["pos"]
    for tag in pos_data.get("tags", ()):
        if tag not in word_entry.tags:
            word_entry.tags.append(tag)
    word_entry.pos_text = pos_text
def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Turn one gloss list item into a ``Sense`` of ``word_entry``.

    Templates named "wikipedia" are skipped. Every other template is passed
    to ``process_form_of_template`` and then expanded; if the expanded text
    ends with "." and the template has no parameters it is kept as a raw
    tag, otherwise its expanded nodes become part of the gloss.

    A leading "(N.N)" marker is stored as the sense index and removed from
    the gloss. When ``word_entry`` is tagged "form-of" and no form-of target
    was found by a template, the last link in the gloss is used as the
    target. The sense is added only if the remaining gloss text is not empty.
    """
    sense = Sense()
    gloss_parts = []
    tag_texts = []
    for child in list_item_node.children:
        if not isinstance(child, TemplateNode):
            gloss_parts.append(child)
            continue
        if child.template_name == "wikipedia":
            continue
        process_form_of_template(wxr, sense, child)
        template_wikitext = wxr.wtp.node_to_wikitext(child)
        expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
        expanded_text = clean_node(wxr, sense, expanded.children)
        if (
            expanded_text.endswith(".")
            and len(child.template_parameters) == 0
        ):
            # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
            tag_texts.append(expanded_text)
        else:
            gloss_parts.extend(expanded.children)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    index_match = re.match(r"\(\d+\.\d+\)", gloss_text)
    if index_match is None:
        sense_index = ""
    else:
        sense_index = index_match.group(0).strip("()")
        gloss_text = gloss_text[index_match.end() :].strip("=; ")

    if "form-of" in word_entry.tags and len(sense.form_of) == 0:
        # Fall back to the last link of the gloss as the form-of target.
        form_of = ""
        for part in gloss_parts:
            if isinstance(part, WikiNode) and part.kind == NodeKind.LINK:
                form_of = clean_node(wxr, None, part)
        if len(form_of) > 0:
            sense.form_of.append(AltForm(word=form_of))

    if len(gloss_text) == 0:
        return
    sense.raw_tags = tag_texts
    sense.sense_index = sense_index
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)
def process_form_of_template(
    wxr: WiktextractContext, sense: Sense, template_node: TemplateNode
) -> None:
    """Record the form-of target given by a "zob-ekwiw-pupr" template.

    Any other template is ignored. For a matching template the sense gets
    the "form-of" tag (once) and the cleaned first positional parameter is
    appended to ``sense.form_of``.
    """
    if template_node.template_name != "zob-ekwiw-pupr":
        return
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    target = template_node.template_parameters.get(1, "")
    sense.form_of.append(AltForm(word=clean_node(wxr, None, target)))