Coverage for src/wiktextract/extractor/pl/pos.py: 83%
96 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import AltForm, Sense, WordEntry
8from .tags import translate_raw_tags
# Lookup table from the Polish part-of-speech words found in a POS line to
# the data stored on the word entry: a "pos" code plus optional extra "tags".
#
# All POS categories:
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_wg_języków
# Polish POS:
# https://pl.wiktionary.org/wiki/Kategoria:Części_mowy_języka_polskiego
POS_DATA = {
    # verbs
    "czasownik": {"pos": "verb"},
    "czasownika": {"pos": "verb"},
    # Szablon:szwedzki czasownik frazowy
    "czasownik frazowy (partikelverb)": {"pos": "verb", "tags": ["phrase"]},
    "fraza": {"pos": "phrase"},
    "klasyfikator": {"pos": "classifier"},
    # numerals
    "liczebnik": {"pos": "num"},
    "liczebnikowa": {"pos": "num"},
    "międzyrostek": {"pos": "interfix", "tags": ["morpheme"]},
    "morfem": {"pos": "unknown", "tags": ["morpheme"]},
    "określnik": {"pos": "det"},
    # particles
    "partykuła": {"pos": "particle"},
    "partykułowa": {"pos": "particle"},
    # Szablon:phrasal verb
    "phrasal verb (czasownik frazowy)": {"pos": "verb", "tags": ["phrase"]},
    "przedimek": {"pos": "article"},
    "przedrostek": {"pos": "prefix", "tags": ["morpheme"]},
    # prepositions
    "przyimek": {"pos": "prep"},
    "przyimkowa": {"pos": "prep_phrase"},
    # adjectives
    "przymiotnik": {"pos": "adj"},
    "przymiotnikowym": {"pos": "adj"},
    "przymiotnikowa": {"pos": "adj_phrase"},
    "przyrostek": {"pos": "suffix", "tags": ["morpheme"]},
    # adverbs
    "przysłówek": {"pos": "adv"},
    "przysłówkowa": {"pos": "adv_phrase"},
    # second word of "zaimek pytajny"
    "pytajny": {"pos": "pron", "tags": ["interrogative"]},
    "rodzajnik": {"pos": "article", "tags": ["gendered"]},
    # nouns
    "rzeczownik": {"pos": "noun"},
    "rzeczownikowa": {"pos": "noun"},
    "skrótowiec": {"pos": "abbrev", "tags": ["abbreviation"]},
    "spójnik": {"pos": "conj"},
    "symbol": {"pos": "symbol"},
    "wrostek": {"pos": "infix", "tags": ["morpheme"]},
    # interjections
    "wykrzyknik": {"pos": "intj"},
    "wykrzyknika": {"pos": "intj"},
    "wykrzyknikowa": {"pos": "intj"},
    # pronouns
    "zaimka": {"pos": "pron"},
    "zaimek": {"pos": "pron"},
    "zaimkowy": {"pos": "pron"},
}
# POS data selected by a prefix match on the expanded template text rather
# than by an exact match.
# Category:Proverb Templates
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
POS_PREFIXES = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}
# Words in a POS line that are neither a POS nor kept as a raw tag.
IGNORE_POS_LINE_TEXT = frozenset({"rodzaj"})
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Walk the italic and list children of a POS section.

    Italic children are handed to ``process_pos_line_italic_node``. Every
    list item of a list child is handed to ``process_gloss_list_item``
    together with the last entry of ``page_data``; when ``page_data`` is
    still empty a deep copy of ``base_data`` is appended first.
    """
    wanted_kinds = NodeKind.ITALIC | NodeKind.LIST
    for child in level_node.find_child(wanted_kinds):
        if child.kind == NodeKind.ITALIC:
            process_pos_line_italic_node(wxr, page_data, base_data, child)
            continue
        if child.kind != NodeKind.LIST:
            continue
        for item in child.find_child(NodeKind.LIST_ITEM):
            if not page_data:
                # Glosses found before any POS line still need an entry.
                page_data.append(base_data.model_copy(deep=True))
            process_gloss_list_item(wxr, page_data[-1], item)
def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> None:
    """Parse an italic POS line into a new entry appended to ``page_data``.

    A deep copy of ``base_data`` is appended up front and filled in from the
    children of ``italic_node``:

    * templates whose name starts with ``"forma "`` add the "form-of" tag
      and use the text before the first ``", "`` as the POS text;
    * other templates are matched exactly against ``POS_DATA``, then by
      prefix against ``POS_PREFIXES``; anything left that is not listed in
      ``IGNORE_POS_LINE_TEXT`` becomes a raw tag;
    * plain strings are split on whitespace and each word is handled the
      same way (exact ``POS_DATA`` match, otherwise raw tag).

    If no POS was recognised at all, the appended entry is removed again.
    """
    has_pos = False
    page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            child_text = clean_node(wxr, entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                pos_text = child_text.split(", ")[0]
                if pos_text in POS_DATA:
                    update_pos_data(entry, pos_text, POS_DATA[pos_text])
                    has_pos = True
                entry.tags.append("form-of")
            elif child_text in POS_DATA:
                update_pos_data(entry, child_text, POS_DATA[child_text])
                has_pos = True
            else:
                for prefix, pos_data in POS_PREFIXES.items():
                    if child_text.startswith(prefix):
                        update_pos_data(entry, child_text, pos_data)
                        # A prefix match is a recognised POS like any other;
                        # without this flag the entry that was just
                        # classified would be popped at the end.
                        has_pos = True
                        break
                else:
                    # No prefix matched: keep the text as a raw tag.
                    if child_text not in IGNORE_POS_LINE_TEXT:
                        entry.raw_tags.append(child_text)
        elif isinstance(child, str):
            for text in child.strip(", ").split():
                text = text.strip(", ")
                if text in POS_DATA:
                    update_pos_data(entry, text, POS_DATA[text])
                    has_pos = True
                elif text not in IGNORE_POS_LINE_TEXT:
                    entry.raw_tags.append(text)
    translate_raw_tags(entry)
    if not has_pos:
        # The italic line was not a POS line; discard the speculative entry.
        page_data.pop()
def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Copy one POS table record onto ``word_entry``.

    Sets ``pos`` from ``pos_data["pos"]``, extends ``tags`` with the optional
    ``pos_data["tags"]`` list and stores the matched text in ``pos_text``.
    """
    word_entry.pos = pos_data["pos"]
    extra_tags = pos_data.get("tags", [])
    word_entry.tags.extend(extra_tags)
    word_entry.pos_text = pos_text
def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Build a ``Sense`` from one gloss list item and add it to the entry.

    Templates other than "wikipedia" are expanded; an expansion that ends
    with "." and comes from a parameterless template is kept as a raw tag,
    any other expansion contributes its nodes to the gloss. A leading
    "(N.N)" marker becomes the sense index. For entries tagged "form-of"
    with no form-of target yet, the text of the last link node in the gloss
    is used as the target. The sense is appended only when the gloss text is
    non-empty.
    """
    sense = Sense()
    gloss_nodes = []
    raw_tags = []
    for child in list_item_node.children:
        if not isinstance(child, TemplateNode):
            gloss_nodes.append(child)
            continue
        if child.template_name == "wikipedia":
            continue
        process_form_of_template(wxr, sense, child)
        wikitext = wxr.wtp.node_to_wikitext(child)
        expanded = wxr.wtp.parse(wikitext, expand_all=True)
        expanded_text = clean_node(wxr, sense, expanded.children)
        is_abbreviation = (
            expanded_text.endswith(".") and not child.template_parameters
        )
        if is_abbreviation:
            # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
            raw_tags.append(expanded_text)
        else:
            gloss_nodes.extend(expanded.children)

    gloss_text = clean_node(wxr, sense, gloss_nodes)
    sense_index = ""
    index_match = re.match(r"\(\d+\.\d+\)", gloss_text)
    if index_match is not None:
        sense_index = index_match.group(0).strip("()")
        gloss_text = gloss_text[index_match.end() :].strip("=; ")

    if "form-of" in word_entry.tags and not sense.form_of:
        link_texts = [
            clean_node(wxr, None, node)
            for node in gloss_nodes
            if isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ]
        # The last link in the gloss wins.
        form_of = link_texts[-1] if link_texts else ""
        if form_of:
            sense.form_of.append(AltForm(word=form_of))

    if not gloss_text:
        return
    sense.raw_tags = raw_tags
    sense.sense_index = sense_index
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)
def process_form_of_template(
    wxr: WiktextractContext, sense: Sense, template_node: TemplateNode
) -> None:
    """Record the form-of target of a "zob-ekwiw-pupr" template on ``sense``.

    Any other template is ignored. The "form-of" tag is added once, and the
    cleaned first positional parameter is appended as an ``AltForm``.
    """
    if template_node.template_name != "zob-ekwiw-pupr":
        return
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    target = template_node.template_parameters.get(1, "")
    sense.form_of.append(AltForm(word=clean_node(wxr, None, target)))