Coverage for src/wiktextract/extractor/pl/pos.py: 79%
125 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import AltForm, Attestation, Sense, WordEntry
8from .section_titles import POS_DATA
9from .tags import TAGS, translate_raw_tags
# Proverb header templates:
# https://pl.wiktionary.org/wiki/Kategoria:Szablony_przysłów
# Maps the leading text of an expanded POS-line template to the POS data
# that is applied when the text starts with that prefix.
POS_PREFIXES = {
    "przysłowie": {"pos": "proverb"},
    "sentencja": {"pos": "phrase"},
}

# POS-line words that are neither a part of speech nor worth keeping as a
# raw tag; they are dropped silently.
IGNORE_POS_LINE_TEXT = frozenset({"rodzaj", "lub"})
def extract_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Walk the italic and list children of a section and collect entries.

    Italic nodes are handed to ``process_pos_line_italic_node`` and list
    nodes are read as gloss lists whose items are added to the most recent
    entry in ``page_data``.

    Args:
        wxr: Extraction context passed through to the helpers.
        page_data: Entries collected so far; new entries are appended here.
        base_data: Template entry that is deep-copied for every new entry.
        level_node: Section node whose children are scanned.
    """
    found_pos = False
    previous_was_list = True
    for child in level_node.find_child(NodeKind.ITALIC | NodeKind.LIST):
        if child.kind == NodeKind.ITALIC:
            # Only an italic node that directly follows a list (or is the
            # first node of the section) is processed; further italic
            # nodes before the next list are skipped.
            if previous_was_list and process_pos_line_italic_node(
                wxr, page_data, base_data, child
            ):
                found_pos = True
            previous_was_list = False
            continue
        if child.kind != NodeKind.LIST:
            continue
        if not found_pos:
            # No POS line has been recognised so far: the glosses go into
            # a fresh copy of the base entry.
            page_data.append(base_data.model_copy(deep=True))
        for item in child.find_child(NodeKind.LIST_ITEM):
            process_gloss_list_item(wxr, page_data[-1], item)
        previous_was_list = True
def process_pos_line_italic_node(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    italic_node: WikiNode,
) -> bool:
    """Create a new entry from an italic part-of-speech line.

    A deep copy of ``base_data`` is appended to ``page_data`` and filled from
    the children of ``italic_node``: template children are expanded with
    ``clean_node`` and string children are split on commas and whitespace.
    Text found in ``POS_DATA`` (or starting with a ``POS_PREFIXES`` key) sets
    the part of speech; other text is stored as a raw tag unless it is listed
    in ``IGNORE_POS_LINE_TEXT``.

    Args:
        wxr: Extraction context used for template expansion.
        page_data: Entries collected so far; the new entry is appended here.
        base_data: Template entry that is deep-copied for the new entry.
        italic_node: The italic node holding the POS line.

    Returns:
        ``True`` if a part of speech was recognised. Otherwise the freshly
        appended entry is removed again and ``False`` is returned.
    """
    has_pos = False
    page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    for child in italic_node.children:
        if isinstance(child, TemplateNode):
            child_text = clean_node(wxr, entry, child)
            if child.template_name.startswith("forma "):
                # inflection form header templates
                # https://pl.wiktionary.org/wiki/Kategoria:Szablony_nagłówków_form_fleksyjnych
                pos_text = child_text.split(", ")[0]
                if pos_text in POS_DATA:
                    update_pos_data(entry, pos_text, POS_DATA[pos_text])
                    has_pos = True
                # The entry may already carry the tag, either copied from
                # base_data or added from the POS data just applied.
                if "form-of" not in entry.tags:
                    entry.tags.append("form-of")
            elif child_text in POS_DATA:
                update_pos_data(entry, child_text, POS_DATA[child_text])
                has_pos = True
            else:
                for prefix, pos_data in POS_PREFIXES.items():
                    if child_text.startswith(prefix):
                        update_pos_data(entry, child_text, pos_data)
                        has_pos = True
                        break
                # has_pos covers every child seen so far, so the fallbacks
                # below only run while no POS has been found at all.
                if not has_pos:
                    for word in child_text.split():
                        if word in POS_DATA:
                            update_pos_data(entry, word, POS_DATA[word])
                            has_pos = True
                            break
                    if (
                        not has_pos
                        and child_text not in IGNORE_POS_LINE_TEXT
                    ):
                        entry.raw_tags.append(child_text)
        elif isinstance(child, str):
            stripped = child.strip()
            if stripped in POS_DATA:
                update_pos_data(entry, stripped, POS_DATA[stripped])
                has_pos = True
                continue
            for part in child.strip(", ").split(","):
                part = part.strip()
                if part in POS_DATA:
                    update_pos_data(entry, part, POS_DATA[part])
                    has_pos = True
                elif part in TAGS:
                    entry.raw_tags.append(part)
                else:
                    # Not known as a whole: try word by word.
                    for word in part.split():
                        if word in POS_DATA:
                            update_pos_data(entry, word, POS_DATA[word])
                            has_pos = True
                        elif word not in IGNORE_POS_LINE_TEXT:
                            entry.raw_tags.append(word)
    translate_raw_tags(entry)
    if not has_pos:
        # Nothing on this line named a part of speech: discard the entry.
        page_data.pop()
    return has_pos
def update_pos_data(
    word_entry: WordEntry, pos_text: str, pos_data: dict
) -> None:
    """Apply part-of-speech data to an entry.

    Args:
        word_entry: Entry to update in place.
        pos_text: POS text as found on the page; stored as ``pos_title``.
        pos_data: Mapping with a mandatory ``"pos"`` key and an optional
            ``"tags"`` list.

    Raises:
        KeyError: If ``pos_data`` has no ``"pos"`` key.
    """
    word_entry.pos = pos_data["pos"]
    # This can run more than once for the same entry, and the entry may
    # already hold tags, so only add the tags that are still missing.
    for tag in pos_data.get("tags", []):
        if tag not in word_entry.tags:
            word_entry.tags.append(tag)
    word_entry.pos_title = pos_text
def process_gloss_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Turn one gloss list item into a ``Sense`` of ``word_entry``.

    Templates inside the item are expanded. An expansion that ends with a
    full stop and comes from a parameterless template is kept as a raw tag;
    everything else becomes part of the gloss text. A leading ``(n.m)``
    marker is split off as the sense index. The sense is only added when the
    remaining gloss text is not empty.

    Args:
        wxr: Extraction context used for parsing and cleaning.
        word_entry: Entry that receives the new sense.
        list_item_node: The list item node to read.
    """
    sense = Sense()
    gloss_parts = []
    abbreviations = []
    for child in list_item_node.children:
        if not isinstance(child, TemplateNode):
            gloss_parts.append(child)
            continue
        if child.template_name == "datadef":
            extract_datedef_template(wxr, sense, child)
            continue
        if child.template_name == "wikipedia":
            continue
        process_form_of_template(wxr, sense, child)
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(child), expand_all=True
        )
        expanded_text = clean_node(wxr, sense, expanded.children)
        if expanded_text.endswith(".") and not child.template_parameters:
            # https://pl.wiktionary.org/wiki/Pomoc:Skróty_używane_w_Wikisłowniku
            abbreviations.append(expanded_text)
        else:
            gloss_parts.extend(expanded.children)

    gloss_text = clean_node(wxr, sense, gloss_parts)
    index_match = re.match(r"\(\d+\.\d+\)", gloss_text)
    if index_match is None:
        sense_index = ""
    else:
        sense_index = index_match.group(0).strip("()")
        gloss_text = gloss_text[index_match.end() :].strip("=; ")

    if "form-of" in word_entry.tags and not sense.form_of:
        # No template supplied the base form: fall back to the text of the
        # last link in the gloss.
        linked_word = ""
        for part in gloss_parts:
            if isinstance(part, WikiNode) and part.kind == NodeKind.LINK:
                linked_word = clean_node(wxr, None, part)
        if linked_word:
            sense.form_of.append(AltForm(word=linked_word))

    if not gloss_text:
        return
    sense.raw_tags = abbreviations
    sense.sense_index = sense_index
    sense.glosses.append(gloss_text)
    translate_raw_tags(sense)
    word_entry.senses.append(sense)
def process_form_of_template(
    wxr: WiktextractContext, sense: Sense, template_node: TemplateNode
) -> None:
    """Record the target of a ``zob-ekwiw-pupr`` template on ``sense``.

    Other templates are ignored. For a matching template the sense gets the
    ``form-of`` tag (once) and an ``AltForm`` built from the cleaned first
    template parameter.

    Args:
        wxr: Extraction context used to clean the parameter.
        sense: Sense that is updated in place.
        template_node: Template to inspect.
    """
    if template_node.template_name != "zob-ekwiw-pupr":
        return
    if "form-of" not in sense.tags:
        sense.tags.append("form-of")
    target = template_node.template_parameters.get(1, "")
    sense.form_of.append(AltForm(word=clean_node(wxr, None, target)))
def extract_datedef_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
) -> None:
    """Add the date produced by a ``datadef`` template to ``sense``.

    The template is expanded with ``clean_node`` and surrounding square
    brackets and spaces are stripped. An ``Attestation`` is only added when
    something is left.

    Args:
        wxr: Extraction context used to expand the template.
        sense: Sense that receives the attestation.
        t_node: The ``datadef`` template node.
    """
    date = clean_node(wxr, None, t_node).strip("[] ")
    if date:
        sense.attestations.append(Attestation(date=date))