Coverage for src/wiktextract/extractor/pl/form.py: 90%
96 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .tags import translate_raw_tags
# Section title -> list of tag strings.
# NOTE(review): presumably the value is what callers pass as the ``tags``
# argument of ``extract_form_section`` -- confirm against the call site.
FORM_SECTIONS: dict[str, list[str]] = {
    "zapis": [],
    "transliteracja": ["transliteration"],
    "transkrypcja": ["transcription"],
    # Every title below carries the same single tag; the comprehension
    # builds a separate list object per key.
    **{
        title: ["alternative"]
        for title in (
            "zapisy w ortografiach alternatywnych",
            "warianty",
            "kody",
            "kolejność",
            "kreski",
            "słowniki",
        )
    },
    "hanja": ["hanja"],
}
def extract_form_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
    tags: list[str],
) -> None:
    """Extract forms from a form section and attach them to word entries.

    Forms are gathered from three places, in this order:

    1. list items: plain text following a parenthesised sense index such
       as ``(1.1)``, and link nodes;
    2. the templates ``ptrad``/``pupr``, ``translit``, ``ortografie*`` and
       ``hep`` found anywhere below ``level_node``;
    3. if nothing was found, the cleaned text of the whole section.

    Args:
        wxr: Extraction context, passed through to ``clean_node`` and the
            template helpers.
        page_data: Entries created so far; every entry whose ``lang_code``
            equals ``base_data.lang_code`` receives the forms.
        base_data: Receives the forms when no entry in ``page_data`` has
            the same ``lang_code``.
        level_node: The section node to scan.
        tags: Tags given to every extracted form.
    """
    forms = []
    # get around "preformatted" node
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for node in list_item.children:
            if isinstance(node, str):
                # The hyphen is placed last so it is a literal character
                # rather than part of an accidental ",-." range; the set of
                # matched characters is unchanged.
                m = re.search(r"\([\d\s,.-]+\)", node)
                if m is None:
                    continue
                sense_index = m.group(0).strip("()")
                roman = node[m.end() :].strip()
                if roman != "":
                    forms.append(
                        Form(
                            form=roman,
                            sense_index=sense_index,
                            tags=tags,
                        )
                    )
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                form = clean_node(wxr, None, node)
                if form != "":
                    forms.append(Form(form=form, tags=tags))

    for t_node in level_node.find_child_recursively(NodeKind.TEMPLATE):
        if t_node.template_name in ["ptrad", "pupr"]:
            forms.extend(extract_ptrad_template(wxr, t_node, tags))
        elif t_node.template_name == "translit":
            roman = clean_node(wxr, None, t_node)
            if roman != "":
                forms.append(Form(form=roman, tags=tags))
        elif t_node.template_name.startswith("ortografie"):
            forms.extend(extract_ortografie_template(wxr, t_node, tags))
        elif t_node.template_name == "hep":
            forms.extend(extract_hep_template(wxr, t_node, tags))

    if len(forms) == 0:
        # Nothing structured was recognised: fall back to the section text.
        form = clean_node(wxr, None, level_node.children)
        if form != "":
            forms.append(Form(form=form, tags=tags))

    # Attach the forms to every entry of the same language.  If there is no
    # such entry (page_data is empty, or only holds other languages), keep
    # the forms on base_data instead of silently dropping them.
    attached = False
    for data in page_data:
        if data.lang_code == base_data.lang_code:
            data.forms.extend(forms)
            attached = True
    if not attached:
        base_data.forms.extend(forms)
def extract_ptrad_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Build forms from the ``<span>`` elements of an expanded template.

    A span whose ``class`` is exactly ``short-container`` sets the raw tag
    used for the forms that follow it.  A span whose ``lang`` is ``zh``
    yields a form, unless its text is empty or equals the page title.

    Args:
        wxr: Extraction context used for expansion and text cleaning.
        t_node: Template node to expand.
        tags: Tags given to every returned form.

    Returns:
        The forms found, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    results = []
    label = ""
    for span in expanded.find_html("span"):
        if span.attrs.get("class", "") == "short-container":
            label = clean_node(wxr, None, span)
        if span.attrs.get("lang", "") != "zh":
            continue
        text = clean_node(wxr, None, span)
        if text == "" or text == wxr.wtp.title:
            continue
        entry = Form(form=text, tags=tags)
        if label != "":
            entry.raw_tags.append(label)
            translate_raw_tags(entry)
        results.append(entry)
    return results
def extract_ortografie_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    tags: list[str],
) -> list[Form]:
    """Expand an ``ortografie*`` template and collect its forms.

    The direct children of the expanded root are scanned first, then each
    list item of every list found directly under the root.

    Args:
        wxr: Extraction context used for expansion.
        t_node: Template node to expand.
        tags: Tags given to every returned form.

    Returns:
        All forms found, root-level ones first.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    collected = list(extract_ortografie_list_item(wxr, root, tags))
    for list_node in root.find_child(NodeKind.LIST):
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            collected += extract_ortografie_list_item(wxr, item, tags)
    return collected
def extract_ortografie_list_item(
    wxr: WiktextractContext, list_item: WikiNode, tags: list[str]
) -> list[Form]:
    """Turn the direct children of ``list_item`` into forms.

    An italic child whose cleaned text ends with ``:`` sets the raw tag for
    the text that follows it.  Every non-blank string child becomes a form
    carrying the most recent raw tag, if there is one.

    Args:
        wxr: Extraction context used for text cleaning.
        list_item: Node whose direct children are scanned.
        tags: Tags given to every returned form.

    Returns:
        The forms found, in document order.
    """
    forms = []
    # Must be initialised before the loop: a text node can appear before any
    # italic label, and reading an unassigned local would raise
    # UnboundLocalError.
    raw_tag = ""
    for node in list_item.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str.endswith(":"):
                raw_tag = node_str.strip(": ")
        elif isinstance(node, str) and node.strip() != "":
            form = Form(form=node.strip(), tags=tags)
            if raw_tag != "":
                form.raw_tags.append(raw_tag)
                translate_raw_tags(form)
            forms.append(form)
    return forms
def extract_hep_template(
    wxr: WiktextractContext, t_node: TemplateNode, tags: list[str]
) -> list[Form]:
    """Expand a ``hep`` template and read forms from its top-level nodes.

    An italic node whose cleaned text ends with ``:`` sets the raw tag for
    the text that follows it.  Every non-blank string node becomes a form
    carrying the most recent raw tag, if there is one.

    Args:
        wxr: Extraction context used for expansion and text cleaning.
        t_node: Template node to expand.
        tags: Tags given to every returned form.

    Returns:
        The forms found, in document order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    results = []
    label = ""
    for child in expanded.children:
        if isinstance(child, str):
            text = child.strip()
            if text == "":
                continue
            entry = Form(form=text, tags=tags)
            if label != "":
                entry.raw_tags.append(label)
                translate_raw_tags(entry)
            results.append(entry)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            caption = clean_node(wxr, None, child)
            if caption.endswith(":"):
                label = caption.strip(":")
    return results