Coverage for src/wiktextract/extractor/pl/linkage.py: 91%
104 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Linkage, WordEntry
9from .tags import TAGS, translate_raw_tags
# Maps a Polish linkage label to the English linkage name used for it.
# NOTE(review): the keys look like Polish Wiktionary section titles and the
# values like `WordEntry` attribute names (see the `getattr(data, linkage_type)`
# calls in `extract_linkage_section`) - confirm against the caller.
# Several Polish labels deliberately share one English name.
LINKAGE_TYPES: dict[str, str] = {
    "antonimy": "antonyms",
    "hiperonimy": "hypernyms",
    "hiponimy": "hyponyms",
    "holonimy": "holonyms",
    # "kolokacje" = collocations; stored together with related words.
    "kolokacje": "related",
    "meronimy": "meronyms",
    "synonimy": "synonyms",
    # "wyrazy pochodne" = derived words.
    "wyrazy pochodne": "derived",
    # "wyrazy pokrewne" = related words.
    "wyrazy pokrewne": "related",
    # "związki frazeologiczne" = phraseological units; stored as proverbs.
    "związki frazeologiczne": "proverbs",
    # "złożenia" = compounds; stored together with derived words.
    "złożenia": "derived",
}
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    lang_code: str,
) -> None:
    """Collect the linkages of one section and attach them to `page_data`.

    Linkages are gathered per sense index from every list item found under
    `level_node`. When the section has no list item at all, links and
    templates found under `level_node` are used instead and stored under the
    empty sense index.

    Only entries whose `lang_code` equals the given `lang_code` receive
    data, always through the attribute named by `linkage_type`:

    * a group is added to each entry for which `match_sense_index` accepts
      the group's sense index;
    * the group with the empty sense index is added to each of those entries;
    * groups whose sense index matched no entry are added to the first entry
      of that language.
    """
    from .page import match_sense_index

    collected = defaultdict(list)
    found_list_item = False
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        found_list_item = True
        process_linkage_list_item(wxr, item_node, collected)

    if not found_list_item:
        # get around "preformatted" node
        fallback_kinds = NodeKind.LINK | NodeKind.TEMPLATE
        for child in level_node.find_child_recursively(fallback_kinds):
            if child.kind == NodeKind.LINK:
                link_text = clean_node(wxr, None, child)
                if link_text != "":
                    collected[""].append(Linkage(word=link_text))
                continue
            if isinstance(child, TemplateNode):
                process_linkage_template(
                    wxr, child, collected, "", False, [], [], []
                )

    same_lang_entries = [
        entry for entry in page_data if entry.lang_code == lang_code
    ]

    matched = set()
    for entry in same_lang_entries:
        for index, items in collected.items():
            if match_sense_index(index, entry):
                getattr(entry, linkage_type).extend(items)
                matched.add(index)
        getattr(entry, linkage_type).extend(collected.get("", []))

    # add not matched data
    collected.pop("", None)
    if same_lang_entries:
        first_entry = same_lang_entries[0]
        for index, items in collected.items():
            if index in matched:
                continue
            getattr(first_entry, linkage_type).extend(items)
def _append_linkage(
    wxr: WiktextractContext,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    word_nodes: list,
    translation_nodes: list,
    raw_tags: list[str],
) -> None:
    """Build one `Linkage` from the collected nodes and store it.

    The linkage is appended to `linkages[sense_index]`. Nothing is stored
    when the cleaned word text is empty.
    """
    word = clean_node(wxr, None, word_nodes)
    if len(word) == 0:
        return
    linkage = Linkage(
        word=word,
        translation=clean_node(wxr, None, translation_nodes),
        # Defensive copy: the caller clears and reuses this list for the
        # next linkage of the same list item.
        raw_tags=list(raw_tags),
        sense_index=sense_index,
    )
    translate_raw_tags(linkage)
    linkages[sense_index].append(linkage)


def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    linkages: dict[str, list[Linkage]],
) -> None:
    """Parse one list item into `Linkage` objects grouped by sense index.

    The children of `list_item` are scanned in order:

    * a parenthesised run of digits, whitespace, ",", "." and "-" inside a
      string child sets the sense index used from that point on;
    * "→" inside a string child switches from collecting the word to
      collecting its translation;
    * the first of ";", "•", "," or "/" found in a string child (tried in
      that order) ends the current linkage and starts the next one;
    * templates are delegated to `process_linkage_template`;
    * nested lists are skipped.

    Results are appended to `linkages[sense_index]`; linkages whose cleaned
    word is empty are dropped.
    """
    raw_tags = []
    sense_index = ""
    word_nodes = []
    translation_nodes = []
    is_translation = False
    for node in list_item.children:
        if isinstance(node, str):
            # "-" is placed last so it is a literal hyphen rather than part
            # of a character range; the accepted characters are unchanged.
            m = re.search(r"\([\d\s,.-]+\)", node)
            if m is not None:
                sense_index = m.group(0).strip("()")
                node = node[m.end() :]

            if "→" in node:
                is_translation = True
                tr_start = node.index("→")
                word_nodes.append(node[:tr_start])
                translation_nodes.append(node[tr_start + 1 :])
                continue

            # Only the highest-priority separator present is used, and only
            # its first occurrence splits the text.
            sep = next((s for s in (";", "•", ",", "/") if s in node), None)
            if sep is None:
                if is_translation:
                    translation_nodes.append(node)
                else:
                    word_nodes.append(node)
                continue

            sep_index = node.index(sep)
            if is_translation:
                translation_nodes.append(node[:sep_index])
            else:
                word_nodes.append(node[:sep_index])
            _append_linkage(
                wxr,
                linkages,
                sense_index,
                word_nodes,
                translation_nodes,
                raw_tags,
            )

            # Reset the per-linkage state; the sense index carries over.
            word_nodes.clear()
            translation_nodes.clear()
            is_translation = False
            raw_tags.clear()
            word_nodes.append(node[sep_index + 1 :])
        elif isinstance(node, TemplateNode):
            process_linkage_template(
                wxr,
                node,
                linkages,
                sense_index,
                is_translation,
                word_nodes,
                translation_nodes,
                raw_tags,
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Checked before the translation branch so that a nested list is
            # never folded into the translation text either; the caller
            # looks up list items recursively, so nested items are handled
            # on their own.
            continue
        elif is_translation:
            translation_nodes.append(node)
        else:
            word_nodes.append(node)

    if len(word_nodes) > 0:
        _append_linkage(
            wxr,
            linkages,
            sense_index,
            word_nodes,
            translation_nodes,
            raw_tags,
        )
def process_linkage_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    is_translation: bool,
    word_nodes: list[WikiNode],
    tr_nodes: list[WikiNode],
    raw_tags: list[str],
) -> None:
    """Handle one template found inside a linkage list item or section.

    A "furi" template is expanded and split at its last "(": the text before
    it becomes the word and the text after it the furigana of a new
    `Linkage`, which is appended to `linkages[sense_index]`. If the expanded
    text has no "(", nothing is added.

    Any other template is expanded to text. Text that ends with "." or is a
    key of `TAGS` is appended to `raw_tags`; otherwise it is appended to
    `tr_nodes` when `is_translation` is true and to `word_nodes` when not.
    """
    if template_node.template_name != "furi":
        text = clean_node(wxr, None, template_node)
        if text.endswith(".") or text in TAGS:
            target = raw_tags
        elif is_translation:
            target = tr_nodes
        else:
            target = word_nodes
        target.append(text)
        return

    text = clean_node(wxr, None, template_node)
    if "(" not in text:
        return
    # Split at the last "(": left side is the word, right side the reading.
    base_word, _, reading = text.rpartition("(")
    linkages[sense_index].append(
        Linkage(
            word=base_word,
            furigana=reading.strip("() "),
            sense_index=sense_index,
        )
    )