Coverage for src/wiktextract/extractor/pl/linkage.py: 91%
104 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Linkage, WordEntry
9from .tags import TAGS, translate_raw_tags
# Maps a Polish Wiktionary linkage section title to the name of the
# `WordEntry` list attribute that receives the extracted items.
LINKAGE_TYPES: dict[str, str] = {
    # semantic relations
    "antonimy": "antonyms",
    "hiperonimy": "hypernyms",
    "hiponimy": "hyponyms",
    "holonimy": "holonyms",
    # collocations are stored together with related words
    "kolokacje": "related",
    "meronimy": "meronyms",
    "synonimy": "synonyms",
    "wyrazy pokrewne": "related",
    # phrases and compounds
    "związki frazeologiczne": "proverbs",
    "złożenia": "derived",
}
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    lang_code: str,
) -> None:
    """Collect linkage items under `level_node` and attach them to entries.

    Items are grouped by their sense index string ("" when none was found)
    and appended to the `linkage_type` list attribute of every entry in
    `page_data` whose `lang_code` equals `lang_code`:

    * a group is added to each entry for which `match_sense_index` accepts
      its sense index;
    * the "" group is added to every such entry;
    * groups that matched no entry are added to the first such entry only.
    """
    from .page import match_sense_index

    linkages = defaultdict(list)
    found_list_item = False
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, item_node, linkages)
        found_list_item = True

    if not found_list_item:
        # get around "preformatted" node: without list items, pick up the
        # links and templates directly
        wanted_kinds = NodeKind.LINK | NodeKind.TEMPLATE
        for child in level_node.find_child_recursively(wanted_kinds):
            if child.kind == NodeKind.LINK:
                link_text = clean_node(wxr, None, child)
                if link_text:
                    linkages[""].append(Linkage(word=link_text))
                continue
            if isinstance(child, TemplateNode):
                # throwaway node/tag lists: only linkages added directly by
                # the template handler are kept
                process_linkage_template(
                    wxr, child, linkages, "", False, [], [], []
                )

    same_lang_entries = [
        entry for entry in page_data if entry.lang_code == lang_code
    ]

    matched_indexes = set()
    for entry in same_lang_entries:
        for index, items in linkages.items():
            if match_sense_index(index, entry):
                getattr(entry, linkage_type).extend(items)
                matched_indexes.add(index)
        getattr(entry, linkage_type).extend(linkages.get("", []))

    # add not matched data to the first entry of this language
    linkages.pop("", None)
    if same_lang_entries:
        first_entry = same_lang_entries[0]
        for index, items in linkages.items():
            if index not in matched_indexes:
                getattr(first_entry, linkage_type).extend(items)
def _append_linkage(
    wxr: WiktextractContext,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    word_nodes: list,
    translation_nodes: list,
    raw_tags: list[str],
) -> None:
    # Build one Linkage from the collected nodes and store it under
    # `sense_index`; nothing is stored when the cleaned word is empty.
    if not word_nodes:
        return
    word = clean_node(wxr, None, word_nodes)
    if len(word) == 0:
        return
    linkage = Linkage(
        word=word,
        translation=clean_node(wxr, None, translation_nodes),
        raw_tags=raw_tags,
        sense_index=sense_index,
    )
    translate_raw_tags(linkage)
    linkages[sense_index].append(linkage)


def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    linkages: dict[str, list[Linkage]],
) -> None:
    """Split one list item into linkage entries and add them to `linkages`.

    The children of `list_item` are scanned in order:

    * a parenthesised run of digits, whitespace, ",", "." and "-" inside a
      string child (e.g. "(1.1)") becomes the current sense index;
    * "→" in a string child switches collection from word nodes to
      translation nodes;
    * ";", "•", "," or "/" in a string child ends the current entry;
    * template children are delegated to `process_linkage_template`;
    * nested lists are skipped.

    Each finished entry is appended to `linkages[sense_index]` if its
    cleaned word text is not empty.
    """
    raw_tags = []
    sense_index = ""
    word_nodes = []
    translation_nodes = []
    is_translation = False
    for node in list_item.children:
        if isinstance(node, str):
            # "-" is placed last in the class so it is a literal rather
            # than a range; the accepted characters are unchanged.
            m = re.search(r"\([\d\s,.-]+\)", node)
            if m is not None:
                sense_index = m.group(0).strip("()")
                node = node[m.end() :]

            if "→" in node:
                is_translation = True
                before_arrow, _, after_arrow = node.partition("→")
                word_nodes.append(before_arrow)
                translation_nodes.append(after_arrow)
                continue

            # NOTE(review): only the first separator found, in this
            # priority order, is handled per string child; any further
            # separators in the same string stay inside the next word.
            for sep in (";", "•", ",", "/"):
                if sep not in node:
                    continue
                before_sep, _, after_sep = node.partition(sep)
                if is_translation:
                    translation_nodes.append(before_sep)
                else:
                    word_nodes.append(before_sep)
                _append_linkage(
                    wxr,
                    linkages,
                    sense_index,
                    word_nodes,
                    translation_nodes,
                    raw_tags,
                )
                # start collecting the next entry
                word_nodes.clear()
                translation_nodes.clear()
                is_translation = False
                raw_tags.clear()
                word_nodes.append(after_sep)
                break
            else:
                # no separator in this string: it belongs to the current
                # entry as a whole
                if is_translation:
                    translation_nodes.append(node)
                else:
                    word_nodes.append(node)
        elif isinstance(node, TemplateNode):
            process_linkage_template(
                wxr,
                node,
                linkages,
                sense_index,
                is_translation,
                word_nodes,
                translation_nodes,
                raw_tags,
            )
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Checked before the translation branch so that a nested list
            # following "→" is not rendered into the parent's translation.
            # Presumably its items are reached separately by the caller's
            # recursive LIST_ITEM walk - confirm.
            continue
        elif is_translation:
            translation_nodes.append(node)
        else:
            word_nodes.append(node)

    # flush whatever is left after the last separator
    _append_linkage(
        wxr, linkages, sense_index, word_nodes, translation_nodes, raw_tags
    )
def process_linkage_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    linkages: dict[str, list[Linkage]],
    sense_index: str,
    is_translation: bool,
    word_nodes: list[WikiNode],
    tr_nodes: list[WikiNode],
    raw_tags: list[str],
) -> None:
    """Handle a template found while collecting linkage items.

    A "furi" template whose cleaned text contains "(" is stored directly in
    `linkages[sense_index]`: the text before the last "(" is the word and
    the rest, stripped of parentheses and spaces, is the furigana.

    For any other template the cleaned text is appended to `raw_tags` when
    it ends with "." or is a key of `TAGS`; otherwise it is appended to
    `tr_nodes` (when `is_translation` is true) or to `word_nodes`.
    """
    text = clean_node(wxr, None, template_node)

    if template_node.template_name != "furi":
        looks_like_tag = text.endswith(".") or text in TAGS
        if looks_like_tag:
            raw_tags.append(text)
        else:
            target_nodes = tr_nodes if is_translation else word_nodes
            target_nodes.append(text)
        return

    if "(" not in text:
        return

    paren_pos = text.rindex("(")
    linkages[sense_index].append(
        Linkage(
            word=text[:paren_pos],
            furigana=text[paren_pos:].strip("() "),
            sense_index=sense_index,
        )
    )