Coverage for src/wiktextract/extractor/zh/linkage.py: 91%
227 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..ruby import extract_ruby
15from .models import Form, Linkage, WordEntry
16from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract a linkage section (synonyms, antonyms, related terms, …).

    Collected linkage items are appended to the field named by
    *linkage_type* on the last entry of *page_data*.  The special type
    "alt_forms" is instead converted to ``Form`` objects tagged
    "alternative" and stored in the entry's ``forms`` list.
    """
    sense = ""
    linkage_list = []
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if node.kind == NodeKind.LIST:
            for item_node in node.find_child(NodeKind.LIST_ITEM):
                # a list item may contain a sense template that also
                # applies to the items that follow it
                sense, new_linkage_list = process_linkage_list_item(
                    wxr, item_node, sense
                )
                linkage_list.extend(new_linkage_list)
        elif isinstance(node, TemplateNode):
            if node.template_name in ["s", "sense"]:
                # sense set here applies to subsequent templates/lists
                sense = clean_node(wxr, None, node).strip("(): ")
            elif node.template_name == "zh-dial":
                linkage_list.extend(extract_zh_dial_template(wxr, node, sense))
            elif re.fullmatch(
                r"(?:col|der|rel)\d", node.template_name, re.I
            ) or node.template_name.endswith("-saurus"):
                # column templates: col3, der2, rel4, …-saurus, etc.
                linkage_list.extend(
                    process_linkage_col_template(wxr, node, sense)
                )
            elif node.template_name == "ja-r/multi":
                linkage_list.extend(
                    extract_ja_r_multi_template(wxr, node, sense)
                )

    if linkage_type == "alt_forms":
        # alternative forms are not a Linkage field; re-wrap as Form
        forms = [
            Form(
                form=l_data.word,
                sense=l_data.sense,
                tags=l_data.tags + ["alternative"],
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                ruby=l_data.ruby,
            )
            for l_data in linkage_list
        ]
        page_data[-1].forms.extend(forms)
    else:
        getattr(page_data[-1], linkage_type).extend(linkage_list)
        # also copy the data to earlier entries that belong to the same
        # language/etymology/pronunciation group at the same POS level
        for data in page_data[:-1]:
            if (
                data.lang_code == page_data[-1].lang_code
                and data.sounds == page_data[-1].sounds
                and data.etymology_text == page_data[-1].etymology_text
                and data.pos_level == page_data[-1].pos_level == level_node.kind
            ):
                getattr(data, linkage_type).extend(linkage_list)
def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Process one list item of a linkage section.

    Returns the (possibly updated) sense string — so the caller can carry
    it over to later list items — plus the linkage data extracted from
    this item and any nested child lists.
    """
    raw_tags = []
    linkage_list = []
    for item_child in list_item.children:
        if isinstance(item_child, TemplateNode):
            if item_child.template_name in ["s", "sense"]:
                sense = clean_node(wxr, None, item_child).strip("(): ")
            elif item_child.template_name in ["qualifier", "qual"]:
                # qualifier tags apply to the next term template/link
                raw_tags.append(clean_node(wxr, None, item_child).strip("()"))
            elif item_child.template_name == "zh-l":
                # NOTE(review): raw_tags is cleared right after being
                # passed in — presumably the Linkage model copies the
                # list on construction; confirm against the model class
                linkage_list.extend(
                    process_zh_l_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
            elif item_child.template_name == "ja-r":
                linkage_list.append(
                    process_ja_r_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
            elif item_child.template_name in ["l", "link", "alter"]:
                linkage_list.extend(
                    process_l_template(wxr, item_child, sense, raw_tags)
                )
                raw_tags.clear()
        elif (
            isinstance(item_child, WikiNode)
            and item_child.kind == NodeKind.LINK
        ):
            # plain wiki link used directly as a linkage term
            word = clean_node(wxr, None, item_child)
            if len(word) > 0:
                linkage_data = Linkage(
                    word=word, sense=sense, raw_tags=raw_tags
                )
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
                raw_tags.clear()
        elif (
            isinstance(item_child, WikiNode)
            and item_child.kind == NodeKind.LIST
        ):
            # recurse into nested sub-lists; their sense updates do not
            # propagate back to this level (return value discarded)
            for child_list_item in item_child.find_child(NodeKind.LIST_ITEM):
                _, new_list = process_linkage_list_item(
                    wxr, child_list_item, sense
                )
                linkage_list.extend(new_list)

    return sense, linkage_list
def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialectal synonyms from an expanded "zh-dial" table.

    The expanded template is a table whose header cells name a language
    group, whose cells contain region links, dialect words (spans with
    ``lang="zh"``), and small-font notes attached to the preceding word.
    Words are aggregated across rows: ``dial_data`` maps word → set of
    language/region tags, ``tag_data`` maps word → set of note tags.
    """
    linkage_list = []
    dial_data = defaultdict(set)  # word -> {lang tag, region tag, ...}
    tag_data = defaultdict(set)  # word -> {small-font note tags}
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        lang_tag = ""
        region_tag = ""
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tag = clean_node(wxr, None, header_node)
            if lang_tag == "註解":  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tag = clean_node(wxr, None, link_node)
                word = ""
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (the page title itself is skipped)
                        word = span_text
                        if lang_tag != "":
                            dial_data[span_text].add(lang_tag)
                        if region_tag != "":
                            dial_data[span_text].add(region_tag)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and word != ""
                    ):
                        # small-font note qualifies the preceding word
                        tag_data[word].add(span_text)

    for term, lang_tags in dial_data.items():
        linkage_data = Linkage(word=term, sense=sense, raw_tags=list(lang_tags))
        linkage_data.raw_tags.extend(list(tag_data.get(term, {})))
        translate_raw_tags(linkage_data)
        linkage_list.append(linkage_data)
    return linkage_list
def process_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkage data from a "zh-l" template.

    https://zh.wiktionary.org/wiki/Template:Zh-l

    The expansion contains one span per written form (traditional and/or
    simplified, marked by the ``lang`` attribute) and optionally a
    Latin-script romanization span; one Linkage is produced per form.
    """
    # Fix: a mutable default argument ([]) is shared across calls; use a
    # None sentinel and create a fresh list instead.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified Chinese")
        # skip empty words and the "/" separator between the two forms
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list
def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Extract one Japanese linkage entry from a "ja-r" template.

    https://zh.wiktionary.org/wiki/Template:Ja-r
    """
    # Fix: replace the shared mutable default argument with a None
    # sentinel plus a per-call fresh list.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    return process_expanded_ja_r_node(wxr, expanded_node, sense, raw_tags)
def process_expanded_ja_r_node(
    wxr: WiktextractContext,
    expanded_node: WikiNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Build a Linkage from an already-expanded "ja-r" style node.

    The expansion contains spans for the word (with a ``lang`` attribute,
    possibly carrying ruby annotations), the romanization (class "tr"),
    and an optional gloss (class "mention-gloss").
    """
    # Fix: mutable default argument replaced with None sentinel.
    if raw_tags is None:
        raw_tags = []
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            # separate ruby (furigana) from the base word text
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            # gloss inside the template overrides the passed-in sense
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data
def process_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkage data from "l"/"link"/"alter" templates.

    https://zh.wiktionary.org/wiki/Template:l

    Words are spans whose ``lang`` matches the template's first argument;
    a following "-Latn" span supplies the romanization and a
    "mention-gloss" span overrides the sense of the preceding word.
    """
    # Fixes: the return annotation was "-> None" although the function
    # returns a list, and the mutable default argument is replaced with
    # a None sentinel.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    linkage_list = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            linkage_data = Linkage(
                sense=sense,
                raw_tags=raw_tags,
                word=clean_node(wxr, None, span_tag),
            )
            if len(linkage_data.word) > 0:
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
        elif span_lang.endswith("-Latn") and len(linkage_list) > 0:
            # romanization applies to the most recent word
            linkage_list[-1].roman = clean_node(wxr, None, span_tag)
        elif "mention-gloss" == span_class and len(linkage_list) > 0:
            linkage_list[-1].sense = clean_node(wxr, None, span_tag)

    return linkage_list
def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkage words from column templates such as "col3".

    https://zh.wiktionary.org/wiki/Template:Col3

    Each ``<li>`` of the expanded template is one item; it may hold
    several word spans (e.g. traditional and simplified forms), one
    romanization span and qualifier spans that tag every word of the item.
    """
    results = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_words = []
        item_roman = ""
        item_raw_tags = []
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn"):
                item_roman = clean_node(wxr, None, span_tag)
                continue
            if "qualifier-content" in span_class:
                # qualifier text may list several tags joined by 或/、
                qualifier_text = clean_node(wxr, None, span_tag)
                item_raw_tags.extend(
                    part.strip()
                    for part in re.split(r"或|、", qualifier_text)
                    if part.strip() != ""
                )
            elif span_lang != "":
                word = clean_node(wxr, None, span_tag)
                if word == "":
                    continue
                l_data = Linkage(word=word, sense=sense)
                if span_class == "Hant":
                    l_data.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    l_data.tags.append("Simplified Chinese")
                item_words.append(l_data)
        # qualifiers and romanization apply to every word in the item
        for l_data in item_words:
            l_data.raw_tags.extend(item_raw_tags)
            l_data.roman = item_roman
            translate_raw_tags(l_data)
        results.extend(item_words)

    return results
def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract linkage data from inline gloss templates like "synonyms".

    https://en.wiktionary.org/wiki/Template:synonyms

    Results are appended directly to *word_entry*'s field named by
    *linkage_type*.  Words accumulate in ``l_list`` until a "、"
    separator flushes them, so romanization/gloss spans that follow a
    word apply to all words collected since the last separator.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "")
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    if span_class == "Hant":
                        l_data.tags.append("Traditional Chinese")
                    elif span_class == "Hans":
                        l_data.tags.append("Simplified Chinese")
                    if l_data.word != "":
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # romanization applies to all pending words
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif span_class == "mention-gloss":
                    # in-template gloss overrides the passed-in sense
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    # NOTE(review): raw_tags is never cleared, so
                    # qualifiers accumulate across all later words —
                    # confirm this is intended
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == "、":
                # separator: flush the pending words to the entry
                getattr(word_entry, linkage_type).extend(l_list)
                l_list.clear()

    getattr(word_entry, linkage_type).extend(l_list)
    for data in getattr(word_entry, linkage_type):
        translate_raw_tags(data)
def extract_ja_r_multi_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkages from a "ja-r/multi" template.

    The template expands to a wiki list with one "ja-r"-style entry per
    list item; each item becomes one Linkage.
    """
    # Fix: the return annotation was "-> Linkage" although the function
    # returns a list of Linkage objects.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkage_list.append(
                process_expanded_ja_r_node(wxr, list_item, sense, [])
            )

    return linkage_list