Coverage for src/wiktextract/extractor/zh/linkage.py: 91% (256 statements)
« prev ^ index » next — coverage.py v7.13.0, created at 2025-12-23 01:12 +0000
1import re
3from wikitextprocessor import (
4 HTMLNode,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ..ruby import extract_ruby
14from .models import Form, Linkage, WordEntry
15from .tags import translate_raw_tags
def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Extract a linkage section (synonyms, antonyms, alt forms, ...).

    Collected linkages are appended to the field named by ``linkage_type``
    on the last entry in ``page_data`` (or to ``forms`` for "alt_forms"),
    and mirrored onto sibling entries that share the same language,
    sounds, etymology and POS level.
    """
    sense = ""
    collected: list[Linkage] = []
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                # A sense template inside an item carries over to the
                # following items, so thread it through.
                sense, item_results = process_linkage_list_item(
                    wxr, list_item, sense
                )
                collected.extend(item_results)
        elif isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ("s", "sense"):
                sense = clean_node(wxr, None, child).strip("(): ")
            elif t_name == "zh-dial":
                collected.extend(extract_zh_dial_template(wxr, child, sense))
            elif t_name.endswith("-saurus") or re.fullmatch(
                r"(?:col|der|rel)\d", t_name, re.I
            ):
                collected.extend(
                    process_linkage_col_template(wxr, child, sense)
                )
            elif t_name == "ja-r/multi":
                collected.extend(
                    extract_ja_r_multi_template(wxr, child, sense)
                )

    if linkage_type == "alt_forms":
        # Alternative forms are stored as Form objects, not Linkage.
        page_data[-1].forms.extend(
            Form(
                form=l_data.word,
                sense=l_data.sense,
                tags=l_data.tags + ["alternative"],
                raw_tags=l_data.raw_tags,
                roman=l_data.roman,
                ruby=l_data.ruby,
                attestations=l_data.attestations,
            )
            for l_data in collected
        )
    else:
        last_entry = page_data[-1]
        getattr(last_entry, linkage_type).extend(collected)
        # Copy to earlier entries that represent the same word section.
        for entry in page_data[:-1]:
            if (
                entry.lang_code == last_entry.lang_code
                and entry.sounds == last_entry.sounds
                and entry.etymology_texts == last_entry.etymology_texts
                and entry.pos_level == last_entry.pos_level == level_node.kind
            ):
                getattr(entry, linkage_type).extend(collected)
def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Parse a single list item of a linkage section.

    Returns the (possibly updated) sense string and the linkages found
    in this item, including linkages from nested sub-lists.
    """
    pending_tags: list[str] = []
    results: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            t_name = child.template_name
            if t_name in ("s", "sense"):
                sense = clean_node(wxr, None, child).strip("(): ")
            elif t_name in ("qualifier", "qual"):
                # Qualifier tags apply to the linkage that follows them.
                pending_tags.append(clean_node(wxr, None, child).strip("()"))
            elif t_name == "zh-l":
                results.extend(
                    process_zh_l_template(wxr, child, sense, pending_tags)
                )
                pending_tags.clear()
            elif t_name == "ja-r":
                results.append(
                    process_ja_r_template(wxr, child, sense, pending_tags)
                )
                pending_tags.clear()
            elif t_name.lower() in ("l", "link", "alter", "alt"):
                results.extend(
                    process_l_template(wxr, child, sense, pending_tags)
                )
                pending_tags.clear()
            elif (
                t_name.lower() in ("defdate", "datedef")
                and len(results) > 0
            ):
                # Imported lazily to avoid a circular import with gloss.py.
                from .gloss import extract_defdate_template

                extract_defdate_template(wxr, results[-1], child)
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if len(word) > 0:
                l_data = Linkage(
                    word=word, sense=sense, raw_tags=pending_tags
                )
                translate_raw_tags(l_data)
                results.append(l_data)
                pending_tags.clear()
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Recurse into nested lists; their sense changes stay local.
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                _, sub_results = process_linkage_list_item(
                    wxr, sub_item, sense
                )
                results.extend(sub_results)

    # Trailing qualifiers with no following linkage attach to the last one.
    if len(pending_tags) > 0 and len(results) > 0:
        results[-1].raw_tags.extend(pending_tags)
        translate_raw_tags(results[-1])
    return sense, results
def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialect synonyms from an expanded {{zh-dial}} table.

    The expanded template is an HTML table: data rows pair a language
    header with region links and word spans, and a final "註解" (notes)
    row maps footnote symbols to explanatory text.
    """
    # Imported lazily — presumably to avoid a circular import; confirm.
    from .pronunciation import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        # First pass: collect the footnote map from the "註解" row.
        # Notes are ";"-separated "symbol - text" pairs.
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    # A header cell toggles note mode for the data cells
                    # that follow in the same row.
                    is_note_row = clean_node(wxr, None, cell_node) == "註解"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        # Second pass: extract the dialect words themselves.
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # Row header names the language/lect for this row's words.
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["註解"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    # Wiki links in the cell name the region(s).
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # A Chinese word span (skip the page's own title).
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # Small-font spans hold comma-separated footnote
                        # symbols; resolve them via note_tags and attach
                        # to the most recent word.
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                        translate_raw_tags(linkage_list[-1])

    return linkage_list
def process_zh_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkages from a {{zh-l}} template.

    https://zh.wiktionary.org/wiki/Template:Zh-l
    Returns one Linkage per Chinese word span (traditional/simplified
    variants yield separate entries with the matching script tag).
    """
    # Fixed: `raw_tags: list[str] = []` was a mutable default argument,
    # shared between all calls that omit the parameter.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    # The template's second positional argument is a gloss that
    # overrides the sense inherited from the surrounding context.
    new_sense = clean_node(wxr, None, t_node.template_parameters.get(2, ""))
    if new_sense != "":
        sense = new_sense
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        # "/" is the separator between script variants, not a word.
        if linkage_data.word not in ["/", ""]:
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list
def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Extract one linkage from a {{ja-r}} template.

    https://zh.wiktionary.org/wiki/Template:Ja-r
    Expands the template and delegates parsing of the resulting HTML to
    process_expanded_ja_r_node().
    """
    # Fixed: the mutable default argument `raw_tags=[]` was shared
    # between calls; use None as the sentinel instead.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    return process_expanded_ja_r_node(
        wxr, expanded_node, sense, [] if raw_tags is None else raw_tags
    )
def process_expanded_ja_r_node(
    wxr: WiktextractContext,
    expanded_node: WikiNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Build a Linkage from an expanded {{ja-r}} HTML fragment.

    Reads the word (with ruby annotations stripped into ``ruby``), the
    romanization from "tr" spans, and an optional gloss from a
    "mention-gloss" span.
    """
    # Fixed: the mutable default argument `raw_tags=[]` was shared
    # between calls; use None as the sentinel instead.
    if raw_tags is None:
        raw_tags = []
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            # The word span; ruby (furigana) is split out separately.
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            # An inline gloss overrides the inherited sense.
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data
def process_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Extract linkages from {{l}}/{{link}}/{{alter}}/{{alt}} templates.

    https://zh.wiktionary.org/wiki/Template:l
    Words are spans whose lang attribute matches the template's language
    code; following "-Latn", "mention-gloss" and "ib-content" spans
    update the most recent word's roman, sense and raw tags.
    """
    # Fixed: return annotation said `-> None` although the function
    # returns a list; `raw_tags=[]` was a shared mutable default.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    linkage_list = []
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "").split()
        if span_lang == lang_code:
            linkage_data = Linkage(
                sense=sense,
                raw_tags=raw_tags,
                word=clean_node(wxr, None, span_tag),
            )
            if len(linkage_data.word) > 0:
                translate_raw_tags(linkage_data)
                linkage_list.append(linkage_data)
        elif span_lang.endswith("-Latn") and len(linkage_list) > 0:
            linkage_list[-1].roman = clean_node(wxr, None, span_tag)
        elif "mention-gloss" in span_class and len(linkage_list) > 0:
            linkage_list[-1].sense = clean_node(wxr, None, span_tag)
        elif "ib-content" in span_class and len(linkage_list) > 0:
            raw_tag = clean_node(wxr, None, span_tag)
            if raw_tag != "":
                linkage_list[-1].raw_tags.append(raw_tag)
                translate_raw_tags(linkage_list[-1])

    return linkage_list
def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkages from a column template such as {{col3}}.

    https://zh.wiktionary.org/wiki/Template:Col3
    Each <li> holds one entry; its qualifier tags and romanization are
    applied to every word span (traditional/simplified pair) inside it.
    """
    results: list[Linkage] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        item_words: list[Linkage] = []
        roman = ""
        item_tags: list[str] = []
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn"):
                roman = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_class:
                qualifier_text = clean_node(wxr, None, span_tag)
                # Tags may be joined by "或" ("or") or "、".
                for piece in re.split(r"或|、", qualifier_text):
                    piece = piece.strip()
                    if piece != "":
                        item_tags.append(piece)
            elif span_lang != "":
                word_data = Linkage(
                    word=clean_node(wxr, None, span_tag), sense=sense
                )
                if span_class == "Hant":
                    word_data.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    word_data.tags.append("Simplified-Chinese")
                if word_data.word != "":
                    item_words.append(word_data)
        # Qualifiers and roman apply to all words of this list item.
        for word_data in item_words:
            word_data.raw_tags.extend(item_tags)
            word_data.roman = roman
            translate_raw_tags(word_data)
        results.extend(item_words)

    return results
def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract inline linkage templates used inside a gloss.

    https://en.wiktionary.org/wiki/Template:synonyms
    Results are appended directly to the ``linkage_type`` field of
    ``word_entry``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    pending: list[Linkage] = []
    raw_tags: list[str] = []
    for outer_span in expanded_node.find_html("span"):
        for child in outer_span.children:
            if isinstance(child, HTMLNode) and child.tag == "span":
                child_lang = child.attrs.get("lang", "")
                child_class = child.attrs.get("class", "")
                if child_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, child),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    if child_class == "Hant":
                        l_data.tags.append("Traditional-Chinese")
                    elif child_class == "Hans":
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        pending.append(l_data)
                elif child_lang == f"{lang_code}-Latn" or "tr" in child_class:
                    # Romanization applies to all words collected so far.
                    roman = clean_node(wxr, None, child)
                    for l_data in pending:
                        l_data.roman = roman
                elif child_class == "mention-gloss":
                    sense = clean_node(wxr, None, child)
                    for l_data in pending:
                        l_data.sense = sense
                elif "qualifier-content" in child_class:
                    for raw_tag in clean_node(wxr, None, child).split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(child, str) and child.strip() == "、":
                # "、" separates word groups; flush the batch collected so far.
                getattr(word_entry, linkage_type).extend(pending)
                pending.clear()

    getattr(word_entry, linkage_type).extend(pending)
    for l_data in getattr(word_entry, linkage_type):
        translate_raw_tags(l_data)
def extract_ja_r_multi_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract linkages from a {{ja-r/multi}} template.

    The expanded template is a wiki list with one Japanese term per list
    item; each item is parsed like a single expanded {{ja-r}} node.
    """
    # Fixed: return annotation said `-> Linkage` although the function
    # returns a list of Linkage objects.
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for list_node in expanded_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            linkage_list.append(
                process_expanded_ja_r_node(wxr, list_item, sense, [])
            )

    return linkage_list