Coverage for src/wiktextract/extractor/zh/descendant.py: 94%
92 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from ..ruby import extract_ruby
7from .models import Descendant, WordEntry
8from .tags import translate_raw_tags
def extract_descendant_section(
    wxr: WiktextractContext, level_node: WikiNode, page_data: list[WordEntry]
) -> None:
    """Extract a "Descendants" section into the current page entry.

    Walks the section's lists (and any ``{{CJKV}}`` template) for
    descendant data, attaches the results to the last entry in
    ``page_data``, and mirrors them onto earlier entries that share the
    same language, sounds, etymology and POS level.
    """
    collected: list[Descendant] = []
    for desc_tree in level_node.find_child(NodeKind.LIST):
        for item in desc_tree.find_child(NodeKind.LIST_ITEM):
            collected.extend(
                d
                for d in process_desc_list_item(wxr, item, [])
                if d.word != ""
            )
    for tpl in level_node.find_child(NodeKind.TEMPLATE):
        if tpl.template_name.lower() == "cjkv":
            collected.extend(process_cjkv_template(wxr, tpl))

    current = page_data[-1]
    current.descendants.extend(collected)
    # Copy the same descendants to sibling entries of the same word
    # (same language/sounds/etymology split across POS sections).
    for entry in page_data[:-1]:
        if (
            entry.lang_code == current.lang_code
            and entry.sounds == current.sounds
            and entry.etymology_text == current.etymology_text
            and entry.pos_level == current.pos_level == level_node.kind
        ):
            entry.descendants.extend(collected)
def process_cjkv_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> list[Descendant]:
    """Expand a ``{{CJKV}}`` template and collect its descendant entries."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    results: list[Descendant] = []
    visited = set()
    # The recursive search may surface a list node more than once; skip
    # any list already handled.
    for list_node in expanded.find_child_recursively(NodeKind.LIST):
        if list_node in visited:
            continue
        visited.add(list_node)
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            results.extend(
                d
                for d in process_desc_list_item(wxr, item, [])
                if d.word != ""
            )
    return results
def process_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    lang_code: str = "",
    lang_name: str = "",
) -> list[Descendant]:
    """Extract descendant data from one list item (or expanded ``<li>`` tag).

    Scans the item's children for language labels, romanizations,
    qualifier tags and descendant templates, then reads word forms from
    ``<span lang=...>`` tags. Child lists are processed recursively and
    their results attached as ``descendants`` of every entry in
    ``parent_data``.

    Returns the list of ``Descendant`` entries found at this level; when
    nothing with a word was found but a language name was, a bare
    language-only entry is returned so callers can reuse it.
    """
    # process list item node and <li> tag
    data_list = []
    data = Descendant(lang=lang_name, lang_code=lang_code)
    for child in list_item.children:
        # Plain text ending in ":" is a language label, e.g. "日語:".
        if isinstance(child, str) and child.strip().endswith(":"):
            data.lang = child.strip(": ")
            data.lang_code = name_to_code(data.lang, "zh")
        elif isinstance(child, HTMLNode) and child.tag == "span":
            class_names = child.attrs.get("class", "")
            if "Latn" in class_names or "tr" in class_names:
                # Romanization span.
                data.roman = clean_node(wxr, None, child)
            elif "qualifier-content" in class_names:
                # NOTE(review): clean_node is applied twice here; the
                # inner call already yields a string — presumably the
                # second pass is a no-op. Confirm before simplifying.
                raw_tag = clean_node(wxr, None, clean_node(wxr, None, child))
                if raw_tag != "":
                    data.raw_tags.append(raw_tag)
        elif isinstance(child, HTMLNode) and child.tag == "i":
            # Romanization may also be wrapped as <i><span class="Latn">.
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                data.roman = clean_node(wxr, None, span_tag)
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
        ]:
            if child.template_name.startswith("desc"):
                # First positional arg of desc/desctree is the lang code.
                data.lang_code = child.template_parameters.get(1)
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            # Recurse into the expanded template; pass an empty parent
            # list so results are not attached twice.
            for new_data in process_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                data.lang_code,
                data.lang,
            ):
                if new_data.word != "":
                    data_list.append(new_data)
                else:  # save lang data from desc template
                    data = new_data

    # Word forms live in <span lang=...> tags; one entry is saved per span.
    for span_tag in list_item.find_html("span", attr_name="lang"):
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        old_word = data.word
        data.word = clean_node(wxr, None, nodes_without_ruby)
        data.ruby = ruby_data
        if data.lang_code == "":
            data.lang_code = span_tag.attrs["lang"]
        span_lang = span_tag.attrs["lang"]
        if span_lang == "zh-Hant":
            data.tags.append("Traditional-Chinese")
        elif span_lang == "zh-Hans":
            # A simplified form follows its traditional twin; swap the tag.
            if "Traditional-Chinese" in data.tags:
                data.tags.remove("Traditional-Chinese")
            data.tags.append("Simplified-Chinese")
        if data.roman == data.word:
            if old_word == "":
                # The "romanization" span was actually the word itself.
                data.roman = ""
            else:  # roman tag also could have "lang"
                continue
        if data.word not in ["", "/"]:
            data_list.append(data.model_copy(deep=True))

    # Child items (either expanded HTML <ul>/<li> or wiki sub-lists)
    # become descendants of everything collected at this level.
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            process_desc_list_item(wxr, li_tag, data_list)
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            process_desc_list_item(wxr, next_list_item, data_list)

    translate_raw_tags(data)
    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    if len(data_list) == 0 and data.lang != "":
        # save lang name from desc template
        data_list.append(data)
    return data_list