Coverage for src/wiktextract/extractor/zh/descendant.py: 91%
101 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ..ruby import extract_ruby
15from .models import Descendant, WordEntry
16from .tags import TEMPLATE_TAG_ARGS, translate_raw_tags
def extract_descendant_section(
    wxr: WiktextractContext, level_node: LevelNode, page_data: list[WordEntry]
) -> None:
    """Collect the descendants of a section and store them on word entries.

    Every wikitext list that is a direct child of ``level_node`` and every
    template whose lower-cased name is ``cjkv`` is converted to descendant
    data.  The result is appended to the last entry of ``page_data`` and to
    each earlier entry that has the same language code, sounds, etymology
    text and POS level as the last entry, provided that POS level equals
    ``level_node.kind``.

    Args:
        wxr: Extraction context forwarded to the item/template processors.
        level_node: The section node whose children are scanned.
        page_data: Word entries of the page; at least one entry is expected
            because the last one is always updated.
    """
    desc_list = []
    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            # Walk only the items of *this* list.  Re-scanning every list of
            # the section for each list child would process each list once
            # per list in the section and duplicate the descendants.
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                desc_list.extend(
                    process_desc_list_item(wxr, list_item, [], [])[0]
                )
        elif (
            isinstance(node, TemplateNode)
            and node.template_name.lower() == "cjkv"
        ):
            desc_list.extend(process_cjkv_template(wxr, node))

    page_data[-1].descendants.extend(desc_list)
    # Share the descendants with earlier entries that belong to the same
    # word (same language, sounds, etymology and POS level).
    for data in page_data[:-1]:
        if (
            data.lang_code == page_data[-1].lang_code
            and data.sounds == page_data[-1].sounds
            and data.etymology_text == page_data[-1].etymology_text
            and data.pos_level == page_data[-1].pos_level == level_node.kind
        ):
            data.descendants.extend(desc_list)
def process_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Descendant]:
    """Expand a template node and extract descendants from its list items.

    The template is converted back to wikitext, parsed again with all
    templates expanded, and every list item found anywhere in the resulting
    tree is handed to ``process_desc_list_item``.

    Returns:
        The descendants of all list items, in traversal order.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)

    collected = []
    for item_node in expanded_root.find_child_recursively(NodeKind.LIST_ITEM):
        # Only the descendant list of the returned triple is needed here.
        item_descendants, _, _ = process_desc_list_item(
            wxr, item_node, [], []
        )
        collected.extend(item_descendants)
    return collected
def process_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[Descendant], str, str]:
    """Extract descendants from a list item node, ``<li>`` tag or parsed
    template expansion.

    The children of ``list_item`` are scanned in order:

    * a string ending with ``:`` sets the current language name and code;
    * a ``,`` string marks the start of the next word;
    * ``<span>`` tags are delegated to ``extract_desc_span_tag``;
    * ``<i>`` tags provide a romanization for the latest word;
    * descendant/link templates are expanded and processed recursively.

    Nested ``<ul>``/``<li>`` tags and nested wikitext lists are processed
    afterwards with the descendants of this item as their parents.

    Args:
        wxr: Extraction context.
        list_item: Node whose children are scanned.
        parent_data: Descendants that receive the extracted data in their
            ``descendants`` list.
        raw_tags: Raw tags applied to the words of this item; the list is
            passed on to ``extract_desc_span_tag`` and may be extended there.
        lang_code: Language code used until the item provides another one.
        lang_name: Language name used until the item provides another one.

    Returns:
        The extracted descendants and the language code and name that were
        current when the scan finished.
    """
    # process list item node and <li> tag
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            lang_name = child.strip(": ") or "unknown"
            lang_code = name_to_code(lang_name, "zh") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            # Keep the returned flag: without it `after_word` never becomes
            # True and tags that follow a word (gender, qualifier) are
            # queued for the next word instead of the word they describe.
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                data_list[-1].roman = roman
                # a traditional form directly before the latest word gets
                # the same romanization
                if (
                    len(data_list) > 1
                    and "Traditional-Chinese" in data_list[-2].tags
                ):
                    data_list[-2].roman = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
        ]:
            if child.template_name.startswith("desc"):
                # first argument of the descendant templates is the
                # language code
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = process_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    # nested HTML lists produced by expanded templates
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            process_desc_list_item(wxr, li_tag, data_list, [])
    # nested wikitext lists
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            process_desc_list_item(wxr, next_list_item, data_list, [])

    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    return data_list, lang_code, lang_name
def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[Descendant],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Interpret one ``<span>`` tag of a descendant list item.

    Depending on its ``class``, ``lang`` and ``title`` attributes the span is
    treated as a romanization, a group of tags, a descendant word, an arrow
    symbol whose title is kept as raw tag, or a gloss.  ``desc_lists``,
    ``raw_tags`` and ``before_word_raw_tags`` are updated in place.

    Args:
        wxr: Extraction context.
        span_tag: The span to interpret.
        desc_lists: Descendants found so far; new words are appended.
        lang_code: Language code for a new word; when it is ``"unknown"``
            the span's ``lang`` attribute is used instead.
        lang_name: Language name for a new word.
        raw_tags: Raw tags added to every new word.
        before_word_raw_tags: Tags waiting for the next word; emptied once a
            word span has been handled.
        after_word: Whether the span follows a word; decides if tags go to
            the latest word or wait for the next one.

    Returns:
        ``True`` when the span carried a ``lang`` attribute and was handled
        as a word, otherwise ``after_word`` unchanged.
    """
    attrs = span_tag.attrs
    css_classes = attrs.get("class", "").split()
    lang_attr = attrs.get("lang", "")
    title_attr = attrs.get("title", "")
    has_word = len(desc_lists) > 0

    # romanization of the latest word
    if has_word and ("tr" in css_classes or lang_attr.endswith("-Latn")):
        romanization = clean_node(wxr, None, span_tag)
        desc_lists[-1].roman = romanization
        if (
            len(desc_lists) > 1
            and "Traditional-Chinese" in desc_lists[-2].tags
        ):
            desc_lists[-2].roman = romanization
        return after_word

    # qualifier, gender or label text
    is_tag_span = (
        "qualifier-content" in css_classes
        or "gender" in css_classes
        or "label-content" in css_classes
    )
    if has_word and is_tag_span:
        # NOTE(review): both alternatives of this pattern look like the same
        # comma here - confirm whether one is meant to be a full-width comma.
        for piece in re.split(r",|,", clean_node(wxr, None, span_tag)):
            tag_text = piece.strip()
            if tag_text == "":
                continue
            if not after_word:
                before_word_raw_tags.append(tag_text)
            elif tag_text in TEMPLATE_TAG_ARGS:
                desc_lists[-1].tags.append(TEMPLATE_TAG_ARGS[tag_text])
            else:
                desc_lists[-1].raw_tags.append(tag_text)
                translate_raw_tags(desc_lists[-1])
        return after_word

    # descendant word
    if lang_attr != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        new_desc = Descendant(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
            ruby=ruby_data,
            raw_tags=before_word_raw_tags + raw_tags,
        )
        before_word_raw_tags.clear()
        if new_desc.lang_code == "unknown":
            new_desc.lang_code = lang_attr
        if "Hant" in css_classes:
            new_desc.tags.append("Traditional-Chinese")
        elif "Hans" in css_classes:
            new_desc.tags.append("Simplified-Chinese")
        # empty text and the "/" separator are not words
        if new_desc.word not in ["", "/"]:
            translate_raw_tags(new_desc)
            desc_lists.append(new_desc)
        return True

    if title_attr != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        # the title of the symbol describes the relation, keep it as raw tag
        raw_tags.append(title_attr)
    elif "mention-gloss" in css_classes and has_word:
        desc_lists[-1].sense = clean_node(wxr, None, span_tag)

    return after_word