Coverage for src/wiktextract/extractor/en/descendant.py: 83%
154 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1from copy import deepcopy
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...datautils import data_append, data_extend
13from ...page import clean_node
14from ...tags import valid_tags
15from ...wxr_context import WiktextractContext
16from ..ruby import extract_ruby
17from .type_utils import DescendantData, WordData
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordData,
    level_node: LevelNode,
    is_derived: bool,
):
    """Collect descendant data from a section and store it on `word_entry`.

    Descendants come from two places: "cjkv" templates that are direct
    children of `level_node`, and every list found anywhere below
    `level_node`. When `is_derived` is true, each collected entry gets the
    "derived" tag unless it already has it. Nothing is written to
    `word_entry` when no descendant was found.
    """
    descendants = []
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not isinstance(template, TemplateNode):
            continue
        if template.template_name.lower() == "cjkv":
            descendants += extract_cjkv_template(wxr, template)

    visited_lists = set()
    # get around unnecessarily pre-expanded "top" template
    for list_node in level_node.find_child_recursively(NodeKind.LIST):
        # nested lists are registered in `visited_lists` by the list item
        # extractor, so they are not processed a second time here
        if list_node in visited_lists:
            continue
        visited_lists.add(list_node)
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited_lists, []
            )
            descendants.extend(item_data)

    if is_derived:
        for entry in descendants:
            if "derived" in entry.get("tags", []):
                continue
            data_append(entry, "tags", "derived")
    if descendants:
        data_extend(word_entry, "descendants", descendants)
def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[DescendantData]:
    """Expand a "cjkv" template and extract descendants from its lists.

    The template is converted back to wikitext, parsed again with all
    templates expanded, and every list item found in the result is handed
    to `extract_desc_list_item`. Returns the collected descendant data.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)
    visited_lists = set()
    results = []
    for list_node in expanded_root.find_child_recursively(NodeKind.LIST):
        # nested lists are added to `visited_lists` while their parent
        # list item is processed, skip them here
        if list_node in visited_lists:
            continue
        visited_lists.add(list_node)
        for item in list_node.find_child(NodeKind.LIST_ITEM):
            item_data = extract_desc_list_item(
                wxr, item, [], visited_lists, []
            )[0]
            results.extend(item_data)
    return results
def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_descendant_datas: list[DescendantData],
    seen_lists: set[WikiNode],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[DescendantData], str, str]:
    """Extract descendant data from a list item, `<li>` tag or expanded
    template node.

    Args:
        wxr: extraction context, used for parsing and node cleaning.
        list_item: node whose children are scanned.
        parent_descendant_datas: entries that receive the extracted data
            under their "descendants" key.
        seen_lists: list nodes already processed; nested lists handled here
            are added to it.
        raw_tags: tags that apply to words found in this item; span tag
            processing may append to this list.
        lang_code: language code inherited from the caller.
        lang_name: language name inherited from the caller.

    Returns:
        A tuple of the extracted data list and the language code and name
        in effect after the item was processed.
    """
    # process list item node and <li> tag
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str):
            child = child.strip()
            if child == ",":
                # tags after a comma are kept for the next word
                after_word = False
            elif child.endswith(":"):
                lang_name = child.strip(": \n") or "unknown"
                lang_code = (
                    choose_more_specific_langcode(
                        name_to_code(lang_name, "en"), lang_code
                    )
                    or "unknown"
                )
            elif lcode := name_to_code(child):
                lang_name = child
                lang_code = (
                    choose_more_specific_langcode(lcode, lang_code) or "unknown"
                )
            elif lname := does_text_look_like_language_name(child):
                lang_name = lname
                lang_code = (
                    choose_more_specific_langcode(
                        name_to_code(lang_name, "en"), lang_code
                    )
                    or "unknown"
                )
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                if roman != "":
                    data_list[-1]["roman"] = roman
                    # the preceding traditional Chinese form gets the same
                    # romanization
                    if len(
                        data_list
                    ) > 1 and "Traditional-Chinese" in data_list[-2].get(
                        "tags", []
                    ):
                        data_list[-2]["roman"] = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
            "link",  # used in Reconstruction pages
            "l",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                seen_lists,
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    # no word was found but a language is known: keep a language-only entry
    if len(data_list) == 0 and (
        lang_code != "unknown" or lang_name != "unknown"
    ):
        data = DescendantData(lang_code=lang_code, lang=lang_name)
        if len(raw_tags) > 0:
            # store a copy: `raw_tags` is owned by the caller and may still
            # be appended to while the caller processes further nodes
            data["raw_tags"] = list(raw_tags)
        data_list.append(data)

    # nested HTML and wikitext lists become descendants of this item's data
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, seen_lists, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        if next_list in seen_lists:
            continue
        seen_lists.add(next_list)
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, next_list_item, data_list, seen_lists, []
            )

    for p_data in parent_descendant_datas:
        data_extend(p_data, "descendants", data_list)
    return data_list, lang_code, lang_name
def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[DescendantData],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Process one `<span>` tag found in a descendant list item.

    Depending on the span's "class", "lang" and "title" attributes the span
    is treated as a romanization, a qualifier/gender/label, a word, a
    relation marker (arrow or similar symbol with a title) or a gloss.
    `desc_lists`, `raw_tags` and `before_word_raw_tags` are modified in
    place.

    Returns:
        The updated `after_word` flag: true once a span with a "lang"
        attribute has been handled as a word, otherwise the value passed in.
    """
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        roman = clean_node(wxr, None, span_tag)
        if roman != "":
            # reuse the cleaned text instead of cleaning the node twice
            desc_lists[-1]["roman"] = roman
            # the preceding traditional Chinese form gets the same
            # romanization
            if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[
                -2
            ].get("tags", []):
                desc_lists[-2]["roman"] = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                if after_word:
                    data_append(
                        desc_lists[-1],
                        "tags" if raw_tag in valid_tags else "raw_tags",
                        raw_tag,
                    )
                else:
                    # no word yet: keep the tag for the next word
                    before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = DescendantData(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
        )
        for raw_tag_list in [before_word_raw_tags, raw_tags]:
            for raw_tag in raw_tag_list:
                data_append(
                    desc_data,
                    "tags" if raw_tag in valid_tags else "raw_tags",
                    raw_tag,
                )
        before_word_raw_tags.clear()
        if len(ruby_data) > 0:
            desc_data["ruby"] = ruby_data
        if desc_data["lang_code"] == "unknown":
            # fall back to the span's own "lang" attribute
            desc_data["lang_code"] = span_lang
        if "Hant" in class_names:
            data_append(desc_data, "tags", "Traditional-Chinese")
        elif "Hans" in class_names:
            data_append(desc_data, "tags", "Simplified-Chinese")
        if desc_data["word"] not in ["", "/"]:
            desc_lists.append(deepcopy(desc_data))
        after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        # the symbol's title text is used as a raw tag
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        sense = clean_node(wxr, None, span_tag)
        if sense != "":
            desc_lists[-1]["sense"] = sense

    return after_word
def does_text_look_like_language_name(text: str) -> str | None:
    """Return the stripped text if it looks like a language name, else None.

    The text is accepted when any of its space- or hyphen-separated words is
    resolved by `name_to_code(word, "en")`. Otherwise a text of two or more
    words is accepted when every word starts with an uppercase character,
    and a shorter text is accepted when it ends with "ic", "ish" or "an".
    """
    candidate = text.strip()
    if candidate == "":
        return None
    words = candidate.replace("-", " ").split()
    for word in words:
        if name_to_code(word, "en"):
            return candidate
    if len(words) >= 2:
        looks_like_name = all(word[0].isupper() for word in words)
    else:
        # fewer than two words: rely on common language name endings
        looks_like_name = candidate.endswith(("ic", "ish", "an"))
    return candidate if looks_like_name else None
def choose_more_specific_langcode(new: str | None, old: str) -> str | None:
    """Pick between a newly found language code and the current one.

    `new` is returned as-is when `old` is "unknown". `old` is kept when
    `new` is None or empty, or when `old` is `new` followed by "-" and a
    suffix. In all other cases `new` is returned.
    """
    if old == "unknown":
        return new
    if not new:
        return old
    # "fa-cls" or "fa" -> "fa-cls"
    return old if old.startswith(f"{new}-") else new