Coverage for src/wiktextract/extractor/en/descendant.py: 80%
124 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1from copy import deepcopy
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor import (
5 HTMLNode,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...datautils import data_append, data_extend
13from ...page import clean_node
14from ...tags import valid_tags
15from ...wxr_context import WiktextractContext
16from ..ruby import extract_ruby
17from .type_utils import DescendantData, WordData
def extract_descendant_section(
    wxr: WiktextractContext,
    word_entry: WordData,
    level_node: LevelNode,
    is_derived: bool,
):
    """Collect descendant entries from a section and store them on the entry.

    Descendants come from two places under ``level_node``: direct child
    templates named "cjkv" (case-insensitive) and every list found
    recursively below the section.  When ``is_derived`` is true, each
    top-level collected item gets the "derived" tag unless it already has
    it.  Nothing is written to ``word_entry`` when no descendant was found.
    """
    collected = []

    # Direct child templates: only "cjkv" is handled here.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if not isinstance(template, TemplateNode):
            continue
        if template.template_name.lower() != "cjkv":
            continue
        collected.extend(extract_cjkv_template(wxr, template))

    # get around unnecessarily pre-expanded "top" template:
    # lists are searched recursively, and `visited` (which the list item
    # extractor also fills for nested lists) prevents handling one twice.
    visited = set()
    for found_list in level_node.find_child_recursively(NodeKind.LIST):
        if found_list in visited:
            continue
        visited.add(found_list)
        for item in found_list.find_child(NodeKind.LIST_ITEM):
            item_data, _, _ = extract_desc_list_item(
                wxr, item, [], visited, []
            )
            collected.extend(item_data)

    if not collected:
        return

    if is_derived:
        for entry in collected:
            if "derived" not in entry.get("tags", []):
                data_append(entry, "tags", "derived")
    data_extend(word_entry, "descendants", collected)
def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[DescendantData]:
    """Expand a template node and extract descendants from its lists.

    The template is turned back into wikitext, re-parsed with full
    expansion, and every list found recursively in the result is passed,
    item by item, to ``extract_desc_list_item``.  Returns the top-level
    descendant items in encounter order.
    """
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded_root = wxr.wtp.parse(wikitext, expand_all=True)

    # Shared with the list item extractor, which records the nested lists
    # it processes, so the recursive search does not visit them again.
    visited = set()
    results = []
    for found_list in expanded_root.find_child_recursively(NodeKind.LIST):
        if found_list not in visited:
            visited.add(found_list)
            for item in found_list.find_child(NodeKind.LIST_ITEM):
                item_data, _, _ = extract_desc_list_item(
                    wxr, item, [], visited, []
                )
                results.extend(item_data)
    return results
def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[DescendantData],
    seen_lists: set[WikiNode],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[DescendantData], str, str]:
    """Extract descendant entries from one list item (or ``<li>`` tag).

    Walks the direct children of ``list_item``:

    * a string ending in ":" sets the current language name, and the code
      is looked up from it with ``name_to_code``;
    * a string that is just "," resets the "after word" state;
    * ``<span>`` tags are delegated to ``extract_desc_span_tag``;
    * ``<i>`` tags supply a romanization from nested ``span.Latn`` tags;
    * a fixed set of link/descendant templates is expanded and processed
      recursively by this same function.

    Nested ``<ul>``/``<li>`` tags and child lists are then processed
    recursively with this item's entries as their parents.

    Args:
        wxr: Extraction context.
        list_item: Node whose children are scanned.
        parent_data: Entries that receive this item's results under
            their "descendants" key.
        seen_lists: Lists already processed; nested lists handled here are
            added to it and lists already in it are skipped.
        raw_tags: Raw tags applied to words found in this item; may be
            appended to by ``extract_desc_span_tag``.
        lang_code: Language code inherited from the caller.
        lang_name: Language name inherited from the caller.

    Returns:
        ``(entries, lang_code, lang_name)`` where the language values are
        the ones in effect after the item has been processed.
    """
    # process list item node and <li> tag
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            lang_name = child.strip(": \n") or "unknown"
            lang_code = name_to_code(lang_name, "en") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                if roman != "":
                    data_list[-1]["roman"] = roman
                    # The entry before the last one also gets the
                    # romanization when it is tagged Traditional-Chinese.
                    if len(
                        data_list
                    ) > 1 and "Traditional-Chinese" in data_list[-2].get(
                        "tags", []
                    ):
                        data_list[-2]["roman"] = roman
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "zh-l",
            "zh-m",
            "link",  # used in Reconstruction pages
            "l",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # avoid add twice
                seen_lists,
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save lang data from desc template
            lang_code = new_l_code
            lang_name = new_l_name

    if (
        wxr.wtp.title.startswith("Reconstruction:")
        and len(data_list) == 0
        and (lang_code != "unknown" or lang_name != "unknown")
    ):
        # No word was found but a language is known: keep a language-only
        # entry so nested descendants still have a parent.
        data = DescendantData(lang_code=lang_code, lang=lang_name)
        if len(raw_tags) > 0:
            # Store a copy: `raw_tags` belongs to the caller and may be
            # appended to later, which must not alter this entry.
            data["raw_tags"] = list(raw_tags)
        data_list.append(data)

    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, seen_lists, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        if next_list in seen_lists:
            continue
        seen_lists.add(next_list)
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(
                wxr, next_list_item, data_list, seen_lists, []
            )

    for p_data in parent_data:
        data_extend(p_data, "descendants", data_list)
    return data_list, lang_code, lang_name
def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[DescendantData],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
    """Interpret one ``<span>`` tag found in a descendant list item.

    The span's ``class``, ``lang`` and ``title`` attributes decide what is
    done; the first matching case wins:

    * class "tr" or a ``lang`` ending in "-Latn": its text becomes the
      "roman" value of the latest entry;
    * class "qualifier-content", "gender" or "label-content": its
      comma-separated text becomes tags, attached to the latest entry when
      ``after_word`` is true, otherwise queued in ``before_word_raw_tags``;
    * any non-empty ``lang``: a new entry is built and appended to
      ``desc_lists``;
    * a non-empty ``title`` whose text is one of the symbols "→", "⇒",
      ">", "?": the title is appended to ``raw_tags``;
    * class "mention-gloss": its text becomes the "sense" of the latest
      entry.

    Args:
        wxr: Extraction context.
        span_tag: The ``<span>`` node to interpret.
        desc_lists: Entries collected so far for the list item; mutated.
        lang_code: Current language code ("unknown" if not determined).
        lang_name: Current language name.
        raw_tags: Tags applied to every new word entry; may be appended to.
        before_word_raw_tags: Tags seen before the next word; consumed and
            cleared when a word entry is created.
        after_word: Whether a word has already been seen since the last
            separator.

    Returns:
        The updated ``after_word`` flag: ``True`` after a ``lang`` span was
        handled, otherwise the value passed in.
    """
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        roman = clean_node(wxr, None, span_tag)
        if roman != "":
            # Reuse the text already computed instead of cleaning the
            # node a second time.
            desc_lists[-1]["roman"] = roman
            # The entry before the last one also gets the romanization
            # when it is tagged Traditional-Chinese.
            if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[
                -2
            ].get("tags", []):
                desc_lists[-2]["roman"] = roman
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag != "":
                if after_word:
                    data_append(
                        desc_lists[-1],
                        "tags" if raw_tag in valid_tags else "raw_tags",
                        raw_tag,
                    )
                else:
                    before_word_raw_tags.append(raw_tag)
    elif span_lang != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = DescendantData(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
        )
        for raw_tag_list in [before_word_raw_tags, raw_tags]:
            for raw_tag in raw_tag_list:
                data_append(
                    desc_data,
                    "tags" if raw_tag in valid_tags else "raw_tags",
                    raw_tag,
                )
        # Queued tags belong to this word only.
        before_word_raw_tags.clear()
        if len(ruby_data) > 0:
            desc_data["ruby"] = ruby_data
        if desc_data["lang_code"] == "unknown":
            # Fall back to the span's own lang attribute.
            desc_data["lang_code"] = span_lang
        if "Hant" in class_names:
            data_append(desc_data, "tags", "Traditional-Chinese")
        elif "Hans" in class_names:
            data_append(desc_data, "tags", "Simplified-Chinese")
        if desc_data["word"] not in ["", "/"]:
            desc_lists.append(deepcopy(desc_data))
        after_word = True
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        sense = clean_node(wxr, None, span_tag)
        if sense != "":
            desc_lists[-1]["sense"] = sense

    return after_word