Coverage for src/wiktextract/extractor/vi/descendant.py: 7% of 95 statements
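"""Extract descendant data in the Vietnamese Wiktionary edition.

Walks descendant sections (wiki lists, "desc"-style link templates,
"cjkv" templates and HTML <li> trees) and fills
``WordEntry.descendants`` with ``Descendant`` models.
"""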
from mediawiki_langcodes import name_to_code
from wikitextprocessor import (
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Descendant, WordEntry
from .tags import translate_raw_tags


def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
):
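    """Collect descendants from the lists and ``cjkv`` templates placed
    directly under a descendant section heading."""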
    desc_list = []
    for node in level_node.children:
        if isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                desc_list.extend(
                    extract_desc_list_item(wxr, list_item, [], [])[0]
                )
        elif (
            isinstance(node, TemplateNode)
            and node.template_name.lower() == "cjkv"
        ):
            desc_list.extend(extract_cjkv_template(wxr, node))
    word_entry.descendants.extend(desc_list)


def extract_cjkv_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Descendant]:
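    """Expand a ``cjkv`` template and parse each list item in the
    expanded wikitext as a descendant entry."""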
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    desc_list = []
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        desc_list.extend(extract_desc_list_item(wxr, list_item, [], [])[0])
    return desc_list


def extract_desc_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    parent_data: list[Descendant],
    raw_tags: list[str],
    lang_code: str = "unknown",
    lang_name: str = "unknown",
) -> tuple[list[Descendant], str, str]:
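    """Parse one descendant list item.

    Returns the descendants found in this item together with the language
    code and name in effect at its end, so callers can reuse them for
    sibling nodes.
    """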
    # process list item node and <li> tag
    data_list = []
    before_word_raw_tags = []
    after_word = False
    for child in list_item.children:
        if isinstance(child, str) and child.strip().endswith(":"):
            lang_name = child.strip(": ") or "unknown"
            lang_code = name_to_code(lang_name, "vi") or "unknown"
        elif isinstance(child, str) and child.strip() == ",":
            after_word = False
        elif isinstance(child, HTMLNode) and child.tag == "span":
            after_word = extract_desc_span_tag(
                wxr,
                child,
                data_list,
                lang_code,
                lang_name,
                raw_tags,
                before_word_raw_tags,
                after_word,
            )
        elif (
            isinstance(child, HTMLNode)
            and child.tag == "i"
            and len(data_list) > 0
        ):
            for span_tag in child.find_html(
                "span", attr_name="class", attr_value="Latn"
            ):
                roman = clean_node(wxr, None, span_tag)
                data_list[-1].roman = roman
                if (
                    len(data_list) > 1
                    and "Traditional-Chinese" in data_list[-2].tags
                ):
                    data_list[-2].roman = roman
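        # known descendant/link templates are expanded and their output is
        # re-parsed as if it were part of this list item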
        elif isinstance(child, TemplateNode) and child.template_name in [
            "desctree",
            "descendants tree",
            "desc",
            "descendant",
            "ja-r",
            "jpn-r",
            "zh-l",
            "zho-l",
            "zh-m",
            "zho-m",
        ]:
            if child.template_name.startswith("desc"):
                lang_code = child.template_parameters.get(1, "") or "unknown"
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(child), expand_all=True
            )
            new_data, new_l_code, new_l_name = extract_desc_list_item(
                wxr,
                expanded_template,
                [],  # empty parent list, avoids adding the same data twice
                raw_tags,
                lang_code,
                lang_name,
            )
            data_list.extend(new_data)
            # save the language data reported by the desc template
            lang_code = new_l_code
            lang_name = new_l_name
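
    # recurse into nested HTML lists and nested wiki lists; their items
    # become descendants of the entries found on this line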
    for ul_tag in list_item.find_html("ul"):
        for li_tag in ul_tag.find_html("li"):
            extract_desc_list_item(wxr, li_tag, data_list, [])
    for next_list in list_item.find_child(NodeKind.LIST):
        for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
            extract_desc_list_item(wxr, next_list_item, data_list, [])

    for p_data in parent_data:
        p_data.descendants.extend(data_list)
    return data_list, lang_code, lang_name


def extract_desc_span_tag(
    wxr: WiktextractContext,
    span_tag: HTMLNode,
    desc_lists: list[Descendant],
    lang_code: str,
    lang_name: str,
    raw_tags: list[str],
    before_word_raw_tags: list[str],
    after_word: bool,
) -> bool:
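    """Handle one <span> inside a descendant line.

    Returns the updated ``after_word`` flag, which is True once a word has
    been appended to ``desc_lists``.
    """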
    class_names = span_tag.attrs.get("class", "").split()
    span_lang = span_tag.attrs.get("lang", "")
    span_title = span_tag.attrs.get("title", "")
    if ("tr" in class_names or span_lang.endswith("-Latn")) and len(
        desc_lists
    ) > 0:
        roman = clean_node(wxr, None, span_tag)
        desc_lists[-1].roman = roman
        if len(desc_lists) > 1 and "Traditional-Chinese" in desc_lists[-2].tags:
            desc_lists[-2].roman = roman
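    # qualifier, gender and label spans become raw tags, attached to the
    # previous word or held back for the next one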
    elif (
        "qualifier-content" in class_names
        or "gender" in class_names
        or "label-content" in class_names
    ) and len(desc_lists) > 0:
        for raw_tag in clean_node(wxr, None, span_tag).split(","):
            raw_tag = raw_tag.strip()
            if raw_tag == "":
                continue
            if after_word:
                desc_lists[-1].raw_tags.append(raw_tag)
                translate_raw_tags(desc_lists[-1])
            else:
                before_word_raw_tags.append(raw_tag)
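    # a span with a "lang" attribute holds the descendant word itself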
    elif span_lang != "":
        ruby_data, nodes_without_ruby = extract_ruby(wxr, span_tag)
        desc_data = Descendant(
            lang=lang_name,
            lang_code=lang_code,
            word=clean_node(wxr, None, nodes_without_ruby),
            ruby=ruby_data,
            raw_tags=before_word_raw_tags + raw_tags,
        )
        before_word_raw_tags.clear()
        if desc_data.lang_code == "unknown":
            desc_data.lang_code = span_lang
        if "Hant" in class_names:
            desc_data.tags.append("Traditional-Chinese")
        elif "Hans" in class_names:
            desc_data.tags.append("Simplified-Chinese")
        if desc_data.word not in ["", "/"]:
            translate_raw_tags(desc_data)
            desc_lists.append(desc_data)
            after_word = True
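    # arrow-like markers ("→", "⇒", ">", "?") carry an explanation in their
    # "title" attribute; keep that text as a raw tag for the next word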
    elif span_title != "" and clean_node(wxr, None, span_tag) in [
        "→",
        "⇒",
        ">",
        "?",
    ]:
        raw_tags.append(span_title)
    elif "mention-gloss" in class_names and len(desc_lists) > 0:
        desc_lists[-1].sense = clean_node(wxr, None, span_tag)

    return after_word