Coverage for src/wiktextract/extractor/zh/gloss.py: 95%
176 statements
« prev ^ index » next     coverage.py v7.10.6, created at 2025-09-17 08:19 +0000
1import re
3from wikitextprocessor import NodeKind, WikiNode
4from wikitextprocessor.parser import TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from .example import extract_example_list_item
10from .models import AltForm, Classifier, Linkage, Sense, WordEntry
11from .tags import translate_raw_tags
# Label templates that expand to a parenthesised tag list.
# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset({"lb", "lbl", "label"})

# Form-of templates handled by process_form_of_template().
# https://zh.wiktionary.org/wiki/Category:之形式模板
# NOTE(review): "alt formaltform" looks like two fused entries
# ("alt form" + "altform") — verify against upstream before changing.
FORM_OF_TEMPLATES = frozenset(
    {
        "alt case",
        "alt formaltform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    }
)

# Abbreviation templates; the name keeps its historical typo ("TEMPALTES")
# because other code in this module refers to it.
ABBR_TEMPALTES = frozenset(
    {
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    }
)

# "Alternative name" templates specific to the Chinese Wiktionary.
ZH_ALT_OF_TEMPLATES = frozenset(
    {"zh-altname", "zh-alt-name", "中文別名", "中文别名"}
)
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract gloss senses from a wiki list node into ``page_data[-1]``.

    Each list item becomes a new ``Sense`` seeded from a deep copy of
    *parent_gloss_data*.  Nested "#" lists recurse as sub-glosses; other
    nested lists are treated as example lists.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []  # non-template nodes that make up the gloss text
        raw_tags = []  # label strings gathered from label templates
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                # "rfdef" is a request-for-definition placeholder; skip it.
                if node.template_name == "rfdef":
                    continue
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # Labels expand to "(tag1,tag2 或 tag3)"; split on commas
                    # and "或" ("or").
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Some templates render their label in angle brackets.
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # True return means the template already produced its own
                    # senses/gloss text; nothing more to do for this node.
                    pass
                elif node.template_name == "zh-mw":
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists are handled below (nested glosses / examples).
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may contain ruby (furigana); expand the
            # wikitext and separate ruby before producing plain gloss text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            gloss_data.ruby = ruby_data

        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )
def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Tag *sense* and extract target word(s) from a form-of style template.

    Return ``True`` if the template expands to its own sense list or its
    gloss text should not be added again in ``extract_gloss()``.
    """
    # https://en.wiktionary.org/wiki/Category:Form-of_templates
    # https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        # Abbreviations are recorded as a special kind of alt-of.
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if t_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # Target word(s) are usually inside an <i> tag of the expansion;
    # the last <i> tag found wins.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    # Fall back to the first wiki link when no <i> tag yielded words.
    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # The template expanded to several numbered sub-senses: the text
        # before the list is shared; each list item becomes its own gloss.
        shared_gloss = clean_node(
            wxr,
            None,
            list(
                expanded_template.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            ),
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False
def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split the cleaned text of *node* on "和" ("and") into word forms."""
    cleaned = clean_node(wxr, None, node)
    stripped_parts = (part.strip() for part in cleaned.split("和"))
    return [word for word in stripped_parts if word != ""]
def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract base forms and gloss text from an expanded erhua template.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    zh_spans = expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    for idx, span in enumerate(zh_spans):
        form = AltForm(word=clean_node(wxr, None, span))
        # First <span> carries the traditional form, later ones simplified.
        form.tags.append(
            "Traditional-Chinese" if idx == 0 else "Simplified-Chinese"
        )
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        # Strip the "(Mandarin)" prefix and record it as a tag instead.
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Parse the inline Chinese classifier (measure word) template."""
    # Chinese inline classifier template
    # https://zh.wiktionary.org/wiki/Template:分類詞
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # classifiers collected for the current group
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            # "/" is a separator; classifiers joined by it stay in one
            # group (presumably the trad./simp. pair — verify).
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # A new word not preceded by "/" starts a new group: flush
                # the previous group to the sense first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # The "title" attribute carries a label that is applied to the
            # classifiers collected so far.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
):
    """Record alt-of forms from an expanded zh-short/zh-abbrev template.

    https://zh.wiktionary.org/wiki/Template:Zh-short
    """
    romanization = ""
    for italic_tag in expanded_node.find_html("i"):
        # The last <i> tag found provides the romanization.
        romanization = clean_node(wxr, None, italic_tag)
    script_tags = {"Hant": "Traditional-Chinese", "Hans": "Simplified-Chinese"}
    for span_tag in expanded_node.find_html("span"):
        form = AltForm(
            word=clean_node(wxr, None, span_tag), roman=romanization
        )
        tag = script_tags.get(span_tag.attrs.get("class", ""))
        if tag is not None:
            form.tags.append(tag)
        # Skip empty spans and the "/" separator between script forms.
        if form.word not in ["", "/"]:
            sense.alt_of.append(form)
def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
):
    """Parse a "defdate"/"datedef" template into attestation data."""
    from .models import AttestationData, ReferenceData

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    date = clean_node(wxr, None, expanded_node).strip("() ")
    if date == "":
        return
    attestation = AttestationData(date=date)
    for ref_tag in expanded_node.find_html_recursively("ref"):
        ref_text = clean_node(wxr, None, ref_tag.children)
        if ref_text != "":
            attestation.references.append(
                ReferenceData(
                    text=ref_text, refn=ref_tag.attrs.get("name", "")
                )
            )
    sense.attestations.append(attestation)