# Coverage for src/wiktextract/extractor/zh/gloss.py: 95% (176 statements)
# coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from wikitextprocessor import NodeKind, WikiNode
4from wikitextprocessor.parser import TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from .example import extract_example_list_item
10from .models import AltForm, Classifier, Linkage, Sense, WordEntry
11from .tags import translate_raw_tags
# Label templates prefix a gloss with usage labels, e.g. "(俚語)".
# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

# Form-of templates: the gloss is a pointer to another word's form.
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        "alt form",
        "alt sp",
        "altform",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)

# Abbreviation templates: the gloss says the word abbreviates another term.
# NOTE: the misspelled name "ABBR_TEMPALTES" is kept for compatibility with
# existing references elsewhere in this module.
ABBR_TEMPALTES = frozenset(
    [
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    ]
)

# "Alternative name" templates specific to Chinese entries.
ZH_ALT_OF_TEMPLATES = frozenset(
    ["zh-altname", "zh-alt-name", "中文別名", "中文别名"]
)
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract one level of a gloss list into ``page_data[-1].senses``.

    Each list item becomes a new ``Sense`` copied from ``parent_gloss_data``
    (so nested glosses inherit tags and parent gloss text).  Templates inside
    the item are dispatched to the appropriate handler; remaining nodes are
    cleaned into the gloss text.  Recurses into nested "#"-lists as child
    glosses; other nested lists are treated as example lists.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []
        raw_tags = []
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                if node.template_name == "rfdef":
                    # "request for definition" placeholder — no real gloss
                    continue
                # Expanding via clean_node also records any categories the
                # template emits onto gloss_data as a side effect.
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # Label text like "(口語, 貶義 或 俚語)" — split on comma
                    # and "或" ("or") into individual raw tags.
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Angle-bracketed label rendered by some templates
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # Handler returned True: it already added the gloss/senses,
                    # so don't also append this template's text below.
                    pass
                elif node.template_name == "zh-mw":
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists are handled after the gloss text is built
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may carry ruby (furigana) markup; expand the
            # wikitext and separate the ruby annotations from the base text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
            if len(ruby_data) > 0:
                gloss_data.ruby = ruby_data
            translate_raw_tags(gloss_data)
            if len(gloss_data.glosses) > 0:
                page_data[-1].senses.append(gloss_data)
        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:  # example list
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )
def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a form-of/alt-of/abbreviation template inside a gloss.

    Tags the sense, extracts the referenced word(s) into ``sense.alt_of`` or
    ``sense.form_of``, and — when the template expands to its own list of
    sub-definitions — appends one new sense per list item to ``page_data[-1]``.

    Return ``True`` if the caller should NOT append this template's text as
    gloss again in ``extract_gloss()``.

    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    # "alt case", "alt-sp", "alternative spelling of", zh alt-name templates…
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    if t_node.template_name.endswith("-erhua form of"):
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # The referenced word is usually in an <i> tag; fall back to the first
    # wiki link when no <i> is present.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)

    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # Template expands to several sub-definitions: the text before the
        # list is a gloss shared by all of them.
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False
def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Return the target word(s) named in a form-of template child node.

    The cleaned node text may list several words joined by "和" ("and");
    split on it and drop empty fragments.
    """
    span_text = clean_node(wxr, None, node)
    return [
        word.strip()
        for word in span_text.split("和")
        if word.strip() != ""
    ]
def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Handle the expanded "Cmn-erhua form of" template.

    The first ``<span lang="zh">`` holds the traditional form, the second the
    simplified form; both are added to ``sense.form_of``.  The template text
    itself (minus the leading "(官話)" label) becomes the gloss.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    for index, span_node in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        span_text = clean_node(wxr, None, span_node)
        form = AltForm(word=span_text)
        if index == 0:
            form.tags.append("Traditional-Chinese")
        else:
            form.tags.append("Simplified-Chinese")
        if len(form.word) > 0:
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract classifier words from the Chinese inline classifier template.

    The expanded template interleaves classifier spans (classes "Hani",
    "Hant", "Hans") with "/" separators (traditional/simplified pairs) and
    title-attribute spans carrying usage labels that apply to the pending
    classifiers.  Classifiers are buffered until it is clear no more labels
    apply to them, then flushed to ``sense.classifiers``.

    https://zh.wiktionary.org/wiki/Template:分類詞
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")

                # A new word not preceded by "/" starts a new classifier
                # group; flush the buffered group first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Usage label (e.g. dialect) applying to the buffered classifiers
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
):
    """Collect the abbreviated/alternative forms from an expanded "zh-short"
    style template into ``sense.alt_of``.

    https://zh.wiktionary.org/wiki/Template:Zh-short
    """
    # The romanization lives in an <i> tag and is shared by every form;
    # keep the text of the last one found.
    romanization = ""
    for italic_tag in expanded_node.find_html("i"):
        romanization = clean_node(wxr, None, italic_tag)
    script_tags = {
        "Hant": "Traditional-Chinese",
        "Hans": "Simplified-Chinese",
    }
    for span_tag in expanded_node.find_html("span"):
        form = AltForm(
            word=clean_node(wxr, None, span_tag), roman=romanization
        )
        tag = script_tags.get(span_tag.attrs.get("class", ""))
        if tag is not None:
            form.tags.append(tag)
        # Skip empty spans and the "/" separator between script variants
        if form.word != "" and form.word != "/":
            sense.alt_of.append(form)
def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
):
    """Extract an attestation date (and its <ref> citations) from a
    "defdate"/"datedef" template into ``sense.attestations``.
    """
    # Local import avoids widening the module-level import list for models
    # that only this handler needs.
    from .models import AttestationData, ReferenceData

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Template renders as "(date)"; strip the parentheses and whitespace.
    date = clean_node(wxr, None, expanded_node).strip("() ")
    if date != "":
        attestation = AttestationData(date=date)
        for ref_tag in expanded_node.find_html_recursively("ref"):
            ref_text = clean_node(wxr, None, ref_tag.children)
            ref_name = ref_tag.attrs.get("name", "")
            if ref_text != "":
                attestation.references.append(
                    ReferenceData(text=ref_text, refn=ref_name)
                )
        sense.attestations.append(attestation)