# Coverage for src/wiktextract/extractor/zh/gloss.py: 94% (183 statements)
# coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
import re

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .example import extract_example_list_item
from .models import AltForm, Classifier, Linkage, Sense, WordEntry
from .tags import translate_raw_tags

# Label templates whose expansion is split into individual raw tags.
# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])

# "Form of" templates: the expansion names another word this sense is a
# form of, so the template is processed instead of kept as gloss text.
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = frozenset(
    [
        "alt case",
        "alt form",
        "altform",
        "alt sp",
        "construed with",
        "honor alt case",
        "missp",
        "obs sp",
        "rare sp",
        "rfform",
        "short for",
        "stand sp",
        "sup sp",
    ]
)

# Abbreviation templates; senses using them get "alt-of"/"abbreviation" tags.
# NOTE(review): name misspells "TEMPLATES" but is kept for compatibility
# with existing references.
ABBR_TEMPALTES = frozenset(
    [
        "之縮寫",
        "abbreviation of",
        "abbr of",
        "abbrev of",
        "zh-short",
        "zh-abbrev",
        "中文简称",
    ]
)

# Chinese "alternative name" templates; treated as alt-of forms.
ZH_ALT_OF_TEMPLATES = frozenset(
    ["zh-altname", "zh-alt-name", "中文別名", "中文别名"]
)
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract sense data from a wikitext gloss list.

    Each list item becomes a new `Sense` (copied from `parent_gloss_data`
    so nested glosses inherit tags); the sense is appended to
    `page_data[-1].senses` when it ends up with at least one gloss string.
    Nested "#"-lists recurse as sub-glosses; other nested lists are
    treated as example lists.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []
        raw_tags = []
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                # "rfdef" is a request-for-definition placeholder: skip.
                if node.template_name == "rfdef":
                    continue
                # Expanding through clean_node also records categories
                # into gloss_data as a side effect.
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name.lower() in LABEL_TEMPLATES:
                    # Label expansion looks like "(tag1, tag2 或 tag3)".
                    for r_tag in re.split(r",|或", raw_tag.strip("()")):
                        r_tag = r_tag.strip()
                        if r_tag != "":
                            raw_tags.append(r_tag)
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Angle-bracketed expansions are inline tags.
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name
                    in FORM_OF_TEMPLATES | ABBR_TEMPALTES | ZH_ALT_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # Helper returned True: it already added the gloss
                    # (or sub-senses), so don't collect the node again.
                    pass
                elif node.template_name == "zh-mw":
                    process_zh_mw_template(wxr, node, gloss_data)
                elif node.template_name.lower() in ["zh-obsolete", "†", "zh-o"]:
                    if "obsolete" not in gloss_data.tags:
                        gloss_data.tags.append("obsolete")
                elif node.template_name.lower() in ["defdate", "datedef"]:
                    extract_defdate_template(wxr, gloss_data, node)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists are handled after the gloss text below.
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may contain ruby (furigana) markup that
            # must be separated from the plain gloss text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            gloss_data.ruby = ruby_data
        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:  # example list
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data[-1]
                        )
def process_form_of_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Process a "form of" template and fill `sense.form_of`/`sense.alt_of`.

    Return `True` if the template expands to a list (each list item was
    already appended to `page_data[-1].senses` as its own sense) or the
    gloss should otherwise not be added again in `extract_gloss()`.

    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    # Classify the template: alternative-form, abbreviation, or plain form-of.
    is_alt_of = (
        re.search(r"^alt|alt[\s-]|alternative", t_node.template_name.lower())
        or t_node.template_name.lower() in ZH_ALT_OF_TEMPLATES
    )
    is_abbr = t_node.template_name.lower() in ABBR_TEMPALTES
    if is_alt_of:
        sense.tags.append("alt-of")
    elif is_abbr:
        sense.tags.extend(["alt-of", "abbreviation"])
    else:
        sense.tags.append("form-of")

    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # Erhua/pinyin templates have a special HTML layout; delegate.
    if t_node.template_name.endswith(("-erhua form of", "-pinyin of")):
        process_erhua_form_of_template(
            wxr, expanded_template, sense, t_node.template_name
        )
        return True
    elif (
        t_node.template_name.lower()
        in {"zh-short", "zh-abbrev", "中文简称"} | ZH_ALT_OF_TEMPLATES
    ):
        extract_zh_abbr_template(wxr, expanded_template, sense)
        return False

    # Prefer words from <i> tags; fall back to the first wiki link.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)
    if len(form_of_words) == 0:
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of or is_abbr:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    if expanded_template.contain_node(NodeKind.LIST):
        # Template expands to a list of glosses: emit one sense per list
        # item, each prefixed by the text outside the list.
        shared_gloss = clean_node(
            wxr,
            None,
            list(
                expanded_template.invert_find_child(
                    NodeKind.LIST, include_empty_str=True
                )
            ),
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False
def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split a form-of template child node's text into target words.

    The expanded text may name several words joined with "和" ("and");
    return the non-empty, whitespace-stripped pieces.
    """
    form_of_words = []
    span_text = clean_node(wxr, None, node)
    for form_of_word in span_text.split("和"):
        form_of_word = form_of_word.strip()
        if form_of_word != "":
            form_of_words.append(form_of_word)
    return form_of_words
def process_erhua_form_of_template(
    wxr: WiktextractContext,
    expanded_node: WikiNode,
    sense: Sense,
    template_name: str,
) -> None:
    """Extract forms, tags and gloss from an expanded erhua/pinyin template.

    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    # First lang="zh" span is the traditional form, any later ones simplified.
    for index, span_node in enumerate(
        expanded_node.find_html_recursively(
            "span", attr_name="lang", attr_value="zh"
        )
    ):
        span_text = clean_node(wxr, None, span_node)
        form = AltForm(word=span_text)
        if index == 0:
            form.tags.append("Traditional-Chinese")
        else:
            form.tags.append("Simplified-Chinese")
        if len(form.word) > 0:
            sense.form_of.append(form)

    gloss_nodes = []
    for node in expanded_node.children:
        if isinstance(node, HTMLNode) and node.tag == "small":
            # <small><span class="ib-content"> holds label-style raw tags.
            for span_node in node.find_html_recursively(
                "span", attr_name="class", attr_value="ib-content"
            ):
                raw_tag = clean_node(wxr, None, span_node)
                if raw_tag != "":
                    sense.raw_tags.append(raw_tag)
        else:
            gloss_nodes.append(node)
    gloss_text = clean_node(wxr, sense, gloss_nodes)
    if template_name.endswith("-erhua form of"):
        sense.tags.append("Erhua")
    if len(gloss_text) > 0:
        sense.glosses.append(gloss_text)
def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Process the Chinese inline classifier (measure word) template.

    Classifier words appear as Hani/Hant/Hans spans; a "/" separates a
    traditional/simplified pair that shares any following title tags.
    https://zh.wiktionary.org/wiki/Template:分類詞
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # pending group sharing the next title raw tags
    last_word = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional-Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified-Chinese")
                # A word not preceded by "/" starts a new group: flush
                # the previous group to the sense first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # The span's title attribute carries a raw tag for the
            # pending classifier group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)
    # Record any category links added by the template.
    for link in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, sense, link)
def extract_zh_abbr_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract alt-of forms from an expanded zh-short/zh-altname template.

    https://zh.wiktionary.org/wiki/Template:Zh-short
    """
    # The last <i> tag holds the romanization shared by all forms.
    roman = ""
    for i_tag in expanded_node.find_html("i"):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html("span"):
        span_class = span_tag.attrs.get("class", "")
        alt_form = AltForm(word=clean_node(wxr, None, span_tag), roman=roman)
        if span_class == "Hant":
            alt_form.tags.append("Traditional-Chinese")
        elif span_class == "Hans":
            alt_form.tags.append("Simplified-Chinese")
        # "/" spans are separators between traditional/simplified forms.
        if alt_form.word not in ["", "/"]:
            sense.alt_of.append(alt_form)
def extract_defdate_template(
    wxr: WiktextractContext, sense: Sense | Linkage, t_node: TemplateNode
) -> None:
    """Extract attestation date (and any <ref> citations) from a defdate
    template into `sense.attestations`."""
    # Imported here to avoid widening the module-level import surface.
    from .models import AttestationData, ReferenceData

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # The expansion is wrapped in parentheses; keep only the date text.
    date = clean_node(wxr, None, expanded_node).strip("() ")
    if date != "":
        attestation = AttestationData(date=date)
        for ref_tag in expanded_node.find_html_recursively("ref"):
            ref_text = clean_node(wxr, None, ref_tag.children)
            ref_name = ref_tag.attrs.get("name", "")
            if ref_text != "":
                attestation.references.append(
                    ReferenceData(text=ref_text, refn=ref_name)
                )
        sense.attestations.append(attestation)