Coverage for src/wiktextract/extractor/zh/gloss.py: 94%
138 statements
coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
  1  import re

  3  from wikitextprocessor import NodeKind, WikiNode
  4  from wikitextprocessor.parser import TemplateNode

  6  from ...page import clean_node
  7  from ...wxr_context import WiktextractContext
  8  from ..ruby import extract_ruby
  9  from .example import extract_example_list_item
 10  from .models import AltForm, Classifier, Sense, WordEntry
 11  from .tags import translate_raw_tags

 13  # https://zh.wiktionary.org/wiki/Template:Label
 14  LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])


 17  def extract_gloss(
 18      wxr: WiktextractContext,
 19      page_data: list[WordEntry],
 20      list_node: WikiNode,
 21      parent_gloss_data: Sense,
 22  ) -> None:
 23      lang_code = page_data[-1].lang_code
 24      for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
 25          gloss_nodes = []
 26          raw_tags = []
 27          gloss_data = parent_gloss_data.model_copy(deep=True)
 28          for node in list_item_node.children:
 29              if isinstance(node, TemplateNode):
 30                  if node.template_name == "rfdef":  [branch 30 ↛ 31 not taken: condition on line 30 was never true]
 31                      continue
 32                  raw_tag = clean_node(wxr, gloss_data, node)
 33                  if node.template_name in LABEL_TEMPLATES:
 34                      for r_tag in re.split(r",|或", raw_tag.strip("()")):
 35                          r_tag = r_tag.strip()
 36                          if r_tag != "":  [branch 36 ↛ 34 not taken: condition on line 36 was always true]
 37                              raw_tags.append(r_tag)
 38                  elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):  [branch 38 ↛ 39 not taken: condition on line 38 was never true]
 39                      raw_tags.append(raw_tag.strip("〈〉"))
 40                  elif (
 41                      node.template_name in FORM_OF_TEMPLATES
 42                      or node.template_name.endswith((" of", " form", "-form"))
 43                  ) and process_form_of_template(
 44                      wxr, node, gloss_data, page_data
 45                  ):
 46                      pass
 47                  elif node.template_name == "zh-mw":
 48                      process_zh_mw_template(wxr, node, gloss_data)
 49                  else:
 50                      gloss_nodes.append(node)
 51              elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
 52                  continue
 53              else:
 54                  gloss_nodes.append(node)

 56          if lang_code == "ja":
 57              expanded_node = wxr.wtp.parse(
 58                  wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
 59              )
 60              ruby_data, nodes_without_ruby = extract_ruby(
 61                  wxr, expanded_node.children
 62              )
 63              gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
 64          else:
 65              ruby_data = []
 66              gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

 68          gloss_data.raw_tags.extend(raw_tags)
 69          if len(gloss_text) > 0:
 70              gloss_data.glosses.append(gloss_text)
 71          if len(ruby_data) > 0:  [branch 71 ↛ 72 not taken: condition on line 71 was never true]
 72              gloss_data.ruby = ruby_data

 74          has_nested_gloss = False
 75          if list_item_node.contain_node(NodeKind.LIST):
 76              for next_list in list_item_node.find_child(NodeKind.LIST):
 77                  if next_list.sarg.endswith("#"):  # nested gloss
 78                      has_nested_gloss = True
 79                      extract_gloss(wxr, page_data, next_list, gloss_data)
 80                  else:
 81                      for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
 82                          extract_example_list_item(
 83                              wxr, gloss_data, e_list_item, page_data[-1]
 84                          )

 86          if not has_nested_gloss and len(gloss_data.glosses) > 0:
 87              translate_raw_tags(gloss_data)
 88              page_data[-1].senses.append(gloss_data)


 91  def process_form_of_template(
 92      wxr: WiktextractContext,
 93      template_node: TemplateNode,
 94      sense: Sense,
 95      page_data: list[WordEntry],
 96  ) -> bool:
 97      # Return `True` if the template expands to a list or if the gloss
 98      # should not be added again in `extract_gloss()`
 99      # https://en.wiktionary.org/wiki/Category:Form-of_templates
100      # https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
101      is_alt_of = re.search(
102          r"^alt|alt[\s-]|alternative", template_node.template_name.lower()
103      )
104      sense.tags.append("alt-of" if is_alt_of else "form-of")
105      expanded_template = wxr.wtp.parse(
106          wxr.wtp.node_to_wikitext(template_node), expand_all=True
107      )
108      if template_node.template_name.endswith("-erhua form of"):
109          process_erhua_form_of_template(wxr, expanded_template, sense)
110          return True

112      form_of_words = []
113      for i_tag in expanded_template.find_html_recursively("i"):
114          form_of_words = process_form_of_template_child(wxr, i_tag)

116      if len(form_of_words) == 0:
117          for link_node in expanded_template.find_child_recursively(  [branch 117 ↛ 122 not taken: the loop on line 117 didn't complete]
118              NodeKind.LINK
119          ):
120              form_of_words = process_form_of_template_child(wxr, link_node)
121              break
122      for form_of_word in form_of_words:
123          form_of = AltForm(word=form_of_word)
124          if is_alt_of:
125              sense.alt_of.append(form_of)
126          else:
127              sense.form_of.append(form_of)

129      if expanded_template.contain_node(NodeKind.LIST):
130          shared_gloss = clean_node(
131              wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
132          )
133          for list_item_node in expanded_template.find_child_recursively(
134              NodeKind.LIST_ITEM
135          ):
136              new_sense = sense.model_copy(deep=True)
137              new_sense.glosses.append(shared_gloss)
138              new_sense.glosses.append(
139                  clean_node(wxr, None, list_item_node.children)
140              )
141              page_data[-1].senses.append(new_sense)
142          return True

144      return False


147  def process_form_of_template_child(
148      wxr: WiktextractContext, node: WikiNode
149  ) -> list[str]:
150      form_of_words = []
151      span_text = clean_node(wxr, None, node)
152      for form_of_word in span_text.split("和"):
153          form_of_word = form_of_word.strip()
154          if form_of_word != "":  [branch 154 ↛ 152 not taken: condition on line 154 was always true]
155              form_of_words.append(form_of_word)
156      return form_of_words


159  def process_erhua_form_of_template(
160      wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
161  ) -> None:
162      # https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
163      for index, span_node in enumerate(
164          expanded_node.find_html("span", attr_name="lang", attr_value="zh")
165      ):
166          span_text = clean_node(wxr, None, span_node)
167          form = AltForm(word=span_text)
168          if index == 0:
169              form.tags.append("Traditional Chinese")
170          else:
171              form.tags.append("Simplified Chinese")
172          if len(form.word) > 0:  [branch 172 ↛ 163 not taken: condition on line 172 was always true]
173              sense.form_of.append(form)
174      gloss_text = clean_node(wxr, sense, expanded_node)
175      if gloss_text.startswith("(官話)"):  [branch 175 ↛ 178 not taken: condition on line 175 was always true]
176          gloss_text = gloss_text.removeprefix("(官話)").strip()
177          sense.tags.append("Mandarin")
178      sense.tags.append("Erhua")
179      if len(gloss_text) > 0:  [branch 179 ↛ exit not taken: condition on line 179 was always true]
180          sense.glosses.append(gloss_text)


183  # https://zh.wiktionary.org/wiki/Category:之形式模板
184  FORM_OF_TEMPLATES = {
185      "alt case, altcaps",
186      "alt form, altform",
187      "alt sp",
188      "construed with",
189      "honor alt case",
190      "missp",
191      "obs sp",
192      "rare sp",
193      "rfform",
194      "short for",
195      "stand sp",
196      "sup sp",
197  }


200  def process_zh_mw_template(
201      wxr: WiktextractContext, node: TemplateNode, sense: Sense
202  ) -> None:
203      # Chinese inline classifier template
204      # https://zh.wiktionary.org/wiki/Template:分類詞
205      expanded_node = wxr.wtp.parse(
206          wxr.wtp.node_to_wikitext(node), expand_all=True
207      )
208      classifiers = []
209      last_word = ""
210      for span_tag in expanded_node.find_html_recursively("span"):
211          span_class = span_tag.attrs.get("class", "")
212          if span_class in ["Hani", "Hant", "Hans"]:
213              word = clean_node(wxr, None, span_tag)
214              if word != "/":
215                  classifier = Classifier(classifier=word)
216                  if span_class == "Hant":
217                      classifier.tags.append("Traditional Chinese")
218                  elif span_class == "Hans":
219                      classifier.tags.append("Simplified Chinese")

221                  if len(classifiers) > 0 and last_word != "/":
222                      sense.classifiers.extend(classifiers)
223                      classifiers.clear()
224                  classifiers.append(classifier)
225              last_word = word
226          elif "title" in span_tag.attrs:
227              raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
228              if len(raw_tag) > 0:  [branch 228 ↛ 210 not taken: condition on line 228 was always true]
229                  for classifier in classifiers:
230                      classifier.raw_tags.append(raw_tag)
231      sense.classifiers.extend(classifiers)
232      for classifier in sense.classifiers:
233          translate_raw_tags(classifier)