Coverage for src/wiktextract/extractor/zh/gloss.py: 95%
135 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import NodeKind, WikiNode
4from wikitextprocessor.parser import TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..ruby import extract_ruby
9from .example import extract_example_list_item
10from .models import AltForm, Classifier, Sense, WordEntry
11from .tags import translate_raw_tags
# Label templates wrap a gloss in context/usage labels such as "(colloquial)".
# Their expanded text is split into raw tags in extract_gloss().
# https://zh.wiktionary.org/wiki/Template:Label
LABEL_TEMPLATES = frozenset(["lb", "lbl", "label"])
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_gloss_data: Sense,
) -> None:
    """Extract sense glosses from a wikitext definition list.

    Each top-level list item becomes one ``Sense`` (cloned from
    ``parent_gloss_data`` so parent tags/glosses are inherited).  Nested
    "#"-lists recurse as sub-glosses; other nested lists are treated as
    example lists.  Completed senses are appended to
    ``page_data[-1].senses``.
    """
    lang_code = page_data[-1].lang_code
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = []  # nodes whose cleaned text forms the gloss string
        raw_tags = []  # label strings collected from templates on this item
        # Deep copy so each sense inherits (but does not share) parent data.
        gloss_data = parent_gloss_data.model_copy(deep=True)
        for node in list_item_node.children:
            if isinstance(node, TemplateNode):
                # "rfdef" = "request for definition" placeholder; skip it.
                if node.template_name == "rfdef":
                    continue
                raw_tag = clean_node(wxr, gloss_data, node)
                if node.template_name in LABEL_TEMPLATES:
                    # Label output looks like "(tag1,tag2)"; split into tags.
                    raw_tags.extend(raw_tag.strip("()").split(","))
                elif raw_tag.startswith("〈") and raw_tag.endswith("〉"):
                    # Angle-bracketed text is also a usage label.
                    raw_tags.append(raw_tag.strip("〈〉"))
                elif (
                    node.template_name in FORM_OF_TEMPLATES
                    or node.template_name.endswith((" of", " form", "-form"))
                ) and process_form_of_template(
                    wxr, node, gloss_data, page_data
                ):
                    # Helper returned True: it already added the data, so the
                    # template must not also contribute gloss text here.
                    pass
                elif node.template_name == "zh-mw":
                    # Inline Chinese classifier (measure word) template.
                    process_zh_mw_template(wxr, node, gloss_data)
                else:
                    gloss_nodes.append(node)
            elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
                # Child lists are handled below (nested glosses / examples).
                continue
            else:
                gloss_nodes.append(node)

        if lang_code == "ja":
            # Japanese glosses may carry ruby (furigana); expand templates
            # first, then separate ruby annotations from the gloss text.
            expanded_node = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_nodes), expand_all=True
            )
            ruby_data, nodes_without_ruby = extract_ruby(
                wxr, expanded_node.children
            )
            gloss_text = clean_node(wxr, gloss_data, nodes_without_ruby)
        else:
            ruby_data = []
            gloss_text = clean_node(wxr, gloss_data, gloss_nodes)

        gloss_data.raw_tags.extend(raw_tags)
        if len(gloss_text) > 0:
            gloss_data.glosses.append(gloss_text)
        if len(ruby_data) > 0:
            gloss_data.ruby = ruby_data

        has_nested_gloss = False
        if list_item_node.contain_node(NodeKind.LIST):
            for next_list in list_item_node.find_child(NodeKind.LIST):
                if next_list.sarg.endswith("#"):  # nested gloss
                    has_nested_gloss = True
                    extract_gloss(wxr, page_data, next_list, gloss_data)
                else:
                    # Non-"#" child lists hold example sentences/quotations.
                    for e_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                        extract_example_list_item(
                            wxr, gloss_data, e_list_item, page_data
                        )

        # Only leaf glosses become senses; parents of nested glosses are
        # represented through their children's inherited data.
        if not has_nested_gloss and len(gloss_data.glosses) > 0:
            translate_raw_tags(gloss_data)
            page_data[-1].senses.append(gloss_data)
def process_form_of_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: Sense,
    page_data: list[WordEntry],
) -> bool:
    """Handle a "form of" / "alternative form of" template on a gloss line.

    Return ``True`` if the template expands to a list or otherwise should
    not also be added as gloss text in ``extract_gloss()``.
    https://en.wiktionary.org/wiki/Category:Form-of_templates
    https://en.wiktionary.org/wiki/Category:Form-of_templates_by_language
    """
    # Template names starting with "alt", or containing "alt "/"alt-" or
    # "alternative", are treated as alternative-form templates; everything
    # else is a plain form-of template.
    is_alt_of = re.search(
        r"^alt|alt[\s-]|alternative", template_node.template_name.lower()
    )
    sense.tags.append("alt-of" if is_alt_of else "form-of")
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    if template_node.template_name.endswith("-erhua form of"):
        # Mandarin erhua templates have their own layout; delegate.
        process_erhua_form_of_template(wxr, expanded_template, sense)
        return True

    # Target word(s): prefer <i> tags from the expansion.
    # NOTE(review): this loop overwrites form_of_words on each <i> tag, so
    # only the last one is kept — looks intentional upstream, but confirm.
    form_of_words = []
    for i_tag in expanded_template.find_html_recursively("i"):
        form_of_words = process_form_of_template_child(wxr, i_tag)
    if len(form_of_words) == 0:
        # Fall back to the first wiki link in the expansion.
        for link_node in expanded_template.find_child_recursively(
            NodeKind.LINK
        ):
            form_of_words = process_form_of_template_child(wxr, link_node)
            break
    for form_of_word in form_of_words:
        form_of = AltForm(word=form_of_word)
        if is_alt_of:
            sense.alt_of.append(form_of)
        else:
            sense.form_of.append(form_of)

    # Some templates expand to a list of senses: the text before the list is
    # a shared gloss prefix, and each list item becomes its own sense.
    if expanded_template.contain_node(NodeKind.LIST):
        shared_gloss = clean_node(
            wxr, None, list(expanded_template.invert_find_child(NodeKind.LIST))
        )
        for list_item_node in expanded_template.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            new_sense = sense.model_copy(deep=True)
            new_sense.glosses.append(shared_gloss)
            new_sense.glosses.append(
                clean_node(wxr, None, list_item_node.children)
            )
            page_data[-1].senses.append(new_sense)
        return True

    return False
def process_form_of_template_child(
    wxr: WiktextractContext, node: WikiNode
) -> list[str]:
    """Split the cleaned text of *node* into target words.

    Multiple target words are joined by "和" ("and"); empty fragments are
    discarded after stripping surrounding whitespace.
    """
    text = clean_node(wxr, None, node)
    return [part for part in (p.strip() for p in text.split("和")) if part != ""]
def process_erhua_form_of_template(
    wxr: WiktextractContext, expanded_node: WikiNode, sense: Sense
) -> None:
    """Extract forms and gloss from an expanded erhua form-of template.

    The first ``<span lang="zh">`` holds the traditional spelling, any
    later one the simplified spelling.  The remaining text becomes the
    gloss; a leading "(官話)" marker is converted into a "Mandarin" tag.
    https://zh.wiktionary.org/wiki/Template:Cmn-erhua_form_of
    """
    for idx, span in enumerate(
        expanded_node.find_html("span", attr_name="lang", attr_value="zh")
    ):
        word = clean_node(wxr, None, span)
        form = AltForm(word=word)
        form.tags.append(
            "Traditional Chinese" if idx == 0 else "Simplified Chinese"
        )
        if word != "":
            sense.form_of.append(form)
    gloss_text = clean_node(wxr, sense, expanded_node)
    if gloss_text.startswith("(官話)"):
        gloss_text = gloss_text.removeprefix("(官話)").strip()
        sense.tags.append("Mandarin")
    sense.tags.append("Erhua")
    if gloss_text != "":
        sense.glosses.append(gloss_text)
# English-Wiktionary-style form-of template names whose output is handled by
# process_form_of_template() instead of being kept as gloss text.
# NOTE(review): some entries contain a comma ("alt case, altcaps") — these
# look like two template aliases fused into one string; verify against the
# template category before relying on exact matches.
# https://zh.wiktionary.org/wiki/Category:之形式模板
FORM_OF_TEMPLATES = {
    "alt case, altcaps",
    "alt form, altform",
    "alt sp",
    "construed with",
    "honor alt case",
    "missp",
    "obs sp",
    "rare sp",
    "rfform",
    "short for",
    "stand sp",
    "sup sp",
}
def process_zh_mw_template(
    wxr: WiktextractContext, node: TemplateNode, sense: Sense
) -> None:
    """Extract Chinese classifiers (measure words) from a "zh-mw" template.

    The expanded template is a sequence of ``<span>`` tags: spans with
    class "Hani"/"Hant"/"Hans" hold classifier words, a literal "/" span
    separates the traditional and simplified spellings of the *same*
    classifier, and spans with a ``title`` attribute carry a dialect label
    that applies to the pending classifier group.  Results are appended to
    ``sense.classifiers``.
    https://zh.wiktionary.org/wiki/Template:分類詞
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    classifiers = []  # current group: trad/simpl variants of one classifier
    last_word = ""  # previous word; "/" means the next span is a variant
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if span_class in ["Hani", "Hant", "Hans"]:
            word = clean_node(wxr, None, span_tag)
            if word != "/":
                classifier = Classifier(classifier=word)
                if span_class == "Hant":
                    classifier.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    classifier.tags.append("Simplified Chinese")

                # A new word not preceded by "/" starts a new classifier
                # group, so flush the finished group first.
                if len(classifiers) > 0 and last_word != "/":
                    sense.classifiers.extend(classifiers)
                    classifiers.clear()
                classifiers.append(classifier)
            last_word = word
        elif "title" in span_tag.attrs:
            # Tooltip text is a dialect/usage label for the pending group.
            raw_tag = clean_node(wxr, None, span_tag.attrs["title"])
            if len(raw_tag) > 0:
                for classifier in classifiers:
                    classifier.raw_tags.append(raw_tag)
    sense.classifiers.extend(classifiers)  # flush the final group
    for classifier in sense.classifiers:
        translate_raw_tags(classifier)