Coverage for src/wiktextract/extractor/zh/example.py: 97% (166 statements)
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

from typing import Optional

from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .linkage import process_linkage_templates_in_gloss
from .models import Example, Sense, WordEntry
from .tags import translate_raw_tags

LINKAGE_TEMPLATES = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "hyper": "hypernyms",
    "hypernyms": "hypernyms",
    "hypo": "hyponyms",
    "hyponyms": "hyponyms",
}


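# Dispatch one gloss list item to the matching example/quotation template
# handler, falling back to plain text; linkage templates ("syn", "ant", ...)
# found under a gloss are forwarded to the linkage module instead.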
def extract_example_list_item(
    wxr: WiktextractContext,
    sense_data: Sense,
    list_item: WikiNode,
    page_data: list[WordEntry],
    parent_example: Optional[Example] = None,
) -> None:
    example_data = parent_example or Example()
    if list_item.contain_node(NodeKind.LIST) and not all(
        isinstance(n, TemplateNode)
        for n in list_item.invert_find_child(NodeKind.LIST)
    ):
        # plain text in the nested list, not using any template
        # https://zh.wiktionary.org/wiki/%, the second example
        extract_plain_text_example_list(wxr, list_item, example_data)
    else:
        # parse example templates
        for child in list_item.find_child(NodeKind.TEMPLATE):
            template_name = child.template_name
            if (
                template_name.startswith(("quote-", "RQ:"))
                or template_name == "quote"
            ):
                extract_quote_templates(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ja-x", "ja-usex"]:
                extract_template_ja_usex(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["zh-x", "zh-usex", "zh-q"]:
                sense_data.examples.extend(
                    extract_template_zh_x(wxr, child, example_data)
                )
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name in ["ux", "eg", "usex", "uxi", "coi"]:
                extract_template_ux(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            elif template_name == "Q":
                extract_template_Q(wxr, child, example_data)
                clean_node(wxr, sense_data, child)  # add cat link
            # coverage: partial branch, this condition was always true in the
            # test data, so the final else fallback never ran from here
            elif template_name in LINKAGE_TEMPLATES:
                process_linkage_templates_in_gloss(
                    wxr,
                    page_data,
                    child,
                    LINKAGE_TEMPLATES[template_name],
                    sense_data.glosses[0]
                    if len(sense_data.glosses) > 0
                    else "",
                )
            else:
                example_data.text = clean_node(wxr, None, child)

        for next_list_item in list_item.find_child_recursively(
            NodeKind.LIST_ITEM
        ):
            extract_example_list_item(
                wxr, sense_data, next_list_item, page_data, example_data
            )

    if len(example_data.text) > 0 and parent_example is None:
        sense_data.examples.append(example_data)


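# Plain-text example: the text before the nested list becomes the reference
# and the first item of the nested list becomes the example sentence.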
def extract_plain_text_example_list(
    wxr: WiktextractContext, list_item: WikiNode, example_data: Example
) -> None:
    for index, nested_list in list_item.find_child(
        NodeKind.LIST, with_index=True
    ):
        example_data.ref = clean_node(wxr, None, list_item.children[:index])
        example_data.text = clean_node(
            wxr, None, nested_list.children[0].children
        )


def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    """
    Process `quote-*` and "RQ:*" templates.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            example_data.ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            example_data.text = clean_node(wxr, None, span_tag)
        elif "e-translation" in span_class:
            example_data.translation = clean_node(wxr, None, span_tag)
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        example_data.roman = clean_node(wxr, None, i_tag)
        break


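# {{ja-usex}}/{{ja-x}}: Japanese text (with ruby) comes from the expanded
# "Jpan" span, the transliteration from the "tr" span, and the translation
# and literal meaning from the template's third and "lit" arguments.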
def extract_template_ja_usex(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
        example_data.text = clean_node(wxr, None, node_without_ruby)
        example_data.ruby = ruby_data
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data.roman = clean_node(wxr, None, span_tag)
    example_data.translation = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    example_data.literal_meaning = clean_node(
        wxr, None, node.template_parameters.get("lit", "")
    )


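# {{zh-x}}/{{zh-q}}: quotations expand to <dl>/<dd> blocks carrying the
# source, romanization and translation, while the example text sits in
# "zh-Hant"/"zh-Hans" spans; one Example is returned per script form.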
def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    parent_example: Example,
) -> list[Example]:
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    has_dl_tag = False
    results = []
    for dl_tag in expanded_node.find_html_recursively("dl"):
        example_data = parent_example.model_copy(deep=True)
        has_dl_tag = True
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("出自:"):
                example_data.ref = dd_text.removeprefix("出自:")
            else:
                is_roman = False
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data.roman = clean_node(wxr, None, span_tag)
                    is_roman = True
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data.raw_tags.append(span_text.strip("[]"))
                            break
                if not is_roman:
                    example_data.translation = dd_text
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        example_data = parent_example.model_copy(deep=True)
        # coverage: partial branch, this loop always found a "Latn" span and
        # broke early in the test data
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data.roman = clean_node(wxr, None, span_tag)
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data.raw_tags.append(span_text.strip("[]"))
        example_data.translation = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        example_data.literal_meaning = clean_node(
            wxr, None, template_node.template_parameters.get("lit", "")
        )
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                # coverage: partial branch, the condition was always true in
                # the test data
                if len(example_text) > 0:
                    new_example = example_data.model_copy(deep=True)
                    new_example.text = example_text
                    new_example.tags.append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    translate_raw_tags(new_example)
                    results.append(new_example)
    return results


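# Process the span tags inside a zh-x <dl> block: Hant/Hans spans become new
# examples, collapsed "vsHide" spans are handled recursively, and small-font
# dialect links are attached as raw tags to the preceding example.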
def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: Example
) -> list[Example]:
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = example.model_copy(deep=True)
            new_example.text = clean_node(wxr, None, span_tag)
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        # coverage: partial branch, this condition was always true in the
        # test data
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            for link_node in span_tag.find_child(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                # coverage: partial branch, the condition was always true in
                # the test data
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1].raw_tags.append(raw_tag)
                    else:
                        example.raw_tags.append(raw_tag)

    if dl_tag.tag == "dl":
        for data in results:
            translate_raw_tags(data)
    return results


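# {{ux}}, {{usex}}, {{uxi}}, {{eg}} and {{coi}}: read the expanded
# "e-example", "e-transliteration", "e-translation", "e-literally" and
# "qualifier-content" elements.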
def extract_template_ux(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:ux
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for i_tag in expanded_node.find_html_recursively("i"):
        i_class = i_tag.attrs.get("class", "")
        if "e-example" in i_class:
            example_data.text = clean_node(wxr, None, i_tag)
        # coverage: partial branch, this condition was always true in the
        # test data
        elif "e-transliteration" in i_class:
            example_data.roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "e-translation" in span_class:
            example_data.translation = clean_node(wxr, None, span_tag)
        elif "e-literally" in span_class:
            example_data.literal_meaning = clean_node(wxr, None, span_tag)
        elif "qualifier-content" in span_class:
            example_data.raw_tags.extend(
                clean_node(wxr, None, span_tag).split("、")
            )
            translate_raw_tags(example_data)


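# {{Q}}: the reference is everything in the "wiktQuote" div before its <dl>
# block; quote, translation and literal meaning come straight from the
# template arguments.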
def extract_template_Q(
    wxr: WiktextractContext, node: TemplateNode, example_data: Example
) -> None:
    # https://zh.wiktionary.org/wiki/Template:Q
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    for div_tag in expanded_node.find_html(
        "div", attr_name="class", attr_value="wiktQuote"
    ):
        ref_nodes = []
        # coverage: partial branch, a <dl> child was always present in the
        # test data, so this loop always exited through the break
        for child in div_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "dl":
                for i_tag in child.find_html_recursively(
                    "i", attr_name="class", attr_value="e-transliteration"
                ):
                    example_data.roman = clean_node(wxr, None, i_tag)
                break
            ref_nodes.append(child)
        ref_text = clean_node(wxr, None, ref_nodes)
        # coverage: partial branch, the condition was always true in the
        # test data
        if len(ref_text) > 0:
            example_data.ref = ref_text
        for t_arg, field in (
            ("quote", "text"),
            ("t", "translation"),
            ("trans", "translation"),
            ("lit", "literal_meaning"),
        ):
            value = clean_node(
                wxr, None, node.template_parameters.get(t_arg, "")
            )
            if len(value) > 0:
                setattr(example_data, field, value)
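

# Hypothetical usage sketch (not part of the module), assuming `wxr` is an
# initialized WiktextractContext and `list_item` is a LIST_ITEM node taken
# from a parsed gloss sub-list on a Chinese Wiktionary page:
#
#     sense = Sense(glosses=["some gloss"])
#     extract_example_list_item(wxr, sense, list_item, page_data=[])
#     for example in sense.examples:
#         print(example.model_dump(exclude_defaults=True))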