Coverage for src/wiktextract/extractor/en/example.py: 8% (141 statements)
from copy import deepcopy

from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .type_utils import ExampleData, SenseData


def extract_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    sense_data: SenseData,
    parent_data: ExampleData,
) -> list[ExampleData]:
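    """Dispatch the example templates found in a gloss list item.

    {{zh-x}}/{{zh-q}} and {{ja-usex}}/{{ja-x}} templates go to dedicated
    extractors; "quote-*", "RQ:*" and "quote" templates are handled by
    extract_quote_templates, and any nested list items under a quote are
    processed recursively with the quote data passed down as parent data.
    """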
    examples = []
    for template_node in list_item.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in ["zh-x", "zh-q"]:
            examples.extend(
                extract_template_zh_x(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif template_node.template_name in ["ja-usex", "ja-x"]:
            examples.append(
                extract_template_ja_usex(
                    wxr,
                    template_node,
                    sense_data,
                    parent_data,
                )
            )
        elif (
            template_node.template_name.startswith(("quote-", "RQ:"))
            or template_node.template_name == "quote"
        ):
            q_example = extract_quote_templates(wxr, template_node, sense_data)
            if list_item.contain_node(NodeKind.LIST):
                for next_list_item in list_item.find_child_recursively(
                    NodeKind.LIST_ITEM
                ):
                    for key in ["tags", "raw_tags"]:
                        if key not in q_example:
                            q_example[key] = []
                    examples.extend(
                        extract_example_list_item(
                            wxr, next_list_item, sense_data, q_example
                        )
                    )
            else:
                examples.append(q_example)

    return examples


def extract_quote_templates(
    wxr: WiktextractContext, node: TemplateNode, sense_data: SenseData
) -> ExampleData:
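    """Extract a quotation from an expanded quote-*, RQ: or quote template.

    The citation source, quotation text, translation and transliteration
    are read from the expanded HTML via the "cited-source", "e-quotation",
    "e-translation" and "e-transliteration" CSS classes.
    """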
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    ref = ""
    text = ""
    translation = ""
    roman = ""
    for span_tag in expanded_node.find_html_recursively("span"):
        span_class = span_tag.attrs.get("class", "")
        if "cited-source" == span_class:
            ref = clean_node(wxr, None, span_tag)
        elif "e-quotation" in span_class:
            text = clean_node(wxr, None, span_tag)
        elif "e-translation" in span_class:
            translation = clean_node(wxr, None, span_tag)
    for i_tag in expanded_node.find_html_recursively(
        "i", attr_name="class", attr_value="e-transliteration"
    ):
        roman = clean_node(wxr, None, i_tag)
        break
    example_data = ExampleData(
        text=text, ref=ref, english=translation, roman=roman, type="quote"
    )
    clean_example_empty_data(example_data)
    return example_data


def extract_template_ja_usex(
    wxr: WiktextractContext,
    node: TemplateNode,
    sense_data: SenseData,
    example_data: ExampleData,
) -> ExampleData:
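    """Extract a Japanese usage example from an expanded {{ja-usex}} template.

    Ruby annotations are split out of the "Jpan" span, the romanization is
    taken from the "tr" span, and the translation and literal meaning come
    from the template arguments (3 and "lit").
    """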
    # https://en.wiktionary.org/wiki/Template:ja-usex
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    for span_tag in expanded_node.find_html(
        "span", attr_name="class", attr_value="Jpan"
    ):
        ruby_data, node_without_ruby = extract_ruby(wxr, span_tag)
        example_data["text"] = clean_node(wxr, None, node_without_ruby)
        example_data["ruby"] = ruby_data
    for span_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="tr"
    ):
        example_data["roman"] = clean_node(wxr, None, span_tag)
    example_data["english"] = clean_node(
        wxr, None, node.template_parameters.get(3, "")
    )
    example_data["literal_meaning"] = clean_node(
        wxr, None, node.template_parameters.get("lit", "")
    )
    clean_example_empty_data(example_data)
    return example_data


def extract_template_zh_x(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense_data: SenseData | None,
    parent_example: ExampleData,
) -> list[ExampleData]:
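    """Extract examples from an expanded {{zh-x}} or {{zh-q}} template.

    Examples with a source are rendered inside <dl>/<dd> tags and are
    processed together with extract_zh_x_dl_span_tag; single-line examples
    without a source are read directly from the expanded spans.  Separate
    entries are created for traditional and simplified script forms.
    """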
    # https://en.wiktionary.org/wiki/Template:zh-x
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    clean_node(wxr, sense_data, expanded_node)
    has_dl_tag = False
    results = []
    for dl_tag in expanded_node.find_html_recursively("dl"):
        has_dl_tag = True
        example_data = deepcopy(parent_example)
        example_data["english"] = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        for dd_tag in dl_tag.find_html("dd"):
            dd_text = clean_node(wxr, None, dd_tag)
            if dd_text.startswith("From:"):
                example_data["ref"] = dd_text.removeprefix("From:")
            else:
                for span_tag in dd_tag.find_html_recursively(
                    "span", attr_name="lang", attr_value="Latn"
                ):
                    example_data["roman"] = clean_node(wxr, None, span_tag)
                    for span_tag in dd_tag.find_html_recursively("span"):
                        span_text = clean_node(wxr, None, span_tag)
                        if span_text.startswith("[") and span_text.endswith(
                            "]"
                        ):
                            example_data["raw_tags"].append(
                                span_text.strip("[]")
                            )
                    break
        results.extend(extract_zh_x_dl_span_tag(wxr, dl_tag, example_data))

    # no source, single line example
    if not has_dl_tag:
        example_data = deepcopy(parent_example)
        for span_tag in expanded_node.find_html(
            "span", attr_name="lang", attr_value="Latn"
        ):
            example_data["roman"] = clean_node(wxr, None, span_tag)
            break
        for span_tag in expanded_node.find_html("span"):
            span_text = clean_node(wxr, None, span_tag)
            if span_text.startswith("[") and span_text.endswith("]"):
                example_data["raw_tags"].append(span_text.strip("[]"))
        example_data["english"] = clean_node(
            wxr, None, template_node.template_parameters.get(2, "")
        )
        example_data["literal_meaning"] = clean_node(
            wxr, None, template_node.template_parameters.get("lit", "")
        )
        for span_tag in expanded_node.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang in ["zh-Hant", "zh-Hans"]:
                example_text = clean_node(wxr, None, span_tag)
                if len(example_text) > 0:
                    new_example = deepcopy(example_data)
                    new_example["text"] = example_text
                    new_example["tags"].append(
                        "Traditional Chinese"
                        if span_lang == "zh-Hant"
                        else "Simplified Chinese"
                    )
                    clean_example_empty_data(new_example)
                    results.append(new_example)
    return results


def extract_zh_x_dl_span_tag(
    wxr: WiktextractContext, dl_tag: HTMLNode, example: ExampleData
) -> list[ExampleData]:
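    """Process the <span> children of a zh-x <dl> or collapsed block.

    "zh-Hant"/"zh-Hans" spans become new example entries, collapsed
    "vsHide" spans are recursed into, and small-font link spans add
    dialect labels as raw tags.
    """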
    # process example text span tag and dialect span tag
    results = []
    is_first_hide = True
    for span_tag in dl_tag.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        if span_lang in ["zh-Hant", "zh-Hans"]:
            new_example = deepcopy(example)
            new_example["text"] = clean_node(wxr, None, span_tag)
            results.append(new_example)
        elif "vsHide" in span_tag.attrs.get("class", ""):
            # template has arg "collapsed=y"
            results.extend(
                extract_zh_x_dl_span_tag(
                    wxr,
                    span_tag,
                    results[-1]
                    if is_first_hide and len(results) > 0
                    else example,
                )
            )
            is_first_hide = False
        elif "font-size:x-small" in span_tag.attrs.get("style", ""):
            for link_node in span_tag.find_child_recursively(NodeKind.LINK):
                raw_tag = clean_node(wxr, None, link_node)
                if len(raw_tag) > 0:
                    if len(results) > 0:
                        results[-1]["raw_tags"].append(raw_tag)
                    else:
                        example["raw_tags"].append(raw_tag)

    if dl_tag.tag == "dl":
        for data in results:
            clean_example_empty_data(data)
    return results


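# Script labels used by {{zh-x}}, mapped to canonical tag names.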
ZH_X_TAGS = {
    "trad.": "Traditional Chinese",
    "simp.": "Simplified Chinese",
}


def clean_example_empty_data(data: ExampleData) -> None:
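    """Normalize tags and drop empty fields from an example entry.

    Known raw tags are converted to canonical tags, "type" is set to
    "quote" when a reference is present and to "example" otherwise,
    and any empty values are removed.
    """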
    # remove empty data and convert raw tags
    raw_tags = data.get("raw_tags", [])
    new_raw_tags = []
    for raw_tag in raw_tags:
        if raw_tag in ZH_X_TAGS:
            data["tags"].append(ZH_X_TAGS[raw_tag])
        elif raw_tag in valid_tags:
            data["tags"].append(raw_tag)
        else:
            new_raw_tags.append(raw_tag)
    data["raw_tags"] = new_raw_tags
    if len(data.get("ref", "")) > 0:
        data["type"] = "quote"
    else:
        data["type"] = "example"
    for key, value in data.copy().items():
        if len(value) == 0:
            del data[key]