# Coverage report artifact (coverage.py v7.6.10, created 2024-12-27 08:07 +0000):
# src/wiktextract/extractor/zh/thesaurus.py — 92% of 124 statements covered.
1import re
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor import Page
5from wikitextprocessor.parser import (
6 LEVEL_KIND_FLAGS,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...page import clean_node
13from ...thesaurus import ThesaurusTerm
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .section_titles import LINKAGE_TITLES, POS_TITLES
17from .tags import translate_raw_tags
# Prefix stripped from level-4 sense headings, e.g. "詞義:友誼" -> "友誼".
SENSE_SUBTITLE_PREFIX = "詞義:"

# Section headings that carry no thesaurus terms and are skipped outright.
IGNORED_SUBTITLES = frozenset(
    [
        "參見",  # see also
        "参见",
        "延伸閱讀",  # further reading
        "延伸阅读",
    ]
)
def parse_section(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    level_node: WikiNode,
) -> list[ThesaurusTerm]:
    """Recursively walk one section of a thesaurus page and collect terms.

    Level-3 headings name a part of speech, level-4 headings a sense
    (prefixed with "詞義:"), and level-5 headings a linkage type whose
    child templates and lists contain the actual terms.
    """
    terms: list[ThesaurusTerm] = []
    for child_level in level_node.find_child(LEVEL_KIND_FLAGS):
        title = clean_node(wxr, None, child_level.largs)
        if title in IGNORED_SUBTITLES:
            continue
        if child_level.kind == NodeKind.LEVEL3:
            # POS section: map the local heading to an English POS name.
            pos_en = POS_TITLES.get(title, {}).get("pos")
            if pos_en is None:
                logger.warning(
                    f"Unrecognized POS subtitle: {title} in page "
                    f"Thesaurus:{entry_word}"
                )
                pos_en = title
            terms.extend(
                parse_section(
                    wxr, entry_word, lang_code, pos_en, "", "", child_level
                )
            )
        elif child_level.kind == NodeKind.LEVEL4:
            # Sense section: the heading minus its "詞義:" prefix is the gloss.
            new_sense = title.removeprefix(SENSE_SUBTITLE_PREFIX)
            terms.extend(
                parse_section(
                    wxr, entry_word, lang_code, pos, new_sense, "", child_level
                )
            )
        elif child_level.kind == NodeKind.LEVEL5:
            # Linkage section: terms live in templates or plain wiki lists.
            linkage_en = LINKAGE_TITLES.get(title)
            if linkage_en is None:
                logger.warning(
                    f"Unrecognized linkage subtitle: {title} "
                    f"in page Thesaurus:{entry_word}"
                )
                linkage_en = title
            for child in child_level.find_child(
                NodeKind.LIST | NodeKind.TEMPLATE
            ):
                if isinstance(child, TemplateNode):
                    terms.extend(
                        process_linkage_template(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            linkage_en,
                            child,
                        )
                    )
                elif child.kind == NodeKind.LIST:
                    terms.extend(
                        process_list_node(
                            wxr,
                            entry_word,
                            lang_code,
                            pos,
                            sense,
                            linkage_en,
                            child,
                        )
                    )
    return terms
def process_linkage_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
    """Dispatch a linkage template to the extractor that understands it.

    "colN" column templates and the obsolete "zh-der" family are handled;
    anything else yields no terms.
    """
    stripped_name = template_node.template_name.strip()
    if re.fullmatch(r"col\d", stripped_name, re.I):
        return process_col_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )
    if template_node.template_name.lower() in (
        "zh-der",
        "zh-syn-list",
        "zh-ant-list",
    ):
        return process_obsolete_zh_der_template(
            wxr, entry_word, lang_code, pos, sense, linkage_type, template_node
        )
    return []
def process_list_node(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    list_node: WikiNode,
) -> list[ThesaurusTerm]:
    """Extract terms from a wiki list node.

    Each list item may contain qualifier templates (collected as raw tags
    for every term in the item) and "ja-r" templates naming the terms.
    """
    results: list[ThesaurusTerm] = []
    for item in list_node.find_child(NodeKind.LIST_ITEM):
        item_terms: list[ThesaurusTerm] = []
        item_raw_tags: list[str] = []
        for tpl in item.find_child(NodeKind.TEMPLATE):
            if tpl.template_name.lower() in (
                "qual",
                "i",
                "qf",
                "qualifier",
            ):
                # Qualifier templates annotate the terms in this item.
                for param_value in tpl.template_parameters.values():
                    item_raw_tags.append(clean_node(wxr, None, param_value))
            elif tpl.template_name == "ja-r":
                item_terms.append(
                    process_thesaurus_ja_r_template(
                        wxr,
                        entry_word,
                        lang_code,
                        pos,
                        sense,
                        linkage_type,
                        tpl,
                    )
                )
        # Apply the item's qualifiers to every term found in it.
        for term in item_terms:
            term.raw_tags.extend(item_raw_tags)
            translate_raw_tags(term)
        results.extend(item_terms)

    return results
def process_col_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
    """Extract terms from an expanded "colN" column template.

    https://zh.wiktionary.org/wiki/Template:Col3

    Each <li> holds <span> tags: a "*-Latn" span is romanization, a
    "qualifier-content" span is a raw tag, and any other span with a
    "lang" attribute is a term (tagged by its Hant/Hans class).
    """
    results: list[ThesaurusTerm] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for li_tag in expanded.find_html_recursively("li"):
        li_terms: list[ThesaurusTerm] = []
        roman_text = ""
        li_raw_tags: list[str] = []
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            if span_lang.endswith("-Latn"):
                roman_text = clean_node(wxr, None, span_tag)
            elif "qualifier-content" in span_tag.attrs.get("class", ""):
                li_raw_tags.append(clean_node(wxr, None, span_tag))
            elif span_lang != "":
                term = ThesaurusTerm(
                    entry_word,
                    lang_code,
                    pos,
                    linkage_type,
                    clean_node(wxr, None, span_tag),
                    sense=sense,
                )
                span_class = span_tag.attrs.get("class", "")
                if span_class == "Hant":
                    term.tags.append("Traditional Chinese")
                elif span_class == "Hans":
                    term.tags.append("Simplified Chinese")
                li_terms.append(term)
        # Qualifiers and romanization apply to every term in the item.
        for term in li_terms:
            term.raw_tags.extend(li_raw_tags)
            term.roman = roman_text
            translate_raw_tags(term)
        results.extend(li_terms)

    return results
def process_obsolete_zh_der_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> list[ThesaurusTerm]:
    """Extract terms from the obsolete "zh-der" family of templates.

    https://zh.wiktionary.org/wiki/Template:Zh-der

    Each expanded list item holds <span> tags: "Latn"-classed spans carry
    romanization, other lang-attributed spans carry term text. A bare "/"
    span (separator between traditional/simplified forms) is skipped.
    """
    results: list[ThesaurusTerm] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        item_terms: list[ThesaurusTerm] = []
        roman_text = ""
        for span_tag in item.find_html_recursively("span"):
            if "Latn" in span_tag.attrs.get("class", ""):
                roman_text = clean_node(wxr, None, span_tag)
            elif span_tag.attrs.get("lang", "") != "":
                span_text = clean_node(wxr, None, span_tag)
                if span_text == "/":
                    continue
                item_terms.append(
                    ThesaurusTerm(
                        entry_word,
                        lang_code,
                        pos,
                        linkage_type,
                        span_text,
                        sense=sense,
                    )
                )
        # The romanization span covers every term of the item.
        for term in item_terms:
            term.roman = roman_text
        results.extend(item_terms)

    return results
def process_thesaurus_ja_r_template(
    wxr: WiktextractContext,
    entry_word: str,
    lang_code: str,
    pos: str,
    sense: str,
    linkage_type: str,
    template_node: TemplateNode,
) -> ThesaurusTerm:
    """Convert one "ja-r" (Japanese ruby) template into a ThesaurusTerm."""
    # Imported lazily to avoid a circular import with the linkage module.
    from .linkage import process_ja_r_template

    ja_r_data = process_ja_r_template(wxr, template_node, "")
    return ThesaurusTerm(
        entry_word,
        lang_code,
        pos,
        linkage_type,
        ja_r_data.word,
        sense=sense,
        roman=ja_r_data.roman,
    )
def extract_thesaurus_page(
    wxr: WiktextractContext, page: Page
) -> list[ThesaurusTerm]:
    """Parse one "Thesaurus:..." page into a list of ThesaurusTerm."""
    # Drop the "Thesaurus:" namespace prefix to recover the entry word.
    entry_word = page.title[page.title.find(":") + 1 :]
    wxr.wtp.start_page(page.title)
    root = wxr.wtp.parse(page.body)
    terms: list[ThesaurusTerm] = []
    for level2_node in root.find_child(NodeKind.LEVEL2):
        lang_name = clean_node(wxr, None, level2_node.largs)
        lang_code = name_to_code(lang_name, "zh")
        if lang_code == "":
            logger.warning(
                f"Unrecognized language: {lang_name} in page Thesaurus:{entry_word}"
            )
        terms.extend(
            parse_section(wxr, entry_word, lang_code, "", "", "", level2_node)
        )
    return terms