Coverage for src/wiktextract/extractor/ru/page.py: 72%
230 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1import re
2from typing import Any, Optional
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...config import POSSubtitleData
12from ...page import clean_node
13from ...wxr_context import WiktextractContext
14from ...wxr_logging import logger
15from .etymology import extract_etymology
16from .gloss import extract_gloss, process_meaning_template
17from .inflection import parse_adj_forms_table, parse_wikitext_forms_table
18from .linkage import (
19 extract_linkages,
20 extract_phrase_section,
21 process_related_block_template,
22)
23from .models import AltForm, Form, Sense, Sound, WordEntry
24from .pronunciation import (
25 extract_homophone_section,
26 extract_pronunciation_section,
27)
28from .section_titles import LINKAGE_TITLES, POS_TEMPLATE_NAMES, POS_TITLES
29from .tags import MORPHOLOGICAL_TEMPLATE_TAGS
30from .translation import extract_translations
def process_semantic_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    semantic_level_node: WikiNode,
):
    """Collect senses from "значение" templates found in the section's lists.

    Every list directly under ``semantic_level_node`` is searched recursively
    for templates. Each template named "значение" is handed to
    ``process_meaning_template``; the resulting sense is appended to the last
    entry of ``page_data`` only when it has at least one gloss.
    """
    for list_node in semantic_level_node.find_child(NodeKind.LIST):
        for template_node in list_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if template_node.template_name != "значение":
                continue
            word_entry = page_data[-1]
            sense = process_meaning_template(
                wxr, None, word_entry, template_node
            )
            # Senses without any gloss are dropped.
            if sense.glosses:
                word_entry.senses.append(sense)
# Maps the value of the "тип" parameter of the "morph" template to the POS
# name stored on the word entry (see get_pos_from_template). Short and long
# spellings of the same morpheme type map to the same value.
MORPH_TEMPLATE_ARGS = {
    # prefix
    "p": "prefix",
    "prefix": "prefix",
    # interfix
    "i": "interfix",
    "interfix": "interfix",
    # infix
    "in": "infix",
    "infix": "infix",
    # suffix
    "s": "suffix",
    "suffix": "suffix",
    # transfix
    "t": "transfix",
    "transfix": "transfix",
    # postfix is reported as a suffix
    "po": "suffix",
    "postfix": "suffix",
    # circumfix / confix
    "c": "circumfix",
    "confix": "circumfix",
    "circumfix": "circumfix",
    # root
    "r": "root",
    # ending is reported as a suffix
    "e": "suffix",
    "ending": "suffix",
}
def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> Optional[POSSubtitleData]:
    """Derive part-of-speech data from a single template node.

    Three sources are tried, in this order:

    1. For the "morph" template, the "тип" parameter is looked up in
       ``MORPH_TEMPLATE_ARGS``; a hit yields that POS with the "morpheme" tag.
    2. For the "заголовок"/"з" templates with a first positional argument,
       the first word of that argument (surrounding parentheses stripped) is
       looked up in ``POS_TITLES``.
    3. Otherwise, or when the lookups above miss, the lower-cased template
       name is split on whitespace and then on "-", and each piece is looked
       up in ``POS_TEMPLATE_NAMES``.

    Returns the matching POS data, or ``None`` when nothing matches or the
    "заголовок" argument contains no text.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        # Only plain strings can be dictionary keys here; a non-string
        # parameter value (possibly unhashable) must not crash the lookup.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_text = clean_node(
            wxr, None, template_node.template_parameters[1]
        ).strip("()")
        # split() returns [] for empty *and* whitespace-only text (e.g. the
        # argument "( )"), so test the word list instead of the raw length.
        pos_words = pos_text.split()
        if not pos_words:
            return None
        first_word = pos_words[0]
        if first_word in POS_TITLES:
            return POS_TITLES[first_word]

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None
def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> Optional[POSSubtitleData]:
    """Find part-of-speech data for a section node.

    Templates that are children of the node are checked first, then templates
    in the node's own content; the first one for which
    ``get_pos_from_template`` returns data wins. If no template matches, the
    cleaned text of all non-level children is lower-cased and scanned for any
    key of ``POS_TITLES``. Returns ``None`` when nothing is found.
    """
    # POS text could be in a child template or in the level node content.
    for find_templates in (level_node.find_child, level_node.find_content):
        for template_node in find_templates(NodeKind.TEMPLATE):
            found = get_pos_from_template(wxr, template_node)
            if found is not None:
                return found

    # Search for POS in section text
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for pos_title, pos_data in POS_TITLES.items():
        if pos_title in section_text:
            return pos_data
    return None
def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, tags and forms of the last entry from a morphology section.

    The POS found by ``get_pos`` (if any) is stored on ``page_data[-1]``.
    Then every template that is a child of ``level_node`` is expanded and:

    * passed to the adjective or the noun/verb forms-table parser, depending
      on the prefix of its name;
    * its expanded text is split on commas and every piece found in
      ``MORPHOLOGICAL_TEMPLATE_TAGS`` is added to the entry's tags;
    * the "степень" and "соотв" template parameters are split on commas and
      added as forms tagged "comparative" and "perfective" respectively.
    """
    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))

    form_param_tags = {
        "степень": "comparative",  # Шаблон:inflection/ru/adj
        "соотв": "perfective",  # Шаблон:Гл-блок
    }

    for child_node in level_node.find_child(NodeKind.TEMPLATE):
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(child_node), expand_all=True
        )
        word_entry = page_data[-1]
        name = child_node.template_name
        if name.startswith("прил"):
            parse_adj_forms_table(wxr, word_entry, expanded_template)
        elif name.startswith(("сущ", "гл")):
            parse_wikitext_forms_table(wxr, word_entry, expanded_template)

        # Tags come from the comma separated text of the expanded template.
        for node in expanded_template.children:
            node_text = clean_node(wxr, word_entry, node)
            for piece in map(str.strip, node_text.split(",")):
                if piece not in MORPHOLOGICAL_TEMPLATE_TAGS:
                    continue
                tr_tag = MORPHOLOGICAL_TEMPLATE_TAGS[piece]
                if isinstance(tr_tag, str):
                    word_entry.tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    word_entry.tags.extend(tr_tag)

        # Forms come straight from selected template parameters.
        for param, tag in form_param_tags.items():
            if param not in child_node.template_parameters:
                continue
            forms_text = clean_node(
                wxr, None, child_node.template_parameters[param]
            )
            for form in map(str.strip, forms_text.split(",")):
                if form:
                    word_entry.forms.append(Form(form=form, tags=[tag]))
def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section node to the extractor that matches its title.

    The lower-cased, cleaned section title selects the handler. Several
    handlers only run when the corresponding ``wxr.config.capture_*`` flag is
    set. Titles that match no branch are reported through ``wxr.wtp.debug``.
    Afterwards every child level node is parsed recursively and the
    section-end templates of ``level_node`` are processed for the last entry.
    """
    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    # Sections that are recognised but deliberately not extracted.
    ignored_titles = ("библиография", "прочее")

    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    config = wxr.config

    if section_title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "произношение" and config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif section_title in ("значение", "значения"):
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "родственные слова" and config.capture_linkages:
        # Word family
        for template_node in level_node.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "родств-блок":
                process_related_block_template(
                    wxr, page_data[-1], template_node
                )
    elif section_title == "этимология" and config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif section_title in phrase_titles and config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
    elif section_title in translation_titles and config.capture_translations:
        extract_translations(wxr, page_data[-1], level_node)
    elif section_title in LINKAGE_TITLES and config.capture_linkages:
        extract_linkages(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
    elif section_title in ignored_titles:
        pass
    elif section_title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif section_title == "омофоны" and config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    else:
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    # Nested sections are handled with the same dispatcher.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, next_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Each level-1 node is treated as a language section whose language code is
    the name of the first template in its heading (stripped of spaces and
    hyphens, "unknown" when absent). Languages not listed in
    ``wxr.config.capture_language_codes`` (when that is set) are skipped.
    For every level-2 node a copy of the language's base entry is pushed and
    its level-3 sections are parsed into it; other level nodes directly under
    the language section are parsed as well. Entries left without any sense
    receive a single sense tagged "no-gloss". Entries are dumped with
    ``model_dump(exclude_defaults=True)``.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # The language code is the name of the first heading template.
        first_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = ""
        if first_template is not None:
            lang_code = first_template.template_name.strip(" -")
        if not lang_code:
            lang_code = "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        pos_data = get_pos(wxr, level1_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True
            # Drop the pushed copy when the sections added nothing to it.
            if page_data[-1] == base_data or not has_level3:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_level_nodes = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for node_index, other_node in enumerate(other_level_nodes):
            # Make sure there is an entry of this language to write into.
            needs_entry = node_index == 0 and (
                not page_data
                or page_data[-1].lang_code != base_data.lang_code
            )
            if needs_entry:
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates found outside of sub-sections.

    Among the non-level children of ``level_node``, a template whose name
    starts with "Форма-" is processed directly; any other wiki node is
    searched recursively for such templates. Each hit is passed to
    ``process_form_template``.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            process_form_template(wxr, page_data, base_data, node)
            continue
        if not isinstance(node, WikiNode):
            continue
        for template_node in node.find_child_recursively(NodeKind.TEMPLATE):
            if template_node.template_name.startswith(form_prefix):
                process_form_template(wxr, page_data, base_data, template_node)
def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-..." template.

    The POS derived from the template name is written to ``base_data``. A
    deep copy of ``base_data`` then receives one sense per non-empty list
    item of the expanded template; each sense is marked "form-of" and linked
    to the base word taken from the "база" parameter (falling back to the
    first positional one) when that word is not empty. The "МФА" parameter,
    if present, becomes an IPA sound. The copy is appended to ``page_data``
    only when it gained at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    form_of = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    current_data = base_data.model_copy(deep=True)

    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if form_of:
            sense.form_of.append(AltForm(word=form_of))
            sense.tags.append("form-of")
        current_data.senses.append(sense)

    if ipa:
        current_data.sounds.append(Sound(ipa=ipa))
    if current_data.senses or current_data.sounds:
        # Called with the entry as second argument, unlike the lookups above.
        clean_node(wxr, current_data, template_node)
        page_data.append(current_data)
def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add the section's link texts to the entry as "romanization" forms.

    Only link nodes that are children of ``level_node`` are used; links whose
    cleaned text is empty are ignored.
    """
    link_texts = (
        clean_node(wxr, None, link_node)
        for link_node in level_node.find_child(NodeKind.LINK)
    )
    word_entry.forms.extend(
        Form(form=link_text, tags=["romanization"])
        for link_text in link_texts
        if link_text != ""
    )
def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Process the category and "zh-forms" templates placed in a section.

    Templates that are children of ``level_node`` and whose name is one of
    the listed category link templates are passed to ``clean_node`` together
    with ``word_entry``. A "zh-forms" template is handed to
    ``extract_zh_forms_template``. All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_templates = {
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    }
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        name = template_node.template_name
        if name in category_templates:
            clean_node(wxr, word_entry, template_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template_node)
def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    Only named (string-keyed) parameters are considered. Parameters named
    "s" or "t", optionally followed by digits, become forms tagged
    "Simplified Chinese" or "Traditional Chinese"; forms that are empty or
    equal to ``wxr.wtp.title`` are skipped. The "lit" parameter is stored as
    the entry's literal meaning.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = {
        "s": "Simplified Chinese",
        "t": "Traditional Chinese",
    }
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)
            continue
        script_match = re.fullmatch(r"([st])\d*", p_name)
        if script_match is None:
            continue
        form_data = Form(
            form=clean_node(wxr, None, p_value),
            tags=[script_tags[script_match.group(1)]],
        )
        if form_data.form not in ["", wxr.wtp.title]:
            base_data.forms.append(form_data)