Coverage for src/wiktextract/extractor/ru/page.py: 77%
246 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 10:14 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-18 10:14 +0000
1import re
2from typing import Any
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...config import POSSubtitleData
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .etymology import extract_etymology
17from .gloss import extract_gloss, process_meaning_template
18from .inflection import (
19 extract_прил_ru_comparative_forms,
20 parse_html_forms_table,
21 parse_wikitext_forms_table,
22)
23from .linkage import (
24 extract_alt_form_section,
25 extract_linkage_section,
26 extract_phrase_section,
27)
28from .models import AltForm, Form, Hyphenation, Sense, Sound, WordEntry
29from .pronunciation import (
30 extract_homophone_section,
31 extract_pronunciation_section,
32 extract_rhyme_section,
33)
34from .section_titles import (
35 ALT_FORM_SECTIONS,
36 LINKAGE_TITLES,
37 POS_TEMPLATE_NAMES,
38 POS_TITLES,
39)
40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS
41from .translation import extract_translations
def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect senses from the "значение" templates of a semantic section.

    Every template named "значение" found anywhere below the list children
    of ``level_node`` is handed to ``process_meaning_template``. The sense
    it returns is appended to the last entry of ``page_data`` only when it
    carries at least one gloss.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        meaning_templates = list_node.find_child_recursively(NodeKind.TEMPLATE)
        for meaning_template in meaning_templates:
            if meaning_template.template_name != "значение":
                continue
            sense = process_meaning_template(
                wxr, None, page_data[-1], meaning_template
            )
            if sense.glosses:
                page_data[-1].senses.append(sense)
# Maps the values accepted for the "тип" argument of the "morph" template
# (see get_pos_from_template) to the POS string stored on the word entry.
# Each row lists the aliases that share one POS value; rows are kept in the
# original key order so the resulting dict iterates exactly as before.
MORPH_TEMPLATE_ARGS: dict[str, str] = {
    alias: morpheme_pos
    for aliases, morpheme_pos in (
        (("p", "prefix"), "prefix"),
        (("i", "interfix"), "interfix"),
        (("in", "infix"), "infix"),
        (("s", "suffix"), "suffix"),
        (("t", "transfix"), "transfix"),
        (("po", "postfix"), "suffix"),
        (("c", "confix", "circumfix"), "circumfix"),
        (("r",), "root"),
        (("e", "ending"), "suffix"),
    )
    for alias in aliases
}
def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a single template, if possible.

    The lower-cased template name is checked in this order:

    1. ``morph``: the "тип" argument is looked up in MORPH_TEMPLATE_ARGS
       and, when found, returned with the "morpheme" tag.
    2. ``заголовок`` / ``з`` with a first positional argument: the first
       word of that argument (outer parentheses stripped) is looked up in
       POS_TITLES.
    3. names starting with "прил ru": the "часть речи" argument is searched
       for any POS_TITLES key; without that argument the POS is "adj".
    4. otherwise each space- and hyphen-separated piece of the name is
       looked up in POS_TEMPLATE_NAMES.

    Returns None when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    params = template_node.template_parameters
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = params.get("тип", "")
        # Only look up plain strings: a parameter value that is not a
        # string (presumably a node list when the argument contains
        # markup) may be unhashable and would raise TypeError here.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif template_name in {"заголовок", "з"} and 1 in params:
        pos_words = clean_node(wxr, None, params[1]).strip("()").split()
        # Covers both an empty argument and one that holds only whitespace
        # once the parentheses are removed (e.g. "( )"); indexing the
        # first word of the latter used to raise IndexError.
        if not pos_words:
            return None
        if pos_words[0] in POS_TITLES:
            return POS_TITLES[pos_words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(wxr, None, params.get("часть речи", "")).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for pos_string, pos_data in POS_TITLES.items():
            if pos_string in pos_arg:
                return pos_data
        # A non-empty but unrecognised argument falls through to the
        # name-based lookup below.

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None
def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section.

    Sources are tried in order and the first hit wins:

    1. templates that are direct children of ``level_node``;
    2. templates inside the node's content (``find_content``);
    3. the cleaned, lower-cased text of all non-level children, searched
       for any POS_TITLES key as a substring.

    Returns None when no source yields a POS.
    """

    def first_template_pos(template_nodes) -> POSSubtitleData | None:
        # Return the first non-None result of get_pos_from_template.
        for candidate in template_nodes:
            found = get_pos_from_template(wxr, candidate)
            if found is not None:
                return found
        return None

    pos_data = first_template_pos(level_node.find_child(NodeKind.TEMPLATE))
    if pos_data is None:
        # POS text could also be in the level node content
        pos_data = first_template_pos(
            level_node.find_content(NodeKind.TEMPLATE)
        )
    if pos_data is not None:
        return pos_data

    # Search for POS in section text
    non_level_children = list(
        level_node.invert_find_child(LEVEL_KIND_FLAGS, include_empty_str=True)
    )
    section_text = clean_node(wxr, None, non_level_children).lower()
    for pos_title, title_data in POS_TITLES.items():
        if pos_title in section_text:
            return title_data
    return None
def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill the last entry of ``page_data`` from a morphology section.

    Sets the POS and tags found by ``get_pos``. Then every direct child
    template is expanded, and from it are collected: category links,
    inflection tables and the "слоги" hyphenation (only for the template
    name prefixes listed below), comparative forms of "прил ru" templates,
    and tags whose text appears in MORPHOLOGICAL_TEMPLATE_TAGS.
    """
    # Name prefixes of templates whose expansion contains form tables.
    table_template_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        entry = page_data[-1]
        entry.pos = pos_data["pos"]
        entry.tags.extend(pos_data.get("tags", []))

    for template in level_node.find_child(NodeKind.TEMPLATE):
        template_wikitext = wxr.wtp.node_to_wikitext(template)
        expanded = wxr.wtp.parse(template_wikitext, expand_all=True)
        entry = page_data[-1]
        clean_node(wxr, entry, expanded)  # add category links
        name = template.template_name

        if name.startswith(table_template_prefixes):
            for wiki_table in expanded.find_child_recursively(NodeKind.TABLE):
                parse_wikitext_forms_table(wxr, entry, wiki_table)
            for html_table in expanded.find_html("table"):
                parse_html_forms_table(wxr, entry, html_table)
            syllables = clean_node(
                wxr, None, template.template_parameters.get("слоги", "")
            )
            if syllables != "":
                hyphenation = Hyphenation(parts=syllables.split("-"))
                entry.hyphenations.append(hyphenation)

        if name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(wxr, entry, expanded)

        for expanded_child in expanded.children:
            child_text = clean_node(wxr, entry, expanded_child)
            for piece in child_text.split(","):
                tag_value = MORPHOLOGICAL_TEMPLATE_TAGS.get(piece.strip())
                if isinstance(tag_value, str):
                    entry.tags.append(tag_value)
                elif isinstance(tag_value, list):
                    entry.tags.extend(tag_value)
def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The cleaned, lower-cased section title selects the handler; several
    handlers run only when the matching ``wxr.config.capture_*`` flag is
    set. Titles that match no handler and are not "see also" variants are
    reported through ``wxr.wtp.debug``. Afterwards every child level node
    is parsed recursively and the section-end templates of ``level_node``
    are applied to the last entry of ``page_data``.
    """
    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    gloss_titles = (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    # Sections that are skipped on purpose, without a debug message.
    ignored_titles = ("библиография", "прочее")
    see_also_titles = ("см. также", "смотреть также", "смотрите также")

    config = wxr.config
    title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(title)

    if title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif title in POS_TITLES:
        title_pos = POS_TITLES[title]
        page_data[-1].pos = title_pos["pos"]
        page_data[-1].tags.extend(title_pos.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "произношение" and config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif title in gloss_titles:
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "этимология" and config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif title in phrase_titles and config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, title)
    elif title in translation_titles and config.capture_translations:
        extract_translations(wxr, page_data[-1], level_node)
    elif title in LINKAGE_TITLES and config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[title], level_node
        )
    elif title in ignored_titles:
        pass
    elif title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif title == "омофоны" and config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    elif title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[title]
        )
    elif title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
    elif title not in see_also_titles:
        wxr.wtp.debug(
            f"Unprocessed section {title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, subsection)

    extract_section_end_templates(wxr, page_data[-1], level_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dictionaries.

    Each level-1 node is treated as a language section whose code is the
    name of the first template in the heading. Level-2 nodes each start a
    new entry copied from the language's base data and filled from their
    level-3 sections; other levels directly under the language node are
    parsed into the current entry. Entries left identical to the base data
    are dropped, "Форма-" templates are handled by
    ``extract_low_quality_page``, and entries without senses get a
    "no-gloss" sense.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for lang_node in tree.find_child(NodeKind.LEVEL1):
        # The language code is the name of the first heading template.
        heading_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = (
            heading_template.template_name.strip(" -")
            if heading_template is not None
            else ""
        )
        if lang_code == "":
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, lang_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, lang_node)
        pos_data = get_pos(wxr, lang_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in lang_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            level3_count = 0
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                level3_count += 1
            # Drop the entry when nothing was added to it or when the
            # level-2 node had no level-3 sections at all.
            if page_data[-1] == base_data or level3_count == 0:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_levels = lang_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for position, other_node in enumerate(other_levels):
            # Before the first such section, make sure there is an entry
            # for the current language to attach the data to.
            needs_entry = position == 0 and (
                len(page_data) == 0
                or page_data[-1].lang_code != base_data.lang_code
            )
            if needs_entry:
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, lang_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates found outside the child level nodes.

    A non-level child that is itself a template whose name starts with
    "Форма-" is passed to ``process_form_template``. Any other WikiNode
    child is searched recursively for such templates. Non-node children
    are ignored.
    """
    form_prefix = "Форма-"
    for child in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            form_prefix
        ):
            form_templates = (child,)
        elif isinstance(child, WikiNode):
            # Lazy on purpose: templates are handled as they are found.
            form_templates = (
                nested
                for nested in child.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)
def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-" template.

    The POS found by ``get_pos_from_template`` is written to ``base_data``
    itself. A deep copy of ``base_data`` then receives one sense per
    non-empty list item of the expanded template (marked "form-of" when
    the "база" or first positional argument names a base word) and the IPA
    from the "МФА" argument. The copy is appended to ``page_data`` only if
    it ended up with at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    base_word = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa_text = clean_node(wxr, None, params.get("МФА", ""))
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    entry = base_data.model_copy(deep=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        gloss = clean_node(wxr, None, item.children)
        if not gloss:
            continue
        sense = Sense(glosses=[gloss])
        if base_word:
            sense.form_of.append(AltForm(word=base_word))
            sense.tags.append("form-of")
        entry.senses.append(sense)

    if ipa_text:
        entry.sounds.append(Sound(ipa=ipa_text))
    if entry.senses or entry.sounds:
        # Run clean_node with the entry as target before storing it.
        clean_node(wxr, entry, template_node)
        page_data.append(entry)
def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add each non-empty direct link of the section as a romanization form.

    The cleaned text of every link child of ``level_node`` is appended to
    ``word_entry.forms`` with the "romanization" tag.
    """
    for link in level_node.find_child(NodeKind.LINK):
        romanized = clean_node(wxr, None, link)
        if romanized == "":
            continue
        word_entry.forms.append(Form(form=romanized, tags=["romanization"]))
def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle the known direct child templates of a section.

    Templates named in ``category_templates`` are passed to ``clean_node``
    with ``word_entry`` as the target; "zh-forms" templates go to
    ``extract_zh_forms_template``. All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_templates = {
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    }
    for template in level_node.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name in category_templates:
            clean_node(wxr, word_entry, template)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template)
def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read Chinese forms and the literal meaning from a "zh-forms" template.

    Named arguments "s", "s2", ... become forms tagged "Simplified-Chinese"
    and "t", "t2", ... forms tagged "Traditional-Chinese"; forms that are
    empty or equal to the page title are skipped. The "lit" argument sets
    ``base_data.literal_meaning``. Positional arguments are ignored.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = {"s": "Simplified-Chinese", "t": "Traditional-Chinese"}
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        form_arg = re.fullmatch(r"([st])\d*", arg_name)
        if form_arg is not None:
            form_data = Form(
                form=clean_node(wxr, None, arg_value),
                tags=[script_tags[form_arg.group(1)]],
            )
            if form_data.form not in ["", wxr.wtp.title]:
                base_data.forms.append(form_data)
        elif arg_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, arg_value)
def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed form given by a heading template to ``word_entry``.

    For each "заголовок"/"з" template in the node's content, the "ударение"
    argument is cleaned and cut at the first "(". The result is appended
    as a form tagged "stressed" unless it is empty or equals the page
    title.
    """
    heading_template_names = ("заголовок", "з")
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name not in heading_template_names:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stress_arg = heading_template.template_parameters.get("ударение", "")
        stressed = clean_node(wxr, None, stress_arg)
        paren_at = stressed.find("(")
        if paren_at != -1:
            stressed = stressed[:paren_at].strip()
        if stressed in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=stressed, tags=["stressed"]))
503 )