Coverage for src/wiktextract/extractor/ru/page.py: 76%
246 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
2from typing import Any
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...config import POSSubtitleData
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .etymology import extract_etymology
17from .gloss import extract_gloss, process_meaning_template
18from .inflection import (
19 extract_прил_ru_comparative_forms,
20 parse_html_forms_table,
21 parse_wikitext_forms_table,
22)
23from .linkage import (
24 extract_alt_form_section,
25 extract_linkage_section,
26 extract_phrase_section,
27)
28from .models import AltForm, Form, Hyphenation, Sense, Sound, WordEntry
29from .pronunciation import (
30 extract_homophone_section,
31 extract_pronunciation_section,
32 extract_rhyme_section,
33)
34from .section_titles import (
35 ALT_FORM_SECTIONS,
36 LINKAGE_TITLES,
37 POS_TEMPLATE_NAMES,
38 POS_TITLES,
39)
40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS
41from .translation import extract_translations
def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect senses from the "значение" templates of a semantic section.

    Every list directly under ``level_node`` is searched recursively for
    templates. Each template named "значение" is handed to
    ``process_meaning_template`` together with the newest entry in
    ``page_data``; the resulting sense is appended to that entry only when
    it has at least one gloss.
    """
    for bullet_list in level_node.find_child(NodeKind.LIST):
        for meaning_tpl in bullet_list.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if meaning_tpl.template_name != "значение":
                continue
            current_entry = page_data[-1]
            parsed_sense = process_meaning_template(
                wxr, None, current_entry, meaning_tpl
            )
            if parsed_sense.glosses:
                current_entry.senses.append(parsed_sense)
# Values accepted by the "тип" argument of the "morph" template, mapped to
# the morpheme kind that get_pos_from_template() reports as the POS.
# Each group lists every accepted spelling of one kind; postfixes and
# endings are both reported as "suffix".
MORPH_TEMPLATE_ARGS = {
    alias: morpheme_kind
    for morpheme_kind, aliases in (
        ("prefix", ("p", "prefix")),
        ("interfix", ("i", "interfix")),
        ("infix", ("in", "infix")),
        ("suffix", ("s", "suffix")),
        ("transfix", ("t", "transfix")),
        ("suffix", ("po", "postfix")),
        ("circumfix", ("c", "confix", "circumfix")),
        ("root", ("r",)),
        ("suffix", ("e", "ending")),
    )
    for alias in aliases
}
def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a single template, if possible.

    The lower-cased template name selects the strategy:

    * "morph": the "тип" argument is looked up in ``MORPH_TEMPLATE_ARGS``
      and the result is tagged "morpheme".
    * "заголовок" / "з" with a first positional argument: the first word
      of that argument (surrounding parentheses removed) is looked up in
      ``POS_TITLES``.
    * names starting with "прил ru": the "часть речи" argument is searched
      for any ``POS_TITLES`` key; without that argument the template
      counts as an adjective.
    * otherwise (or when the above found nothing): each space- and
      hyphen-separated piece of the name is looked up in
      ``POS_TEMPLATE_NAMES``.

    Args:
        wxr: Extraction context, used to render template arguments to text.
        template_node: The template to inspect.

    Returns:
        The matching POS data, or ``None`` when nothing matched.
    """
    template_name = template_node.template_name.lower()
    params = template_node.template_parameters
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        # Render the argument to text first: a raw argument may be a list
        # of nodes, which is unhashable and cannot be used as a dict key.
        pos_type = clean_node(wxr, None, params.get("тип", ""))
        if pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif template_name in {"заголовок", "з"} and 1 in params:
        words = clean_node(wxr, None, params[1]).strip("()").split()
        # Covers both an empty argument and one that is only whitespace
        # once the parentheses are stripped (e.g. "( )"); indexing the
        # first word unguarded would raise IndexError.
        if not words:
            return None
        if words[0] in POS_TITLES:
            return POS_TITLES[words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(wxr, None, params.get("часть речи", "")).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for pos_string, pos_data in POS_TITLES.items():
            if pos_string in pos_arg:
                return pos_data

    # Fall back to the pieces of the template name itself.
    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None
def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section.

    Templates that are direct children of ``level_node`` are tried first,
    then templates in the node's content. If no template yields a POS, the
    text of the node's non-heading children is rendered and searched for
    any ``POS_TITLES`` key. Returns ``None`` when nothing matches.
    """

    def candidate_templates():
        # Lazy on purpose: the content is only consulted once the direct
        # children are exhausted.
        yield from level_node.find_child(NodeKind.TEMPLATE)
        yield from level_node.find_content(NodeKind.TEMPLATE)

    for candidate in candidate_templates():
        found = get_pos_from_template(wxr, candidate)
        if found is not None:
            return found

    # The POS name may also appear as plain text in the section.
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    return next(
        (
            title_data
            for title, title_data in POS_TITLES.items()
            if title in section_text
        ),
        None,
    )
# Process a "morphological and syntactic properties" section for the newest
# entry in page_data (page_data[-1]).
#
# Steps visible below:
#   1. get_pos() is asked for POS data; when found, the entry's pos is set
#      and its tags are extended.
#   2. Each template directly under the section is re-serialised to wikitext,
#      parsed with expand_all=True, and passed through clean_node() with the
#      entry as target.
#   3. For templates whose name starts with one of the listed inflection
#      template prefixes, wikitext tables and HTML <table> tags in the
#      expansion are handed to the forms-table parsers.
#   4. The "слоги" argument, when non-empty, is split on "-" and stored as a
#      Hyphenation.
#   5. "прил ru…" templates additionally get comparative forms extracted.
#   6. The text of each top-level node of the expansion is split on commas and
#      every piece found in MORPHOLOGICAL_TEMPLATE_TAGS adds its tag(s).
#
# NOTE(review): this listing has lost its indentation, so whether steps 4-6
# sit inside or after the startswith() check of step 3 cannot be confirmed
# from here — check against the real source file before relying on it.
146def extract_morphological_section(
147 wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
148) -> None:
149 pos_data = get_pos(wxr, level_node)
150 if pos_data is not None: 150 ↛ 153line 150 didn't jump to line 153 because the condition on line 150 was always true
151 page_data[-1].pos = pos_data["pos"]
152 page_data[-1].tags.extend(pos_data.get("tags", []))
153 for child_node in level_node.find_child(NodeKind.TEMPLATE):
# Expand the template fully so tables and category links become visible.
154 expanded_template = wxr.wtp.parse(
155 wxr.wtp.node_to_wikitext(child_node), expand_all=True
156 )
157 clean_node(wxr, page_data[-1], expanded_template) # add category links
# Name prefixes of the templates whose expansion is searched for form tables.
158 if child_node.template_name.startswith(
159 (
160 "прил ru",
161 "прил-ru",
162 "сущ ",
163 "сущ-ru",
164 "гл ",
165 "мест ru ",
166 "числ ru ",
167 "числ-",
168 "прич ru ",
169 "Фам ru ",
170 "падежи ",
171 )
172 ):
173 for table_node in expanded_template.find_child_recursively(
174 NodeKind.TABLE
175 ):
176 parse_wikitext_forms_table(wxr, page_data[-1], table_node)
177 for table_tag in expanded_template.find_html("table"):
178 parse_html_forms_table(wxr, page_data[-1], table_tag)
# "слоги" holds the hyphen-separated syllable split of the word.
179 h_str = clean_node(
180 wxr, None, child_node.template_parameters.get("слоги", "")
181 )
182 if h_str != "":
183 page_data[-1].hyphenations.append(
184 Hyphenation(parts=h_str.split("-"))
185 )
187 if child_node.template_name.startswith("прил ru"):
188 extract_прил_ru_comparative_forms(
189 wxr, page_data[-1], expanded_template
190 )
# Turn comma-separated labels in the expanded text into tags.
192 for node in expanded_template.children:
193 node_text = clean_node(wxr, page_data[-1], node)
194 for text in node_text.split(","):
195 text = text.strip()
196 if text in MORPHOLOGICAL_TEMPLATE_TAGS:
197 tr_tag = MORPHOLOGICAL_TEMPLATE_TAGS[text]
# A mapping value is either a single tag or a list of tags.
198 if isinstance(tr_tag, str): 198 ↛ 200line 198 didn't jump to line 200 because the condition on line 198 was always true
199 page_data[-1].tags.append(tr_tag)
200 elif isinstance(tr_tag, list):
201 page_data[-1].tags.extend(tr_tag)
def _handle_section_by_title(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_title: str,
) -> None:
    """Run the extractor that matches a lower-cased section title.

    The checks are ordered; the first one that matches wins. Sections whose
    capture flag is switched off fall through to the later checks.
    """
    if section_title in (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    ):
        extract_morphological_section(wxr, page_data, level_node)
        return
    if section_title in POS_TITLES:
        title_pos = POS_TITLES[section_title]
        page_data[-1].pos = title_pos["pos"]
        page_data[-1].tags.extend(title_pos.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
        return
    if section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
        return
    if section_title in (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    ):
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
        return
    if (
        section_title
        in (
            "фразеологизмы и устойчивые сочетания",
            "типичные сочетания",
            "фразеологизмы",
            "пословицы и поговорки",
        )
        and wxr.config.capture_linkages
    ):
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
        return
    if (
        section_title in ("перевод", "иноязычные аналоги")
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
        return
    if section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
        return
    if section_title in ("библиография", "прочее"):
        # Deliberately ignored sections.
        return
    if section_title in ("латиница (latinça)", "латиница (latinca)"):
        parse_roman_section(wxr, page_data[-1], level_node)
        return
    if section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
        return
    if section_title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[section_title]
        )
        return
    if section_title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
        return
    if section_title in ("см. также", "смотреть также", "смотрите также"):
        # "See also" sections are skipped without a debug message.
        return
    wxr.wtp.debug(
        f"Unprocessed section {section_title}",
        sortid="wixtextract/extractor/ru/page/parse_section/66",
    )


def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Extract one section and, recursively, all of its sub-sections.

    The heading is rendered to lower-cased text and registered as the
    current subsection, then the extractor matching that title is run.
    Afterwards every nested level node is parsed the same way, and finally
    the section's trailing templates are processed for the newest entry.
    """
    heading = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(heading)
    _handle_section_by_title(wxr, page_data, level_node, heading)

    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, subsection)

    extract_section_end_templates(wxr, page_data[-1], level_node)
# Parse the wikitext of one page into a list of plain dicts, one per word
# entry (each produced by WordEntry.model_dump(exclude_defaults=True)).
#
# Parameters: wxr is the extraction context, page_title the page name (also
# stored as wxr.config.word and used as each entry's word), page_text the raw
# wikitext.
#
# Outline of what the code below does:
#   - Each level-1 section is one language. Its code is the name of the first
#     template in the heading with surrounding spaces and hyphens stripped,
#     or "unknown" when the heading has no template.
#   - Languages not listed in wxr.config.capture_language_codes (when that is
#     not None) are skipped.
#   - A base WordEntry is built per language; deep copies of it are appended
#     to page_data for level-2 sections and filled in by parse_section().
#   - Copies still equal to the base entry are popped again, and
#     extract_low_quality_page() looks for "Форма-" templates.
#   - Entries that end up without senses get a single Sense tagged "no-gloss".
#
# NOTE(review): this listing has lost its indentation, so the exact nesting of
# the pop()/extract_low_quality_page() statements relative to the preceding
# `if` lines cannot be confirmed from here — check the real source file.
281def parse_page(
282 wxr: WiktextractContext, page_title: str, page_text: str
283) -> list[dict[str, Any]]:
284 # Help site describing page structure:
285 # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей
287 if wxr.config.verbose: 287 ↛ 288line 287 didn't jump to line 288 because the condition on line 287 was never true
288 logger.info(f"Parsing page: {page_title}")
289 wxr.config.word = page_title
290 wxr.wtp.start_page(page_title)
291 tree = wxr.wtp.parse(page_text)
292 page_data: list[WordEntry] = []
294 for level1_node in tree.find_child(NodeKind.LEVEL1):
# Only the first template of the heading is consulted for the language code.
295 lang_code = ""
296 for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE): 296 ↛ 299line 296 didn't jump to line 299 because the loop on line 296 didn't complete
297 lang_code = subtitle_template.template_name.strip(" -")
298 break
299 if lang_code == "": 299 ↛ 300line 299 didn't jump to line 300 because the condition on line 299 was never true
300 lang_code = "unknown"
301 if ( 301 ↛ 305line 301 didn't jump to line 305 because the condition on line 301 was never true
302 wxr.config.capture_language_codes is not None
303 and lang_code not in wxr.config.capture_language_codes
304 ):
305 continue
# Categories found while rendering the heading are collected in a scratch
# dict and copied onto the base entry below.
306 categories = {"categories": []}
307 lang_name = clean_node(wxr, categories, level1_node.largs)
308 wxr.wtp.start_section(lang_name)
309 base_data = WordEntry(
310 lang=lang_name,
311 lang_code=lang_code,
312 word=page_title,
313 pos="unknown",
314 )
315 base_data.categories.extend(categories["categories"])
316 extract_section_end_templates(wxr, base_data, level1_node)
317 pos_data = get_pos(wxr, level1_node)
318 if pos_data is not None:
319 base_data.pos = pos_data["pos"]
320 base_data.tags.extend(pos_data.get("tags", []))
322 for level2_node in level1_node.find_child(NodeKind.LEVEL2):
# Retry POS detection on the level-2 node while the base entry has none.
323 if base_data.pos == "unknown":
324 pos_data = get_pos(wxr, level2_node)
325 if pos_data is not None:
326 base_data.pos = pos_data["pos"]
327 base_data.tags.extend(pos_data.get("tags", []))
328 page_data.append(base_data.model_copy(deep=True))
329 extract_level2_node_contents(wxr, page_data[-1], level2_node)
330 has_level3 = False
331 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
332 parse_section(wxr, page_data, level3_node)
333 has_level3 = True
# Drop the copy when nothing was added to it or there were no level-3 sections.
334 if page_data[-1] == base_data or not has_level3: 334 ↛ 335line 334 didn't jump to line 335 because the condition on line 334 was never true
335 page_data.pop()
336 extract_low_quality_page(wxr, page_data, base_data, level2_node)
# Sections of any level other than 2 that sit directly under the language
# heading; an entry is started for the first one if this language has none.
338 for any_level_index, any_level_node in enumerate(
339 level1_node.find_child(LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2)
340 ):
341 if any_level_index == 0 and (
342 len(page_data) == 0
343 or page_data[-1].lang_code != base_data.lang_code
344 ):
345 page_data.append(base_data.model_copy(deep=True))
346 parse_section(wxr, page_data, any_level_node)
348 if len(page_data) > 0 and page_data[-1] == base_data: 348 ↛ 349line 348 didn't jump to line 349 because the condition on line 348 was never true
349 page_data.pop()
350 extract_low_quality_page(wxr, page_data, base_data, level1_node)
# Every returned entry carries at least one sense.
352 for d in page_data:
353 if len(d.senses) == 0: 353 ↛ 354line 353 didn't jump to line 354 because the condition on line 353 was never true
354 d.senses.append(Sense(tags=["no-gloss"]))
355 return [d.model_dump(exclude_defaults=True) for d in page_data]
def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-…" templates found outside of sub-sections.

    Only the non-heading children of ``level_node`` are inspected. A child
    that is itself a "Форма-…" template is processed directly; any other
    wiki node is searched recursively for such templates.
    """
    form_prefix = "Форма-"
    for child in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            form_prefix
        ):
            matches = (child,)
        elif isinstance(child, WikiNode):
            # Kept lazy so each template is processed as soon as it is found.
            matches = (
                nested
                for nested in child.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in matches:
            process_form_template(wxr, page_data, base_data, form_template)
def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-…" template.

    POS data derived from the template is written to ``base_data`` itself.
    A deep copy of ``base_data`` then receives one sense per non-empty list
    item of the expanded template, each marked as a form of the "база"
    argument (or the first positional argument), plus the "МФА" argument
    as an IPA sound. The copy is appended to ``page_data`` only when it has
    at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    template_pos = get_pos_from_template(wxr, template_node)
    if template_pos is not None:
        base_data.pos = template_pos["pos"]
        base_data.tags.extend(template_pos.get("tags", []))

    args = template_node.template_parameters
    lemma = clean_node(wxr, None, args.get("база", args.get(1, "")))
    ipa_text = clean_node(wxr, None, args.get("МФА", ""))
    expanded_tree = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    form_entry = base_data.model_copy(deep=True)
    for item in expanded_tree.find_child_recursively(NodeKind.LIST_ITEM):
        gloss = clean_node(wxr, None, item.children)
        if not gloss:
            continue
        form_sense = Sense(glosses=[gloss])
        if lemma:
            form_sense.form_of.append(AltForm(word=lemma))
            form_sense.tags.append("form-of")
        form_entry.senses.append(form_sense)

    if ipa_text:
        form_entry.sounds.append(Sound(ipa=ipa_text))
    if not (form_entry.senses or form_entry.sounds):
        return
    # Render the template once more with the entry as target so that its
    # category links land on the new entry.
    clean_node(wxr, form_entry, template_node)
    page_data.append(form_entry)
def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Record each link directly under the section as a romanization form.

    Links whose rendered text is empty are skipped.
    """
    link_texts = (
        clean_node(wxr, None, link)
        for link in level_node.find_child(NodeKind.LINK)
    )
    word_entry.forms.extend(
        Form(form=romanized, tags=["romanization"])
        for romanized in link_texts
        if romanized != ""
    )
def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Process the categorisation templates placed directly in a section.

    Templates from the fixed list below are rendered with ``word_entry`` as
    the target; a "zh-forms" template is passed to
    ``extract_zh_forms_template`` instead. All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = frozenset(
        (
            "-ание",
            "-атель",
            "-ация",
            "-ение",
            "-ка",
            "длина слова",
            "Категория",
            "Омонимы",
            "forms",
            "multilang",
        )
    )
    for end_template in level_node.find_child(NodeKind.TEMPLATE):
        name = end_template.template_name
        if name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, end_template)
        elif name in category_template_names:
            clean_node(wxr, word_entry, end_template)
def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    Only named arguments are considered: "s", "s2", … add a form tagged
    "Simplified-Chinese", "t", "t2", … add one tagged
    "Traditional-Chinese", and "lit" sets the literal meaning. A form that
    is empty or equal to the page title is not added.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_patterns = (
        (r"s\d*", "Simplified-Chinese"),
        (r"t\d*", "Traditional-Chinese"),
    )
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        if arg_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, arg_value)
            continue
        for pattern, script_tag in script_patterns:
            if not re.fullmatch(pattern, arg_name):
                continue
            variant = Form(
                form=clean_node(wxr, None, arg_value), tags=[script_tag]
            )
            if variant.form not in ("", wxr.wtp.title):
                base_data.forms.append(variant)
            break
def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling given in a level-2 heading template.

    For each "заголовок" / "з" template in the heading content, the
    "ударение" argument is rendered, cut at the first "(" if there is one,
    and added as a form tagged "stressed" unless it is empty or equal to
    the page title.
    """
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name not in ["заголовок", "з"]:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed = clean_node(
            wxr, None, heading_template.template_parameters.get("ударение", "")
        )
        before_paren, paren, _ = stressed.partition("(")
        if paren:
            # Keep only the text in front of the parenthesised part.
            stressed = before_paren.strip()
        if stressed in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=stressed, tags=["stressed"]))