Coverage for src/wiktextract/extractor/ru/page.py: 76%
244 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
2from typing import Any
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...config import POSSubtitleData
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .etymology import extract_etymology
17from .gloss import extract_gloss, process_meaning_template
18from .inflection import (
19 extract_прил_ru_comparative_forms,
20 parse_html_forms_table,
21 parse_wikitext_forms_table,
22)
23from .linkage import (
24 extract_alt_form_section,
25 extract_linkage_section,
26 extract_phrase_section,
27)
28from .models import AltForm, Form, Sense, Sound, WordEntry
29from .pronunciation import (
30 extract_homophone_section,
31 extract_pronunciation_section,
32 extract_rhyme_section,
33)
34from .section_titles import (
35 ALT_FORM_SECTIONS,
36 LINKAGE_TITLES,
37 POS_TEMPLATE_NAMES,
38 POS_TITLES,
39)
40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS
41from .translation import extract_translations
def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect senses from "значение" templates inside the section's lists.

    Every template named "значение" found recursively under a list child
    of ``level_node`` is handed to ``process_meaning_template``.  The
    returned sense is appended to the last entry of ``page_data`` only when
    it carries at least one gloss.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for meaning_template in list_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if meaning_template.template_name != "значение":
                continue
            entry = page_data[-1]
            meaning = process_meaning_template(
                wxr, None, entry, meaning_template
            )
            if meaning.glosses:
                entry.senses.append(meaning)
# Maps the accepted spellings of the "morph" template's "тип" argument to the
# POS value stored for the morpheme.  Each row lists one POS value followed by
# the spellings that select it; rows are expanded in order, so the resulting
# dict keeps the same key order as a hand-written literal would.
MORPH_TEMPLATE_ARGS = {
    alias: morpheme_pos
    for morpheme_pos, aliases in (
        ("prefix", ("p", "prefix")),
        ("interfix", ("i", "interfix")),
        ("infix", ("in", "infix")),
        ("suffix", ("s", "suffix")),
        ("transfix", ("t", "transfix")),
        ("suffix", ("po", "postfix")),
        ("circumfix", ("c", "confix", "circumfix")),
        ("root", ("r",)),
        ("suffix", ("e", "ending")),
    )
    for alias in aliases
}
def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a template's name and arguments.

    The lower-cased template name is checked in this order:

    1. "morph": the "тип" argument is looked up in ``MORPH_TEMPLATE_ARGS``
       and, when found, returned with the "morpheme" tag.
    2. "заголовок" / "з" with a first positional argument: the first word
       of the cleaned argument (surrounding parentheses stripped) is looked
       up in ``POS_TITLES``.  An empty argument returns ``None`` at once.
    3. Names starting with "прил ru": the "часть речи" argument is searched
       for any ``POS_TITLES`` key; without that argument the POS is "adj".
    4. Finally the name is split on whitespace and then on "-", and each
       piece is looked up in ``POS_TEMPLATE_NAMES``.

    A miss in steps 1-3 falls through to the later steps.  Returns ``None``
    when nothing matches.
    """
    name = template_node.template_name.lower()
    params = template_node.template_parameters

    if name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        morpheme_type = params.get("тип", "")
        if morpheme_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[morpheme_type],
                "tags": ["morpheme"],
            }
    elif name in {"заголовок", "з"} and 1 in params:
        header_text = clean_node(wxr, None, params[1]).strip("()")
        if header_text == "":
            return None
        first_word = header_text.split()[0]
        if first_word in POS_TITLES:
            return POS_TITLES[first_word]

    if name.startswith("прил ru"):
        pos_arg = clean_node(wxr, None, params.get("часть речи", "")).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for title, title_data in POS_TITLES.items():
            if title in pos_arg:
                return title_data

    # Search for POS in template names
    for word in name.split(maxsplit=2):
        for piece in word.split("-", maxsplit=2):
            if piece in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[piece]
    return None
def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section.

    Templates returned by ``level_node.find_child`` are tried first, then
    those returned by ``level_node.find_content``; the first template for
    which ``get_pos_from_template`` yields a result wins.  If no template
    matches, the cleaned, lower-cased text of the nodes yielded by
    ``invert_find_child(LEVEL_KIND_FLAGS)`` is searched for any key of
    ``POS_TITLES``.  Returns ``None`` when nothing is found.
    """

    def first_template_pos(template_nodes) -> POSSubtitleData | None:
        # First non-None result among the given templates, if any.
        for template_node in template_nodes:
            found = get_pos_from_template(wxr, template_node)
            if found is not None:
                return found
        return None

    pos_data = first_template_pos(level_node.find_child(NodeKind.TEMPLATE))
    if pos_data is not None:
        return pos_data
    # POS text could also be in the level node content
    pos_data = first_template_pos(level_node.find_content(NodeKind.TEMPLATE))
    if pos_data is not None:
        return pos_data

    # Search for POS in section text
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for title, title_data in POS_TITLES.items():
        if title in section_text:
            return title_data
    return None
def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, forms, hyphenation and tags of the last entry in ``page_data``.

    The POS found by ``get_pos`` (if any) is stored on the entry.  Each
    template that is a direct child of ``level_node`` is then expanded and:

    * cleaned against the entry so category links are recorded;
    * if its name starts with one of the inflection-template prefixes, its
      wikitext and HTML tables are parsed into forms and the "слоги"
      argument is stored as the entry's hyphenation;
    * if its name starts with "прил ru", comparative forms are extracted;
    * the comma-separated text of every expanded child node is matched
      against ``MORPHOLOGICAL_TEMPLATE_TAGS`` to add tags.
    """
    # Name prefixes of templates whose expansion carries an inflection table.
    forms_template_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        entry = page_data[-1]
        entry.pos = pos_data["pos"]
        entry.tags.extend(pos_data.get("tags", []))

    for template in level_node.find_child(NodeKind.TEMPLATE):
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        entry = page_data[-1]
        clean_node(wxr, entry, expanded)  # add category links
        template_name = template.template_name

        if template_name.startswith(forms_template_prefixes):
            for table_node in expanded.find_child_recursively(NodeKind.TABLE):
                parse_wikitext_forms_table(wxr, entry, table_node)
            for table_tag in expanded.find_html("table"):
                parse_html_forms_table(wxr, entry, table_tag)
            entry.hyphenation = clean_node(
                wxr, None, template.template_parameters.get("слоги", "")
            )

        if template_name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(wxr, entry, expanded)

        for node in expanded.children:
            for piece in clean_node(wxr, entry, node).split(","):
                mapped = MORPHOLOGICAL_TEMPLATE_TAGS.get(piece.strip())
                # a mapping value is either one tag or a list of tags
                if isinstance(mapped, str):
                    entry.tags.append(mapped)
                elif isinstance(mapped, list):
                    entry.tags.extend(mapped)
def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Extract one section, then recurse into its sub-sections.

    The cleaned, lower-cased heading decides which extractor runs.  After
    that every child level node is parsed recursively, and finally the
    section's trailing templates are processed against the last entry of
    ``page_data``.
    """
    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    _dispatch_section(wxr, page_data, level_node, section_title)

    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, child_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)


def _dispatch_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_title: str,
) -> None:
    """Run the extractor matching ``section_title`` (already lower-cased).

    Checks are made in a fixed order and the first match wins.  A title
    whose extractor is switched off in ``wxr.config`` is not consumed by
    that check and continues down the list, so it may end up reported as
    unprocessed.
    """
    if section_title in (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    ):
        extract_morphological_section(wxr, page_data, level_node)
        return

    if section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
        return

    if section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
        return

    if section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
        return

    if section_title in (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    ):
        extract_gloss(wxr, page_data[-1], level_node)
        return

    if section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
        return

    if (
        section_title
        in (
            "фразеологизмы и устойчивые сочетания",
            "типичные сочетания",
            "фразеологизмы",
            "пословицы и поговорки",
        )
        and wxr.config.capture_linkages
    ):
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
        return

    if (
        section_title in ("перевод", "иноязычные аналоги")
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
        return

    if section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
        return

    if section_title == "библиография":
        return  # deliberately ignored

    if section_title in ("латиница (latinça)", "латиница (latinca)"):
        parse_roman_section(wxr, page_data[-1], level_node)
        return

    if section_title == "прочее":
        return  # deliberately ignored

    if section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
        return

    if section_title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[section_title]
        )
        return

    if section_title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
        return

    # "see also" headings are skipped silently; anything else is reported
    if section_title not in ("см. также", "смотреть также", "смотрите также"):
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-1 node is one language section.  Its language code is the
    name of the first template in the heading (stripped of spaces and
    hyphens), or "unknown".  Languages excluded by
    ``wxr.config.capture_language_codes`` are skipped.  For every level-2
    node a copy of the language's base entry is appended and filled from
    its level-3 sections; the copy is dropped again if it stayed equal to
    the base entry or there were no level-3 sections.  Other level nodes
    directly under the language node are parsed into the current entry.
    Entries left without senses receive a single "no-gloss" sense.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        heading_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = "unknown"
        if heading_template is not None:
            lang_code = heading_template.template_name.strip(" -") or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        heading_categories = {"categories": []}
        lang_name = clean_node(wxr, heading_categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(heading_categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        pos_data = get_pos(wxr, level1_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)

            level3_count = 0
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                level3_count += 1
            # discard the copy when nothing was extracted into it
            if page_data[-1] == base_data or level3_count == 0:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_level_nodes = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for position, other_level_node in enumerate(other_level_nodes):
            needs_new_entry = (
                len(page_data) == 0
                or page_data[-1].lang_code != base_data.lang_code
            )
            if position == 0 and needs_new_entry:
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_level_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates among the nodes outside sub-sections.

    Iterates the nodes yielded by
    ``level_node.invert_find_child(LEVEL_KIND_FLAGS)``.  A template whose
    name starts with "Форма-" is processed directly; for any other wiki
    node, every such template found recursively beneath it is processed.
    Non-node items are ignored.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            form_templates = (node,)
        elif isinstance(node, WikiNode):
            form_templates = (
                nested
                for nested in node.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)
def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-" template.

    The POS derived from the template (if any) is written to ``base_data``
    itself.  A deep copy of ``base_data`` then receives one sense per
    non-empty list item of the expanded template, each linked to the base
    word taken from the "база" argument (or the first positional argument),
    and a sound from the "МФА" argument.  The copy is appended to
    ``page_data`` only if it ended up with a sense or a sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    base_word = clean_node(
        wxr, None, params.get("база", params.get(1, ""))
    )
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )

    entry = base_data.model_copy(deep=True)
    for list_item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        gloss = clean_node(wxr, None, list_item.children)
        if gloss == "":
            continue
        sense = Sense(glosses=[gloss])
        if base_word != "":
            sense.form_of.append(AltForm(word=base_word))
            sense.tags.append("form-of")
        entry.senses.append(sense)

    if ipa != "":
        entry.sounds.append(Sound(ipa=ipa))
    if entry.senses or entry.sounds:
        # cleaning against the entry records the template's category links
        clean_node(wxr, entry, template_node)
        page_data.append(entry)
def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add each non-empty link text of the section as a "romanization" form.

    Only link nodes that are direct children of ``level_node`` are used.
    """
    link_texts = (
        clean_node(wxr, None, link_node)
        for link_node in level_node.find_child(NodeKind.LINK)
    )
    word_entry.forms.extend(
        Form(form=link_text, tags=["romanization"])
        for link_text in link_texts
        if link_text != ""
    )
def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle categorisation and "zh-forms" templates directly under a section.

    Templates whose name is in the categorisation list are cleaned against
    ``word_entry``; a "zh-forms" template is passed to
    ``extract_zh_forms_template``.  All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = {
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    }
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        name = template_node.template_name
        if name in category_template_names:
            clean_node(wxr, word_entry, template_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template_node)
def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read Chinese script variants and literal meaning from "zh-forms".

    String-named arguments matching ``s<digits>`` or ``t<digits>`` become
    forms tagged "Simplified Chinese" or "Traditional Chinese", unless the
    cleaned value is empty or equals the page title.  The "lit" argument is
    stored as the entry's literal meaning.  Positional (non-string) argument
    names are skipped.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_patterns = (
        (r"s\d*", "Simplified Chinese"),
        (r"t\d*", "Traditional Chinese"),
    )
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        for pattern, script_tag in script_patterns:
            if re.fullmatch(pattern, p_name):
                form_data = Form(
                    form=clean_node(wxr, None, p_value), tags=[script_tag]
                )
                if form_data.form not in ["", wxr.wtp.title]:
                    base_data.forms.append(form_data)
                break
        else:
            # reached only when the name is not a script-variant argument
            if p_name == "lit":
                base_data.literal_meaning = clean_node(wxr, None, p_value)
def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling from a heading template as a form.

    For each "заголовок" / "з" template returned by
    ``level_node.find_content``, the cleaned "ударение" argument is cut at
    its first "(" (if any) and added to ``word_entry`` as a "stressed"
    form, unless it is empty or identical to the page title.
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ["заголовок", "з"]:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed_form = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        before_paren, paren, _ = stressed_form.partition("(")
        if paren:
            # keep only the text in front of the parenthesised part
            stressed_form = before_paren.strip()
        if stressed_form not in ["", wxr.wtp.title]:
            word_entry.forms.append(
                Form(form=stressed_form, tags=["stressed"])
            )
493 )