Coverage for src/wiktextract/extractor/ru/page.py: 74%
239 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Any
4from wikitextprocessor.parser import (
5 LEVEL_KIND_FLAGS,
6 LevelNode,
7 NodeKind,
8 TemplateNode,
9 WikiNode,
10)
12from ...config import POSSubtitleData
13from ...page import clean_node
14from ...wxr_context import WiktextractContext
15from ...wxr_logging import logger
16from .etymology import extract_etymology
17from .gloss import extract_gloss, process_meaning_template
18from .inflection import parse_adj_forms_table, parse_wikitext_forms_table
19from .linkage import (
20 extract_linkages,
21 extract_phrase_section,
22 process_related_block_template,
23)
24from .models import AltForm, Form, Sense, Sound, WordEntry
25from .pronunciation import (
26 extract_homophone_section,
27 extract_pronunciation_section,
28)
29from .section_titles import LINKAGE_TITLES, POS_TEMPLATE_NAMES, POS_TITLES
30from .tags import MORPHOLOGICAL_TEMPLATE_TAGS
31from .translation import extract_translations
def process_semantic_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    semantic_level_node: WikiNode,
):
    """Collect senses from "значение" templates of a semantic section.

    Every list that is a direct child of ``semantic_level_node`` is searched
    recursively for templates.  Each template named "значение" is handed to
    ``process_meaning_template``; the returned sense is appended to the last
    entry of ``page_data`` only when it carries at least one gloss.
    """
    for semantic_list in semantic_level_node.find_child(NodeKind.LIST):
        for tpl in semantic_list.find_child_recursively(NodeKind.TEMPLATE):
            if tpl.template_name != "значение":
                continue
            meaning = process_meaning_template(wxr, None, page_data[-1], tpl)
            if meaning.glosses:
                page_data[-1].senses.append(meaning)
# Maps values of the "тип" argument of the "morph" template (both the short
# and the long spellings) to the POS value stored for that morpheme kind.
MORPH_TEMPLATE_ARGS: dict[str, str] = {
    # prefix
    "p": "prefix",
    "prefix": "prefix",
    # interfix
    "i": "interfix",
    "interfix": "interfix",
    # infix
    "in": "infix",
    "infix": "infix",
    # suffix
    "s": "suffix",
    "suffix": "suffix",
    # transfix
    "t": "transfix",
    "transfix": "transfix",
    # postfix is stored as "suffix"
    "po": "suffix",
    "postfix": "suffix",
    # confix / circumfix
    "c": "circumfix",
    "confix": "circumfix",
    "circumfix": "circumfix",
    # root
    "r": "root",
    # ending is stored as "suffix"
    "e": "suffix",
    "ending": "suffix",
}
def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a template node.

    The lower-cased template name selects the lookup:

    * "morph": the "тип" argument is looked up in ``MORPH_TEMPLATE_ARGS``
      and, on a hit, returned together with the "morpheme" tag.
    * "заголовок" / "з" with a first positional argument: the cleaned
      argument has surrounding parentheses stripped and its first word is
      looked up in ``POS_TITLES``.  An argument without any word yields
      ``None`` immediately.
    * In every other case, and when the lookups above miss, the template
      name is split on whitespace and then on "-" and each piece is looked
      up in ``POS_TEMPLATE_NAMES``.

    Returns:
        The matching POS data, or ``None`` when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        if pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_text = clean_node(
            wxr, None, template_node.template_parameters[1]
        ).strip("()")
        # Split before testing for emptiness: text such as "( )" is not
        # empty after stripping the parentheses but contains no word, and
        # indexing the empty split result would raise IndexError.
        words = pos_text.split()
        if not words:
            return None
        first_word = words[0]
        if first_word in POS_TITLES:
            return POS_TITLES[first_word]

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None
def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a level node.

    Lookup order:

    1. templates that are direct children of ``level_node``;
    2. templates yielded by ``level_node.find_content``;
    3. the cleaned, lower-cased text of all children that are not level
       nodes, searched for any key of ``POS_TITLES`` as a substring.

    Returns:
        The first POS data found, or ``None`` when nothing matches.
    """
    # The POS template may sit among the children or in the level node
    # content, so both are checked, children first.
    for find_templates in (level_node.find_child, level_node.find_content):
        for template_node in find_templates(NodeKind.TEMPLATE):
            pos_data = get_pos_from_template(wxr, template_node)
            if pos_data is not None:
                return pos_data

    # Search for POS in section text; lower-case once instead of once per
    # candidate title.
    text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for pos_string, pos_data in POS_TITLES.items():
        if pos_string in text:
            return pos_data
    return None
def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, tags and forms of the last entry from a morphology section.

    The POS found by ``get_pos`` (if any) is stored on ``page_data[-1]``.
    Each template that is a direct child of ``level_node`` is then expanded:
    templates whose name starts with "прил" go to ``parse_adj_forms_table``,
    those starting with "сущ" or "гл" go to ``parse_wikitext_forms_table``.
    Comma-separated pieces of the expanded text that are keys of
    ``MORPHOLOGICAL_TEMPLATE_TAGS`` add tags, and the "степень" / "соотв"
    template arguments add comma-separated forms.
    """
    found_pos = get_pos(wxr, level_node)
    if found_pos is not None:
        page_data[-1].pos = found_pos["pos"]
        page_data[-1].tags.extend(found_pos.get("tags", []))

    # template argument name -> tag given to the forms listed in it
    form_param_tags = (
        ("степень", "comparative"),  # Шаблон:inflection/ru/adj
        ("соотв", "perfective"),  # Шаблон:Гл-блок
    )

    for template in level_node.find_child(NodeKind.TEMPLATE):
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        name = template.template_name
        if name.startswith("прил"):
            parse_adj_forms_table(wxr, page_data[-1], expanded)
        elif name.startswith(("сущ", "гл")):
            parse_wikitext_forms_table(wxr, page_data[-1], expanded)

        for expanded_child in expanded.children:
            child_text = clean_node(wxr, page_data[-1], expanded_child)
            for piece in child_text.split(","):
                mapped = MORPHOLOGICAL_TEMPLATE_TAGS.get(piece.strip())
                if isinstance(mapped, str):
                    page_data[-1].tags.append(mapped)
                elif isinstance(mapped, list):
                    page_data[-1].tags.extend(mapped)

        for param, tag in form_param_tags:
            if param not in template.template_parameters:
                continue
            forms_text = clean_node(
                wxr, None, template.template_parameters[param]
            )
            for raw_form in forms_text.split(","):
                form_text = raw_form.strip()
                if form_text:
                    page_data[-1].forms.append(
                        Form(form=form_text, tags=[tag])
                    )
def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The cleaned, lower-cased section title selects the extractor.  Several
    extractors only run when the matching ``wxr.config.capture_*`` flag is
    set; a title that matches no branch is reported through
    ``wxr.wtp.debug``.  After the dispatch every child level node is parsed
    recursively and ``extract_section_end_templates`` is run on this node.
    """
    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    # Sections that are recognised but intentionally not extracted.
    skipped_titles = ("библиография", "прочее")

    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)

    if section_title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif section_title in POS_TITLES:
        title_pos = POS_TITLES[section_title]
        page_data[-1].pos = title_pos["pos"]
        page_data[-1].tags.extend(title_pos.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif section_title in ("значение", "значения"):
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "родственные слова" and wxr.config.capture_linkages:
        # Word family
        for related_template in level_node.find_child(NodeKind.TEMPLATE):
            if related_template.template_name == "родств-блок":
                process_related_block_template(
                    wxr, page_data[-1], related_template
                )
    elif section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif section_title in phrase_titles and wxr.config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
    elif (
        section_title in translation_titles
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
    elif section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkages(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
    elif section_title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    elif section_title not in skipped_titles:
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for subsection in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, subsection)

    extract_section_end_templates(wxr, page_data[-1], level_node)
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse the wikitext of one page into a list of dumped word entries.

    Each level-1 node is one language section; its language code is the
    name of the first template in the heading content (stripped of spaces
    and dashes), or "unknown".  A base entry is built per language and
    copied for every level-2 node; level-3 nodes below it go through
    ``parse_section``.  Level nodes other than level 2 directly under the
    language node are parsed as well.  Entries left without senses receive
    a "no-gloss" sense before being dumped.

    Returns:
        ``model_dump(exclude_defaults=True)`` of every collected entry.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # Only the first heading template is consulted for the code.
        first_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = (
            ""
            if first_template is None
            else first_template.template_name.strip(" -")
        )
        if lang_code == "":
            lang_code = "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        level1_pos = get_pos(wxr, level1_node)
        if level1_pos is not None:
            base_data.pos = level1_pos["pos"]
            base_data.tags.extend(level1_pos.get("tags", []))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                level2_pos = get_pos(wxr, level2_node)
                if level2_pos is not None:
                    base_data.pos = level2_pos["pos"]
                    base_data.tags.extend(level2_pos.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True
            # Drop the copy when nothing was added to it.
            if page_data[-1] == base_data or not has_level3:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_levels = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for position, other_level_node in enumerate(other_levels):
            needs_entry = (
                len(page_data) == 0
                or page_data[-1].lang_code != base_data.lang_code
            )
            if position == 0 and needs_entry:
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_level_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates found outside of sub-level nodes.

    Children of ``level_node`` that are not level nodes are inspected.  A
    template whose name starts with "Форма-" is processed directly; any
    other wiki node is searched recursively for such templates.  Every hit
    is passed to ``process_form_template``.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            form_templates = [node]
        elif isinstance(node, WikiNode):
            # Lazy on purpose: templates are processed while the tree walk
            # is still in progress.
            form_templates = (
                nested
                for nested in node.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)
def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-…" template.

    POS data derived from the template is written to ``base_data`` itself.
    A deep copy of ``base_data`` then receives one sense per non-empty list
    item of the expanded template (marked "form-of" when the "база" or
    first positional argument is non-empty) and a sound from the "МФА"
    argument.  The copy is appended to ``page_data`` only when it has
    senses or sounds.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    template_pos = get_pos_from_template(wxr, template_node)
    if template_pos is not None:
        base_data.pos = template_pos["pos"]
        base_data.tags.extend(template_pos.get("tags", []))

    params = template_node.template_parameters
    form_of = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    current_data = base_data.model_copy(deep=True)

    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if form_of:
            sense.form_of.append(AltForm(word=form_of))
            sense.tags.append("form-of")
        current_data.senses.append(sense)

    if ipa:
        current_data.sounds.append(Sound(ipa=ipa))
    if current_data.senses or current_data.sounds:
        # The cleaned text is discarded; the call is made with the entry
        # as its data argument.
        clean_node(wxr, current_data, template_node)
        page_data.append(current_data)
def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add each non-empty link of the section as a "romanization" form.

    Only link nodes that are direct children of ``level_node`` are used.
    """
    for link_node in level_node.find_child(NodeKind.LINK):
        romanized = clean_node(wxr, None, link_node)
        if romanized == "":
            continue
        word_entry.forms.append(Form(form=romanized, tags=["romanization"]))
def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle category-link and "zh-forms" templates of a level node.

    Direct child templates with one of the listed names are passed to
    ``clean_node`` with ``word_entry`` as the data argument; "zh-forms"
    templates go to ``extract_zh_forms_template``.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = frozenset(
        (
            "-ание",
            "-атель",
            "-ация",
            "-ение",
            "-ка",
            "длина слова",
            "Категория",
            "Омонимы",
            "forms",
            "multilang",
        )
    )
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        name = template_node.template_name
        if name in category_template_names:
            clean_node(wxr, word_entry, template_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template_node)
def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    String-named arguments matching ``s<digits>`` add a "Simplified Chinese"
    form and those matching ``t<digits>`` a "Traditional Chinese" form,
    unless the cleaned value is empty or equals the page title.  The "lit"
    argument sets ``base_data.literal_meaning``.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_patterns = (
        (r"s\d*", "Simplified Chinese"),
        (r"t\d*", "Traditional Chinese"),
    )
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        for pattern, script_tag in script_patterns:
            if re.fullmatch(pattern, p_name):
                form_data = Form(
                    form=clean_node(wxr, None, p_value), tags=[script_tag]
                )
                if form_data.form not in ["", wxr.wtp.title]:
                    base_data.forms.append(form_data)
                break
        else:
            # Not a script argument.
            if p_name == "lit":
                base_data.literal_meaning = clean_node(wxr, None, p_value)
def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed form given by "заголовок" / "з" heading templates.

    The "ударение" argument is cleaned and, when it contains "(", cut
    before the first "(" and stripped.  The result is added as a "stressed"
    form unless it is empty or equal to the page title.
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ("заголовок", "з"):
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed_form = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        before_paren, paren, _ = stressed_form.partition("(")
        if paren:
            stressed_form = before_paren.strip()
        if stressed_form in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=stressed_form, tags=["stressed"]))