Coverage for src/wiktextract/extractor/ru/translation.py: 90%
63 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from mediawiki_langcodes import name_to_code
2from wikitextprocessor import HTMLNode, NodeKind, TemplateNode, WikiNode
4from ...page import clean_node
5from ...wxr_context import WiktextractContext
6from .models import Translation, WordEntry
7from .tags import translate_raw_tags
def extract_translations(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level3_node: WikiNode,
):
    """Collect translations from the translation section of a page.

    Walks the template nodes yielded by
    ``level3_node.find_child(NodeKind.TEMPLATE)`` and hands every
    ``перев-блок`` template to ``process_translate_block_template``, which
    appends the extracted data to ``word_entry.translations``. Templates
    with any other name are ignored.
    """
    for tmpl in level3_node.find_child(NodeKind.TEMPLATE):
        if tmpl.template_name != "перев-блок":
            continue
        process_translate_block_template(wxr, word_entry, tmpl)
def process_translate_block_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Extract translations from one ``перев-блок`` template.

    The template is expanded and re-parsed; each list item of the expanded
    tree becomes one ``Translation`` (``span`` children may additionally
    emit translations of their own through
    ``process_translate_list_span_tag``). The first positional template
    argument is used as the sense of every translation of the block.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:перев-блок
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    sense = clean_node(wxr, None, template_node.template_parameters.get(1, ""))

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        entry = Translation(word="", lang="", sense=sense)
        for child in item.children:
            if isinstance(child, str):
                # plain text between the nodes is kept as a raw tag once the
                # surrounding punctuation has been removed
                raw_tag = child.strip(" ():\n")
                if len(raw_tag) > 0:
                    entry.raw_tags.append(raw_tag)
                continue
            if not isinstance(child, WikiNode):
                continue

            if child.kind == NodeKind.LINK:
                entry.lang = clean_node(wxr, None, child)
            elif child.kind == NodeKind.HTML:
                if child.tag == "sub":
                    entry.lang_code = clean_node(wxr, None, child.children)
                elif child.tag == "sup":
                    # language index
                    title = child.attrs.get("title", "")
                    if len(title) > 0:
                        entry.raw_tags.append(title)
                elif child.tag == "span":
                    process_translate_list_span_tag(
                        wxr, word_entry, entry, child
                    )

        # a translation is only stored when both the word and the language
        # name were found in the list item
        if entry.word == "" or entry.lang == "":
            continue
        if entry.lang_code == "":
            # no <sub> code in the markup: derive it from the language name
            entry.lang_code = name_to_code(entry.lang, "ru")
        translate_raw_tags(entry)
        word_entry.translations.append(entry)
def process_translate_list_span_tag(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    translation: Translation,
    span_node: HTMLNode,
) -> None:
    """Fill ``translation`` from the children of a ``<span>`` node.

    Link children set the translated word, nested ``span``/``i`` HTML nodes
    and italic nodes are stored as raw tags, and text wrapped in
    parentheses is stored as the romanization.

    A text child ending with ``,`` or ``;`` separates several words inside
    the same span: the translation built so far is copied into
    ``word_entry.translations`` (when it has both a word and a language)
    and the per-word fields of ``translation`` are cleared for the next
    word. The last word of the span is left in ``translation``; the caller
    is expected to store it.
    """
    for node in span_node.children:
        if isinstance(node, WikiNode):
            if node.kind == NodeKind.LINK:
                translation.word = clean_node(wxr, None, node)
            elif isinstance(node, HTMLNode) and node.tag in ["span", "i"]:
                # gender tag
                tag = clean_node(wxr, None, node)
                if len(tag) > 0:
                    translation.raw_tags.append(tag)
            elif node.kind == NodeKind.ITALIC:
                translation.raw_tags.append(clean_node(wxr, None, node))
        elif isinstance(node, str):
            # convert escaped characters like "&nbsp;"
            text = clean_node(wxr, None, node)
            if text.endswith((",", ";")):
                # this list item has multiple translation words
                # ";" has to be stripped as well as ",", otherwise a
                # romanization followed by ";" (e.g. "(roman);") never
                # matches the parenthesis check below and is lost
                striped_text = text.strip(",;: ")
                if striped_text.startswith("(") and striped_text.endswith(")"):
                    translation.roman = striped_text.strip("()")
                if translation.word != "" and translation.lang != "":
                    if translation.lang_code == "":
                        translation.lang_code = name_to_code(
                            translation.lang, "ru"
                        )
                    translate_raw_tags(translation)
                    # store a deep copy: `translation` itself is reused for
                    # the next word and reset right below
                    word_entry.translations.append(
                        translation.model_copy(deep=True)
                    )
                # language, language code and sense are kept; only the
                # per-word data is cleared
                translation.word = ""
                translation.roman = ""
                translation.tags = []
                translation.raw_tags = []
            elif text.startswith("(") and text.endswith(")"):
                translation.roman = text.strip("()")