Coverage for src / wiktextract / extractor / de / translation.py: 87%
100 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
1import re
3from mediawiki_langcodes import code_to_name, name_to_code
4from wikitextprocessor import NodeKind, WikiNode
5from wikitextprocessor.parser import TemplateNode
7from ...page import clean_node
8from ...wxr_context import WiktextractContext
9from .models import Translation, WordEntry
10from .tags import translate_raw_tags
def extract_translation(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Process every "Ü-Tabelle" template found among *level_node*'s children.

    Each matching template is handed to ``process_u_tabelle_template``, which
    appends the translations it finds to ``word_entry.translations``.
    Templates with any other name are ignored.
    """
    for tpl in level_node.find_child(NodeKind.TEMPLATE):
        if tpl.template_name != "Ü-Tabelle":
            continue
        process_u_tabelle_template(wxr, word_entry, tpl)
def process_u_tabelle_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract translations from one "Ü-Tabelle" template.

    Argument ``1`` is used as the sense index and argument ``G`` as the sense
    text.  The "Ü-Liste" and "Dialekttabelle" arguments, when present, are
    re-parsed as wikitext and each list item inside them is processed as one
    translation line.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Ü-Tabelle
    params = template_node.template_parameters
    sense_idx = clean_node(wxr, None, params.get(1, ""))
    sense = clean_node(wxr, None, params.get("G", ""))
    for arg_name in ("Ü-Liste", "Dialekttabelle"):
        arg_value = params.get(arg_name)
        if arg_value is not None:
            # The argument value is turned back into wikitext and parsed
            # again so that its list items can be searched for.
            wikitext = wxr.wtp.node_to_wikitext(arg_value)
            parsed = wxr.wtp.parse(wikitext)
            for item in parsed.find_child_recursively(NodeKind.LIST_ITEM):
                process_u_tabelle_list_item(
                    wxr, word_entry, item, sense, sense_idx
                )
def process_u_tabelle_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    sense: str,
    sense_idx: str,
) -> None:
    """Extract the translations contained in one "Ü-Tabelle" list item.

    The children of *list_item_node* are walked in order:

    * Until a text node containing ":" is seen, the first non-empty child
      (or the text in front of the ":") is taken as the language.
    * "Ü…" templates supply translated words; consecutive ones in the same
      language are joined with spaces into one multi-word translation.
    * Other templates and italic nodes are recorded as raw tags.
    * A "," or ";" text node finishes the current translation.

    Finished translations are appended to ``word_entry.translations`` through
    ``append_tr_data``.

    Args:
        wxr: Extraction context passed on to ``clean_node``.
        word_entry: Entry that receives the translations.
        list_item_node: The list item to read.
        sense: Sense text stored on every translation of this item.
        sense_idx: Sense index stored on every translation of this item.
    """
    before_colon = True
    tr_data = Translation(sense=sense, sense_index=sense_idx)
    for node in list_item_node.children:
        if isinstance(node, str):
            node = node.strip()
            if not node:
                continue
            if ":" in node:
                lang_str = node.partition(":")[0].strip()
                if lang_str and not tr_data.lang:
                    tr_data.lang = lang_str
                    if not tr_data.lang_code:
                        tr_data.lang_code = name_to_code(lang_str, "de")
                before_colon = False
            elif node in (",", ";") and tr_data.word:
                tr_data = append_tr_data(word_entry, tr_data)
            elif not before_colon and tr_data.word:
                # Plain text between {{Ü}} templates of the same translation
                # e.g. {{Ü|fr|temps}} de {{Ü|fr|travail}} → "temps de travail"
                tr_data.word += " " + node

        if before_colon and not tr_data.lang:
            tr_data.lang = clean_node(wxr, None, node)
            if isinstance(node, TemplateNode):
                # The language is given by a template; its lower-cased name
                # is used as the language code.
                tr_data.lang_code = node.template_name.lower()
            else:
                # Look the code up from the language *name* just extracted.
                # (Previously the still-empty code itself was passed here,
                # so the lookup could never succeed.)
                tr_data.lang_code = name_to_code(tr_data.lang, "de")
        elif isinstance(node, TemplateNode):
            if node.template_name.startswith("Ü"):
                u_lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                if tr_data.word and u_lang_code != tr_data.lang_code:
                    # Different language → save current, start new
                    tr_data = append_tr_data(word_entry, tr_data)
                    process_u_template(wxr, tr_data, node)
                elif tr_data.word:
                    # Same language → append word to form multi-word
                    # translation
                    # e.g. {{Ü|fr|temps}} de {{Ü|fr|travail}}
                    # → "temps de travail"
                    new_word = clean_node(
                        wxr, None, node.template_parameters.get(2, "")
                    )
                    tr_data.word += " " + new_word
                else:
                    process_u_template(wxr, tr_data, node)
            else:
                raw_tag = clean_node(wxr, None, node).strip(": \n")
                if raw_tag != "":
                    tr_data.raw_tags.append(raw_tag)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag_str = clean_node(wxr, None, node).removesuffix(":")
            # Skip empty pieces produced by the split.
            tr_data.raw_tags.extend(filter(None, raw_tag_str.split(", ")))

    # Flush the last translation of the list item, if any.
    if tr_data.word:
        append_tr_data(word_entry, tr_data)
def append_tr_data(word_entry: WordEntry, tr_data: Translation) -> Translation:
    """Store *tr_data* on *word_entry* and return a fresh follow-up object.

    ``translate_raw_tags`` is applied to *tr_data* first, then a deep copy of
    it is appended to ``word_entry.translations``.  The returned
    ``Translation`` carries over only the sense, sense index, language and
    language code, so the caller can keep collecting the next translation of
    the same language.
    """
    translate_raw_tags(tr_data)
    snapshot = tr_data.model_copy(deep=True)
    word_entry.translations.append(snapshot)
    carried_over = {
        "sense": tr_data.sense,
        "sense_index": tr_data.sense_index,
        "lang": tr_data.lang,
        "lang_code": tr_data.lang_code,
    }
    return Translation(**carried_over)
def process_u_template(
    wxr: WiktextractContext, tr_data: Translation, u_template: TemplateNode
) -> None:
    """Fill *tr_data* from one template of the "Ü" family.

    Handled template names are "Ü", "Üt" and "Üxx4", each optionally followed
    by "?" (which sets ``tr_data.uncertain``).

    * ``lang_code`` is taken from argument 1 when not already set, and
      ``lang`` is then looked up from that code when not already set.
    * ``word`` is argument 2, replaced by the template's display argument
      when that one is non-empty (3 for "Ü", 4 for "Üt", and for "Üxx4"
      argument 3 if present, otherwise "v").
    * ``roman`` comes from argument 3 of "Üt" (or from the expanded template
      text when argument 3 is absent) and from argument "d" of "Üxx4".  For
      "Üt" with language code "ja" or "ko" and a comma in argument 3, the
      part before the comma goes to ``other`` and the rest to ``roman``.

    Args:
        wxr: Extraction context passed on to ``clean_node``.
        tr_data: Translation object updated in place.
        u_template: The template node to read.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Ü
    # also "Ü?", "Üt", "Üt?", "Üxx4", "Üxx4?"
    params = u_template.template_parameters
    if not tr_data.lang_code:
        tr_data.lang_code = clean_node(wxr, None, params.get(1, ""))
    if not tr_data.lang:
        # Pass the language code, not the Translation object itself.
        tr_data.lang = code_to_name(tr_data.lang_code, "de")

    tr_data.word = clean_node(wxr, None, params.get(2, ""))
    template_name = u_template.template_name
    tr_data.uncertain = template_name.endswith("?")
    template_name = template_name.removesuffix("?")
    # -1 is never a real argument key, so unknown template names end up
    # with no display argument.
    display_arg = -1
    if template_name == "Ü":
        display_arg = 3
    elif template_name == "Üt":
        display_arg = 4
        if 3 in params:
            arg_value = clean_node(wxr, None, params.get(3, ""))
            if tr_data.lang_code in ("ja", "ko") and "," in arg_value:
                other, _, roman = arg_value.partition(",")
                tr_data.other = other.strip()
                tr_data.roman = roman.strip()
            else:
                tr_data.roman = arg_value
        else:
            # this template could create roman without the third arg
            expanded_text = clean_node(wxr, None, u_template)
            m = re.search(r"\(([^)]+?)\^\☆\)", expanded_text)
            if m is not None:
                tr_data.roman = m.group(1)
    elif template_name == "Üxx4":
        display_arg = "v"
        if 3 in params:
            display_arg = 3
        tr_data.roman = clean_node(wxr, None, params.get("d", ""))

    tr_word = clean_node(wxr, None, params.get(display_arg, ""))
    if tr_word:
        tr_data.word = tr_word