Coverage for src / wiktextract / extractor / de / translation.py: 87%

100 statements  

« prev     ^ index     » next       coverage.py v7.14.0, created at 2026-05-11 04:48 +0000

1import re 

2 

3from mediawiki_langcodes import code_to_name, name_to_code 

4from wikitextprocessor import NodeKind, WikiNode 

5from wikitextprocessor.parser import TemplateNode 

6 

7from ...page import clean_node 

8from ...wxr_context import WiktextractContext 

9from .models import Translation, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_translation(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Collect translations from the "Ü-Tabelle" templates of a section.

    Every template node yielded by ``level_node.find_child`` whose name is
    "Ü-Tabelle" is handed to ``process_u_tabelle_template``; templates with
    any other name are skipped.
    """
    for child_template in level_node.find_child(NodeKind.TEMPLATE):
        if child_template.template_name != "Ü-Tabelle":
            continue
        process_u_tabelle_template(wxr, word_entry, child_template)

19 

20 

def process_u_tabelle_template(
    wxr: WiktextractContext, word_entry: WordEntry, template_node: TemplateNode
) -> None:
    """Extract the translations held by one "Ü-Tabelle" template.

    Argument ``1`` is cleaned into the sense index and argument ``G`` into
    the sense text.  The "Ü-Liste" and "Dialekttabelle" arguments, when
    present, are re-parsed and each list item found in them is passed to
    ``process_u_tabelle_list_item`` together with that sense information.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Ü-Tabelle
    params = template_node.template_parameters
    sense_idx = clean_node(wxr, None, params.get(1, ""))
    sense = clean_node(wxr, None, params.get("G", ""))

    for arg_name in ("Ü-Liste", "Dialekttabelle"):
        arg_value = params.get(arg_name)
        if arg_value is not None:
            # Turn the argument back into wikitext and parse it so that
            # its list items are available as nodes.
            list_wikitext = wxr.wtp.node_to_wikitext(arg_value)
            parsed_root = wxr.wtp.parse(list_wikitext)
            for item in parsed_root.find_child_recursively(NodeKind.LIST_ITEM):
                process_u_tabelle_list_item(
                    wxr, word_entry, item, sense, sense_idx
                )

40 

41 

def process_u_tabelle_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    sense: str,
    sense_idx: str,
) -> None:
    """Extract the translations contained in one translation list item.

    The children of ``list_item_node`` are scanned in order.  Content seen
    before the first text node containing a colon supplies the language;
    afterwards templates whose name starts with "Ü" supply translated
    words, other templates and italic nodes supply raw tags, and a bare
    "," or ";" text node closes the translation collected so far.  Every
    finished translation is appended to ``word_entry.translations`` via
    ``append_tr_data``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``translations`` list receives the results.
        list_item_node: The list item node to scan.
        sense: Sense text stored on every translation of this item.
        sense_idx: Sense index stored on every translation of this item.
    """
    before_colon = True
    tr_data = Translation(sense=sense, sense_index=sense_idx)
    for node in list_item_node.children:
        if isinstance(node, str):
            node = node.strip()
            if len(node) == 0:
                continue
            elif ":" in node:
                # Text up to the colon is the language name, unless a
                # language was already taken from an earlier node.
                lang_str = node[: node.index(":")].strip()
                if len(lang_str) > 0 and len(tr_data.lang) == 0:
                    tr_data.lang = lang_str
                    if len(tr_data.lang_code) == 0:
                        tr_data.lang_code = name_to_code(lang_str, "de")
                before_colon = False
            elif node in [",", ";"] and len(tr_data.word) > 0:
                tr_data = append_tr_data(word_entry, tr_data)
            elif not before_colon and len(tr_data.word) > 0:
                # Plain text between {{Ü}} templates of the same translation
                # e.g. {{Ü|fr|temps}} de {{Ü|fr|travail}} → "temps de travail"
                tr_data.word += " " + node

        if before_colon and len(tr_data.lang) == 0:
            tr_data.lang = clean_node(wxr, None, node)
            if isinstance(node, TemplateNode):
                # The language is given by a template; its lower-cased name
                # is used as the language code.
                tr_data.lang_code = node.template_name.lower()
            else:
                # Look the code up from the language *name* just stored in
                # ``tr_data.lang`` (previously the code itself was passed
                # to the name→code lookup).
                tr_data.lang_code = name_to_code(tr_data.lang, "de")
        elif isinstance(node, TemplateNode):
            if node.template_name.startswith("Ü"):
                u_lang_code = clean_node(
                    wxr, None, node.template_parameters.get(1, "")
                )
                if len(tr_data.word) > 0 and u_lang_code != tr_data.lang_code:
                    # Different language → save current, start new
                    # NOTE(review): append_tr_data copies lang/lang_code to
                    # the new object and process_u_template only fills
                    # lang_code when it is empty, so the new entry may keep
                    # the previous language code — confirm this is intended.
                    tr_data = append_tr_data(word_entry, tr_data)
                    process_u_template(wxr, tr_data, node)
                elif len(tr_data.word) > 0:
                    # Same language → append word to form multi-word
                    # translation
                    # e.g. {{Ü|fr|temps}} de {{Ü|fr|travail}} → "temps de travail"
                    new_word = clean_node(
                        wxr, None, node.template_parameters.get(2, "")
                    )
                    tr_data.word += " " + new_word
                else:
                    process_u_template(wxr, tr_data, node)
            else:
                # Any other template is kept as a raw tag.
                raw_tag = clean_node(wxr, None, node).strip(": \n")
                if raw_tag != "":
                    tr_data.raw_tags.append(raw_tag)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            # Italic text may hold several ", "-separated raw tags.
            raw_tag_str = clean_node(wxr, None, node).removesuffix(":")
            for raw_tag in filter(None, raw_tag_str.split(", ")):
                tr_data.raw_tags.append(raw_tag)

    # Flush the translation still being collected when the item ends.
    if len(tr_data.word) > 0:
        append_tr_data(word_entry, tr_data)

105 

106 

def append_tr_data(word_entry: WordEntry, tr_data: Translation) -> Translation:
    """Store a finished translation and return a fresh one to continue with.

    ``translate_raw_tags`` is applied to ``tr_data`` first, then a deep copy
    of it is appended to ``word_entry.translations``.  The returned
    ``Translation`` carries over only the sense, sense index, language and
    language code of ``tr_data``.
    """
    translate_raw_tags(tr_data)
    finished = tr_data.model_copy(deep=True)
    word_entry.translations.append(finished)

    carried_over = {
        "sense": tr_data.sense,
        "sense_index": tr_data.sense_index,
        "lang": tr_data.lang,
        "lang_code": tr_data.lang_code,
    }
    return Translation(**carried_over)

116 

117 

def process_u_template(
    wxr: WiktextractContext, tr_data: Translation, u_template: TemplateNode
) -> None:
    """Fill ``tr_data`` from one "Ü"-family translation template.

    Argument ``1`` provides the language code when ``tr_data`` has none,
    and the language name is then derived from the code when it is still
    empty.  Argument ``2`` provides the word; a non-empty display argument
    (which one depends on the template name) replaces it.  A trailing "?"
    on the template name sets ``tr_data.uncertain``.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        tr_data: Translation object updated in place.
        u_template: The template node to read.
    """
    # https://de.wiktionary.org/wiki/Vorlage:Ü
    # also "Ü?", "Üt", "Üt?", "Üxx4", "Üxx4?"
    if len(tr_data.lang_code) == 0:
        tr_data.lang_code = clean_node(
            wxr, None, u_template.template_parameters.get(1, "")
        )
    if len(tr_data.lang) == 0:
        # Look the name up from the language code (previously the whole
        # Translation object was passed to code_to_name).
        tr_data.lang = code_to_name(tr_data.lang_code, "de")

    tr_data.word = clean_node(
        wxr, None, u_template.template_parameters.get(2, "")
    )
    template_name = u_template.template_name
    tr_data.uncertain = template_name.endswith("?")
    template_name = template_name.removesuffix("?")
    # Key of the argument holding the displayed form; -1 matches no
    # argument for template names not handled below.
    display_arg = -1
    if template_name == "Ü":
        display_arg = 3
    elif template_name == "Üt":
        display_arg = 4
        if 3 in u_template.template_parameters:
            arg_value = clean_node(
                wxr, None, u_template.template_parameters.get(3, "")
            )
            if tr_data.lang_code in ["ja", "ko"] and "," in arg_value:
                # For these two codes a comma splits the argument into
                # the ``other`` form and the romanization.
                tr_data.other, tr_data.roman = tuple(
                    map(str.strip, arg_value.split(",", maxsplit=1))
                )
            else:
                tr_data.roman = arg_value
        else:
            # this template could create roman without the third arg
            expanded_text = clean_node(wxr, None, u_template)
            m = re.search(r"\(([^)]+?)\^\☆\)", expanded_text)
            if m is not None:
                tr_data.roman = m.group(1)
    elif template_name == "Üxx4":
        display_arg = "v"
        if 3 in u_template.template_parameters:
            display_arg = 3
        tr_data.roman = clean_node(
            wxr, None, u_template.template_parameters.get("d", "")
        )

    # A non-empty display argument overrides the word taken from argument 2.
    tr_word = clean_node(
        wxr, None, u_template.template_parameters.get(display_arg, "")
    )
    if len(tr_word) > 0:
        tr_data.word = tr_word