Coverage for src/wiktextract/extractor/de/gloss.py: 89%

126 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2 

3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import AltForm, Sense, WordEntry 

8from .tags import GRAMMATICAL_TAGS, translate_raw_tags 

9from .utils import extract_sense_index 

10 

11 

def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the senses of one gloss section into ``word_entry``.

    Every list that is a direct child of ``level_node`` is handed to
    ``process_gloss_list_item``; the sense returned by one call is passed
    on as the parent sense of the next call.  When the section contains no
    list node at all, its cleaned text is used as a single gloss instead.
    """
    title = clean_node(wxr, None, level_node.largs)
    current_sense = Sense()
    for gloss_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, title
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # List-free section: fall back to the section text as the only gloss.
    section_text = clean_node(wxr, current_sense, level_node.children)
    if section_text:
        current_sense.glosses.append(section_text)
        word_entry.senses.append(current_sense)

29 

30 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Convert the items of one gloss list into senses of ``word_entry``.

    Each list item is dispatched as follows:

    * the entry is already tagged "form-of", or the section title is
      "Grammatische Merkmale": delegated to ``process_form_of_list_item``;
    * the list marker (``sarg``) ends with "*": every direct child template
      starts a fresh parent sense holding the template text as a raw tag;
      an item without any template becomes a sense of its own;
    * the list marker ends with ":": a regular gloss, built on a deep copy
      of the current parent sense; nested lists are processed recursively
      with the new sense as their parent;
    * anything else is reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context, used for node cleaning and debug messages.
        word_entry: Entry that receives the extracted senses.
        list_node: List node whose direct list items are processed.
        parent_sense: Sense whose data is inherited by ":" items.
        section_title: Cleaned title of the enclosing section.

    Returns:
        The parent sense in effect after the last item, so the caller can
        hand it to the next list of the same section.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if (
            "form-of" in word_entry.tags
            or section_title == "Grammatische Merkmale"
        ):
            process_form_of_list_item(wxr, word_entry, list_item_node)
        elif item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                # Each modifier template replaces the parent sense, so the
                # following ":" items inherit only the latest raw tag.
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            if not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if len(gloss_text) > 0:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            sense_data = parent_sense.model_copy(deep=True)
            gloss_nodes = []
            for gloss_node in list_item_node.children:
                if isinstance(gloss_node, TemplateNode):
                    if gloss_node.template_name == "K":
                        extract_k_template(wxr, sense_data, gloss_node)
                    elif gloss_node.template_name.endswith("."):
                        # Template names ending in "." are stored as raw tags.
                        raw_tag = clean_node(
                            wxr, sense_data, gloss_node
                        ).removesuffix(":")
                        sense_data.raw_tags.append(raw_tag)
                    elif gloss_node.template_name in (
                        "QS Herkunft",
                        "QS Bedeutungen",
                    ):
                        # These templates are left out of the gloss text.
                        continue
                    else:
                        gloss_nodes.append(gloss_node)
                elif (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.ITALIC
                ):
                    italic_text = clean_node(wxr, None, gloss_node)
                    if italic_text.endswith(":") or (
                        italic_text.startswith("(")
                        and italic_text.endswith(")")
                    ):
                        # Italic text ending in ":" or wrapped in parentheses
                        # is a list of raw tags, not gloss text.
                        italic_text = italic_text.strip(": ")
                        if italic_text.startswith("(") and italic_text.endswith(
                            ")"
                        ):
                            italic_text = italic_text.strip("() ")
                        for raw_tag in re.split(r":|,", italic_text):
                            raw_tag = raw_tag.strip()
                            if len(raw_tag) > 0:
                                sense_data.raw_tags.append(raw_tag)
                    else:
                        gloss_nodes.append(italic_text)
                elif not (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.LIST
                ):
                    # Nested lists are handled by the recursion below.
                    gloss_nodes.append(gloss_node)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            gloss_text = gloss_text.replace("()", "").strip(":, \n")
            if sense_idx != "":
                # An index that does not start with a digit is prefixed with
                # the parent's index.  Compare the string itself: the former
                # `len(...) != ""` compared an int to a str and was always
                # true.
                if (
                    not sense_idx[0].isnumeric()
                    and parent_sense is not None
                    and parent_sense.sense_index != ""
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif len(gloss_text) > 0:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if len(gloss_text) > 0:
                sense_data.glosses.append(gloss_text)
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )

        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
    return parent_sense

144 

145 

# Plain-text part-of-speech words used in form-of glosses, usually in the
# genitive case.  Each word maps to the data applied to the word entry when
# the word is found in a form-of list item: "pos" replaces an "unknown"
# part of speech, and the optional "tags" list is added to the entry tags.
FORM_OF_POS_STRINGS = {
    "Adjektivs": {"pos": "adj"},
    "Verbs": {"pos": "verb"},
    "Suffixes": {"pos": "suffix", "tags": ["morpheme"]},
    "Substantivs": {"pos": "noun"},
}

153 

154 

def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Record a list item that describes an inflected form as a sense.

    The referenced word is taken from the first "Ü" template (its second
    parameter) or bold node among the item's direct children that yields
    non-empty text.  Plain-text children are scanned word by word for
    part-of-speech names and grammatical tags.  An item whose cleaned text
    is empty adds nothing to ``word_entry``.
    """
    from .section_titles import POS_SECTIONS

    form_sense = Sense()
    item_text = clean_node(wxr, None, list_item_node.children)

    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            lemma = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
        elif child.kind == NodeKind.BOLD:
            lemma = clean_node(wxr, None, child)
        else:
            continue
        if lemma != "":
            form_sense.form_of.append(AltForm(word=lemma))
            break

    if item_text == "":
        return

    form_sense.glosses.append(item_text)
    for child in list_item_node.children:
        if not isinstance(child, str) or child.strip() == "":
            continue
        pos_info = {}
        for word in child.split():
            if word in FORM_OF_POS_STRINGS:
                pos_info = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                pos_info = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                tag_value = GRAMMATICAL_TAGS[word]
                if isinstance(tag_value, str):
                    form_sense.tags.append(tag_value)
                elif isinstance(tag_value, list):
                    form_sense.tags.extend(tag_value)
        # Only fill in the part of speech when the entry has none yet.
        if pos_info and word_entry.pos == "unknown":
            word_entry.pos = pos_info["pos"]
            word_entry.tags.extend(pos_info.get("tags", []))

    # Mark the entry first, then the sense, as a form-of record.
    for tag_holder in (word_entry, form_sense):
        if "form-of" not in tag_holder.tags:
            tag_holder.tags.append("form-of")
    word_entry.senses.append(form_sense)

199 

200 

def extract_k_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the arguments of a "K" template to ``sense`` as raw tags.

    Positional arguments and the "ft" argument are cleaned and appended to
    ``sense.raw_tags``, except for empty values and the word "von".
    """
    # https://de.wiktionary.org/wiki/Vorlage:K
    for name, value in t_node.template_parameters.items():
        if not isinstance(name, int) and name != "ft":
            continue
        label = clean_node(wxr, None, value)
        if label in ("von", ""):
            continue
        sense.raw_tags.append(label)
    # Clean the whole template with the sense as target; the text result is
    # discarded (presumably this is done for what clean_node records on the
    # sense, e.g. categories — confirm against clean_node).
    clean_node(wxr, sense, t_node)