Coverage for src/wiktextract/extractor/de/gloss.py: 89%

123 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2 

3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .models import AltForm, Sense, WordEntry 

8from .tags import GRAMMATICAL_TAGS, translate_raw_tags 

9from .utils import extract_sense_index 

10 

11 

def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the senses of one gloss section into ``word_entry.senses``.

    Every list found directly under ``level_node`` is handed to
    ``process_gloss_list_item``; the sense it returns is threaded into the
    call for the next list.  If the section contains no list node at all,
    the cleaned text of the whole section is stored as a single gloss.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry whose ``senses`` list is extended in place.
        level_node: Section node holding the glosses.
    """
    section_title = clean_node(wxr, None, level_node.largs)
    current_sense = Sense()
    for gloss_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, section_title
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # No list anywhere in the section: fall back to the plain section text.
    section_text = clean_node(wxr, current_sense, level_node.children)
    if section_text:
        current_sense.glosses.append(section_text)
        word_entry.senses.append(current_sense)

29 

30 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Turn every item of a gloss list into senses on ``word_entry``.

    Each list item is dispatched as follows:

    * If the entry is already tagged ``"form-of"`` or the section title is
      "Grammatische Merkmale", the item is handled by
      ``process_form_of_list_item``.
    * If the list marker (``sarg``) ends with ``*``, the item is a modifier
      line; its templates replace ``parent_sense`` for the following items.
    * If the list marker ends with ``:``, the item is a gloss; a deep copy
      of ``parent_sense`` is filled in and nested lists are processed
      recursively with that copy as their parent.
    * Anything else is reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context.
        word_entry: Entry whose ``senses`` list is extended in place.
        list_node: The list node whose items are processed.
        parent_sense: Sense that gloss items inherit from.
        section_title: Cleaned title of the enclosing section.

    Returns:
        The parent sense in effect after the last item, which may be a new
        object if a modifier line was encountered.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if (
            "form-of" in word_entry.tags
            or section_title == "Grammatische Merkmale"
        ):
            process_form_of_list_item(wxr, word_entry, list_item_node)
        elif item_type.endswith("*"):
            parent_sense = _process_modifier_list_item(
                wxr, word_entry, list_item_node, parent_sense
            )
        elif item_type.endswith(":"):
            _process_sense_list_item(
                wxr, word_entry, list_item_node, parent_sense, section_title
            )
        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
    return parent_sense


def _process_modifier_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_sense: Sense,
) -> Sense:
    """Handle a ``*`` list item and return the parent sense to use next."""
    # only contains modifier template
    has_tag_template = False
    for template in list_item_node.find_child(NodeKind.TEMPLATE):
        # The template is cleaned against the parent sense that is current
        # at this point; only afterwards is it replaced by a fresh one.
        raw_tag = clean_node(wxr, parent_sense, template).removesuffix(":")
        parent_sense = Sense()
        parent_sense.raw_tags.append(raw_tag)
        has_tag_template = True
    if not has_tag_template:
        # No template in the item: treat its text as a standalone gloss.
        new_sense = Sense()
        gloss_text = clean_node(wxr, new_sense, list_item_node.children)
        if len(gloss_text) > 0:
            new_sense.glosses.append(gloss_text)
            word_entry.senses.append(new_sense)
    return parent_sense


def _collect_gloss_nodes(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    sense_data: Sense,
) -> list:
    """Split a ``:`` list item into gloss nodes and raw tags.

    Raw tags are appended to ``sense_data.raw_tags``; the returned list
    holds the nodes and strings that make up the gloss text.
    """
    gloss_nodes = []
    for gloss_node in list_item_node.children:
        if isinstance(gloss_node, TemplateNode):
            if gloss_node.template_name == "K":
                for (
                    k_arg,
                    k_arg_value,
                ) in gloss_node.template_parameters.items():
                    if k_arg == "ft":
                        # The "ft" argument text stays in the gloss,
                        # followed by a colon.
                        gloss_nodes.append(clean_node(wxr, None, k_arg_value))
                        gloss_nodes.append(":")
                    elif isinstance(k_arg, int):
                        # Positional arguments are tags, except "von".
                        raw_tag = clean_node(wxr, None, k_arg_value)
                        if raw_tag != "von":
                            sense_data.raw_tags.append(raw_tag)
                # Result discarded; presumably run so clean_node can record
                # template side data on the sense - TODO confirm.
                clean_node(wxr, sense_data, gloss_node)
            elif gloss_node.template_name.endswith("."):
                # Template names ending in "." are stored as a raw tag.
                raw_tag = clean_node(wxr, sense_data, gloss_node).removesuffix(
                    ":"
                )
                sense_data.raw_tags.append(raw_tag)
            elif gloss_node.template_name in (
                "QS Herkunft",
                "QS Bedeutungen",
            ):
                # These templates contribute nothing to the gloss.
                continue
            else:
                gloss_nodes.append(gloss_node)
        elif (
            isinstance(gloss_node, WikiNode)
            and gloss_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, gloss_node)
            if italic_text.endswith(":") or (
                italic_text.startswith("(") and italic_text.endswith(")")
            ):
                # Italic text ending in ":" or wrapped in parentheses is a
                # colon/comma separated list of raw tags.
                if not italic_text.endswith(":"):
                    italic_text = italic_text.strip("() ")
                for raw_tag in re.split(r":|,", italic_text.strip(": ")):
                    raw_tag = raw_tag.strip()
                    if len(raw_tag) > 0:
                        sense_data.raw_tags.append(raw_tag)
            else:
                gloss_nodes.append(italic_text)
        elif not (
            isinstance(gloss_node, WikiNode)
            and gloss_node.kind == NodeKind.LIST
        ):
            # Nested lists are handled separately by the caller.
            gloss_nodes.append(gloss_node)
    return gloss_nodes


def _process_sense_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> None:
    """Handle a ``:`` list item: build its sense, then recurse into sublists."""
    sense_data = parent_sense.model_copy(deep=True)
    gloss_nodes = _collect_gloss_nodes(wxr, list_item_node, sense_data)

    gloss_text = clean_node(wxr, sense_data, gloss_nodes)
    sense_idx, gloss_text = extract_sense_index(gloss_text)
    if sense_idx != "":
        # An index that does not start with a digit is prefixed with the
        # parent's index when the parent has one.
        if not sense_idx[0].isnumeric() and parent_sense.sense_index != "":
            sense_idx = parent_sense.sense_index + sense_idx
        sense_data.sense_index = sense_idx
    elif len(gloss_text) > 0:
        wxr.wtp.debug(
            "Failed to extract sense number from gloss node",
            sortid="extractor/de/glosses/extract_glosses/28",
        )

    if len(gloss_text) > 0:
        sense_data.glosses.append(gloss_text)
        translate_raw_tags(sense_data)
        word_entry.senses.append(sense_data)

    for sub_list_node in list_item_node.find_child(NodeKind.LIST):
        process_gloss_list_item(
            wxr,
            word_entry,
            sub_list_node,
            sense_data,
            section_title,
        )

155 

156 

# Plain-text part-of-speech words that appear in form-of glosses, usually in
# the genitive case.  Each maps to the "pos" value to assign and, optionally,
# extra "tags" for the word entry.
FORM_OF_POS_STRINGS: dict[str, dict] = {
    "Adjektivs": {
        "pos": "adj",
    },
    "Verbs": {
        "pos": "verb",
    },
    "Suffixes": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "Substantivs": {
        "pos": "noun",
    },
}

164 

165 

def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Record one list item as a form-of sense on ``word_entry``.

    The referenced word is taken from the first usable "Ü" template
    (parameter 2) or bold node in the item.  If the item has any text, it
    becomes the gloss, its plain-string children are scanned word by word
    for part-of-speech names and grammatical tags, the entry is tagged
    ``"form-of"`` and the sense is appended.  An item with no text adds
    nothing.

    Args:
        wxr: Extraction context passed through to ``clean_node``.
        word_entry: Entry updated in place (``pos``, ``tags``, ``senses``).
        list_item_node: The list item to process.
    """
    from .section_titles import POS_SECTIONS

    form_sense = Sense()
    item_text = clean_node(wxr, None, list_item_node.children)

    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target_word = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
        elif child.kind == NodeKind.BOLD:
            target_word = clean_node(wxr, None, child)
        else:
            continue
        # Stop at the first candidate that yields a non-empty word.
        if target_word != "":
            form_sense.form_of.append(AltForm(word=target_word))
            break

    if item_text == "":
        return

    form_sense.glosses.append(item_text)
    for child in list_item_node.children:
        if not isinstance(child, str) or len(child.strip()) == 0:
            continue
        pos_info = {}
        for word in child.split():
            if word in FORM_OF_POS_STRINGS:
                pos_info = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                pos_info = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                translated = GRAMMATICAL_TAGS[word]
                if isinstance(translated, str):
                    form_sense.tags.append(translated)
                elif isinstance(translated, list):
                    form_sense.tags.extend(translated)
        # Only fill in the part of speech when the entry has none yet.
        if len(pos_info) > 0 and word_entry.pos == "unknown":
            word_entry.pos = pos_info["pos"]
            word_entry.tags.extend(pos_info.get("tags", []))

    if "form-of" not in word_entry.tags:
        word_entry.tags.append("form-of")
    word_entry.senses.append(form_sense)