Coverage for src/wiktextract/extractor/de/gloss.py: 87%

120 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .models import AltForm, Sense, WordEntry 

6from .tags import GRAMMATICAL_TAGS, translate_raw_tags 

7from .utils import extract_sense_index 

8 

9 

def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the senses of one gloss section into ``word_entry.senses``.

    Every list found directly under ``level_node`` is passed to
    ``process_gloss_list_item``; the sense that call returns is fed into the
    call for the next list.  When ``level_node.contain_node`` reports no list
    at all, the cleaned text of the whole section is stored as a single
    gloss, provided it is not empty.

    Args:
        wxr: Extraction context handed through to ``clean_node``.
        word_entry: Entry whose ``senses`` list receives the results.
        level_node: Section node; its ``largs`` supply the section title.
    """
    current_sense = Sense()
    title = clean_node(wxr, None, level_node.largs)
    for gloss_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, title
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # List-free section: the section body itself is the gloss.
    section_text = clean_node(wxr, current_sense, level_node.children)
    if section_text:
        current_sense.glosses.append(section_text)
        word_entry.senses.append(current_sense)

27 

28 

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Convert the items of one gloss list into senses on ``word_entry``.

    Items are dispatched on their list marker (``sarg``):

    * marker ending in ``*`` -- a modifier line.  Each template child
      replaces ``parent_sense`` with a fresh ``Sense`` whose only raw tag is
      the template's cleaned text (trailing ``:`` removed).  If the entry is
      already tagged ``form-of`` or the section is "Grammatische Merkmale",
      the item is handed to ``process_form_of_list_item``; otherwise, when
      no template was present, the item's text becomes a sense of its own.
    * marker ending in ``:`` -- a gloss line.  A deep copy of
      ``parent_sense`` collects raw tags from ``K`` templates, templates
      whose name ends in ``.`` and italic text ending in ``:``; the
      remaining children form the gloss text.  Nested lists are processed
      recursively with the new sense as their parent.
    * anything else is reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context used for cleaning nodes and debug output.
        word_entry: Entry whose ``senses`` list receives the results.
        list_node: The list node whose items are processed.
        parent_sense: Sense inherited by the gloss lines of this list.
        section_title: Cleaned title of the enclosing section.

    Returns:
        The parent sense in effect after the last item; modifier lines may
        have replaced the one passed in.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                # Every template starts a new parent sense, so with several
                # templates only the last one's tag is kept.
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            # or form-of word
            if (
                "form-of" in word_entry.tags
                or section_title == "Grammatische Merkmale"
            ):
                process_form_of_list_item(wxr, word_entry, list_item_node)
            elif not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if len(gloss_text) > 0:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            sense_data = parent_sense.model_copy(deep=True)
            gloss_nodes = []
            for gloss_node in list_item_node.children:
                if isinstance(gloss_node, TemplateNode):
                    if gloss_node.template_name == "K":
                        for (
                            k_arg,
                            k_arg_value,
                        ) in gloss_node.template_parameters.items():
                            if k_arg == "ft":
                                # "ft" text stays in the gloss, followed by
                                # a colon.
                                gloss_nodes.append(
                                    clean_node(wxr, None, k_arg_value)
                                )
                                gloss_nodes.append(":")
                            elif isinstance(k_arg, int):
                                # Positional arguments are tags, except for
                                # the literal "von".
                                raw_tag = clean_node(wxr, None, k_arg_value)
                                if raw_tag != "von":
                                    sense_data.raw_tags.append(raw_tag)
                        # Result is discarded; the call is made with
                        # sense_data as target for whatever clean_node
                        # records on it.
                        clean_node(wxr, sense_data, gloss_node)
                    elif gloss_node.template_name.endswith("."):
                        raw_tag = clean_node(
                            wxr, sense_data, gloss_node
                        ).removesuffix(":")
                        sense_data.raw_tags.append(raw_tag)
                    elif gloss_node.template_name in (
                        "QS Herkunft",
                        "QS Bedeutungen",
                    ):
                        # These templates contribute nothing to the gloss.
                        continue
                    else:
                        gloss_nodes.append(gloss_node)
                elif (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.ITALIC
                ):
                    italic_text = clean_node(wxr, None, gloss_node)
                    if italic_text.endswith(":"):
                        # Italic text ending in ":" is a comma-separated
                        # list of tags, not gloss text.
                        for raw_tag in italic_text.removesuffix(":").split(
                            ", "
                        ):
                            raw_tag = raw_tag.strip()
                            if len(raw_tag) > 0:
                                sense_data.raw_tags.append(raw_tag)
                    else:
                        gloss_nodes.append(italic_text)
                elif not (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.LIST
                ):
                    # Nested lists are handled by the recursion below.
                    gloss_nodes.append(gloss_node)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            if sense_idx != "":
                # An index that does not start with a digit is taken as
                # relative to the parent and gets the parent's index
                # prepended.  The index string itself is compared with "";
                # comparing its length with "" would always be true.
                if (
                    not sense_idx[0].isnumeric()
                    and parent_sense is not None
                    and parent_sense.sense_index != ""
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif len(gloss_text.strip()) > 0:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if len(gloss_text) > 0:
                sense_data.glosses.append(gloss_text.removeprefix(", "))
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )

        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
    return parent_sense

149 

150 

# Plain-text part-of-speech words (usually in the genitive case) that occur
# in form-of glosses, mapped to the POS data applied to the word entry.
FORM_OF_POS_STRINGS = dict(
    Adjektivs={"pos": "adj"},
    Verbs={"pos": "verb"},
    Suffixes={"pos": "suffix", "tags": ["morpheme"]},
    Substantivs={"pos": "noun"},
)

158 

159 

def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Record a form-of sense built from one list item.

    The form-of target is taken from the first bold or template child that
    yields non-empty text: for an ``Ü`` template its parameter ``2``, for a
    bold node its cleaned content.  If the item's cleaned text is empty
    nothing is recorded.  Otherwise the text becomes the gloss, the plain
    string children are scanned word by word for POS words and grammatical
    tags, the entry is tagged ``form-of`` and the sense is appended to
    ``word_entry.senses``.

    Args:
        wxr: Extraction context handed through to ``clean_node``.
        word_entry: Entry that receives the sense and, while its ``pos`` is
            still "unknown", the POS data found in the gloss.
        list_item_node: The list item describing the inflected form.
    """
    from .section_titles import POS_SECTIONS

    sense = Sense()
    gloss_text = clean_node(wxr, None, list_item_node.children)

    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target = clean_node(wxr, None, child.template_parameters.get(2, ""))
        elif child.kind == NodeKind.BOLD:
            target = clean_node(wxr, None, child)
        else:
            continue
        if target != "":
            sense.form_of.append(AltForm(word=target))
            break

    if gloss_text == "":
        return

    sense.glosses.append(gloss_text)
    for text_node in list_item_node.children:
        if not isinstance(text_node, str) or not text_node.strip():
            continue
        pos_data = {}
        for word in text_node.split():
            if word in FORM_OF_POS_STRINGS:
                pos_data = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                pos_data = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                translated = GRAMMATICAL_TAGS[word]
                if isinstance(translated, str):
                    sense.tags.append(translated)
                elif isinstance(translated, list):
                    sense.tags.extend(translated)
        # Only fill in the POS when the entry does not have one yet.
        if pos_data and word_entry.pos == "unknown":
            word_entry.pos = pos_data["pos"]
            word_entry.tags.extend(pos_data.get("tags", []))

    if "form-of" not in word_entry.tags:
        word_entry.tags.append("form-of")
    word_entry.senses.append(sense)