Coverage for src/wiktextract/extractor/de/gloss.py: 87%

1from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode

3from ...page import clean_node

4from ...wxr_context import WiktextractContext

5from .models import AltForm, Sense, WordEntry

6from .tags import GRAMMATICAL_TAGS, translate_raw_tags

7from .utils import extract_sense_index

def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the senses found under ``level_node`` into ``word_entry``.

    Every direct LIST child is handed to ``process_gloss_list_item``; the
    sense returned by one call is passed on as the parent sense of the next
    call. When ``level_node`` contains no LIST node at all, the cleaned text
    of its children is stored as one gloss instead, provided it is not empty.
    """
    current_sense = Sense()
    title = clean_node(wxr, None, level_node.largs)
    for gloss_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, title
        )

    # Sections with list markup were fully handled by the loop above.
    if level_node.contain_node(NodeKind.LIST):
        return

    plain_gloss = clean_node(wxr, current_sense, level_node.children)
    if plain_gloss != "":
        current_sense.glosses.append(plain_gloss)
        word_entry.senses.append(current_sense)

def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Extract senses from every LIST_ITEM child of ``list_node``.

    The handling depends on the list marker (``sarg``) of each item:

    * marker ending in ``*``: every template child replaces
      ``parent_sense`` with a fresh ``Sense`` whose only raw tag is the
      cleaned template text (trailing ``:`` removed). If the entry is
      already tagged ``form-of`` or the section title is
      "Grammatische Merkmale", the item is passed to
      ``process_form_of_list_item``; otherwise, when the item held no
      template, its text is appended as a new sense.
    * marker ending in ``:``: a deep copy of ``parent_sense`` is filled with
      raw tags, sense index and gloss text from the item's children, appended
      to ``word_entry.senses`` when the gloss is not empty, and used as the
      parent sense for nested lists.
    * anything else: a debug message is logged.

    Returns the (possibly replaced) ``parent_sense``.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            # or form-of word
            if (
                "form-of" in word_entry.tags
                or section_title == "Grammatische Merkmale"
            ):
                process_form_of_list_item(wxr, word_entry, list_item_node)
            elif not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if len(gloss_text) > 0:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            sense_data = parent_sense.model_copy(deep=True)
            # Nodes (or already cleaned strings) that make up the gloss text.
            gloss_nodes = []
            for gloss_node in list_item_node.children:
                if isinstance(gloss_node, TemplateNode):
                    if gloss_node.template_name == "K":
                        for (
                            k_arg,
                            k_arg_value,
                        ) in gloss_node.template_parameters.items():
                            if k_arg == "ft":
                                # "ft" text is kept in the gloss, followed
                                # by a colon.
                                gloss_nodes.append(
                                    clean_node(wxr, None, k_arg_value)
                                )
                                gloss_nodes.append(":")
                            elif isinstance(k_arg, int):
                                # Positional arguments become raw tags,
                                # except the literal word "von".
                                raw_tag = clean_node(wxr, None, k_arg_value)
                                if raw_tag != "von":
                                    sense_data.raw_tags.append(raw_tag)
                        # Result is discarded; the call is made only for
                        # what clean_node records on sense_data
                        # (presumably categories -- TODO confirm).
                        clean_node(wxr, sense_data, gloss_node)
                    elif gloss_node.template_name.endswith("."):
                        # Templates whose name ends with "." are treated as
                        # tag templates.
                        raw_tag = clean_node(
                            wxr, sense_data, gloss_node
                        ).removesuffix(":")
                        sense_data.raw_tags.append(raw_tag)
                    elif gloss_node.template_name in (
                        "QS Herkunft",
                        "QS Bedeutungen",
                    ):
                        # These templates contribute nothing to the gloss.
                        continue
                    else:
                        gloss_nodes.append(gloss_node)
                elif (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.ITALIC
                ):
                    italic_text = clean_node(wxr, None, gloss_node)
                    if italic_text.endswith(":"):
                        # Italic text ending in ":" is a ", "-separated
                        # list of raw tags.
                        for raw_tag in italic_text.removesuffix(":").split(
                            ", "
                        ):
                            raw_tag = raw_tag.strip()
                            if len(raw_tag) > 0:
                                sense_data.raw_tags.append(raw_tag)
                    else:
                        gloss_nodes.append(italic_text)
                elif not (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.LIST
                ):
                    # Nested lists are processed separately below.
                    gloss_nodes.append(gloss_node)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            if sense_idx != "":
                # A non-numeric index is prefixed with the parent's index,
                # but only when the parent actually has one. The length is
                # compared with 0; comparing it with "" is always unequal.
                if (
                    not sense_idx[0].isnumeric()
                    and len(parent_sense.sense_index) > 0
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif len(gloss_text.strip()) > 0:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if len(gloss_text) > 0:
                sense_data.glosses.append(gloss_text.removeprefix(", "))
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            # Nested lists use this item's sense as their parent sense; the
            # sense returned by the recursive call is not used.
            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )

        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
            continue
    return parent_sense

149

150

# Plain-text part-of-speech words that appear in form-of glosses (usually in
# the genitive case), mapped to the POS data applied to the word entry.
FORM_OF_POS_STRINGS = {
    "Adjektivs": {
        "pos": "adj",
    },
    "Verbs": {
        "pos": "verb",
    },
    "Suffixes": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "Substantivs": {
        "pos": "noun",
    },
}

158

159

def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Turn a form-of list item into a sense on ``word_entry``.

    The first BOLD or "Ü" template child that yields non-empty text is stored
    as the sense's ``form_of`` word. When the cleaned item text is not empty
    it becomes the gloss, the item's plain string children are scanned word
    by word for POS and grammatical-tag strings, the entry is tagged
    ``form-of`` and the sense is appended. An item with empty text adds
    nothing.
    """
    # Imported inside the function, as in the original (presumably to avoid
    # a circular import -- TODO confirm).
    from .section_titles import POS_SECTIONS

    sense = Sense()
    gloss_text = clean_node(wxr, None, list_item_node.children)

    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target_word = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
        elif child.kind == NodeKind.BOLD:
            target_word = clean_node(wxr, None, child)
        else:
            continue
        # Only the first non-empty candidate is recorded.
        if target_word != "":
            sense.form_of.append(AltForm(word=target_word))
            break

    if gloss_text == "":
        return

    sense.glosses.append(gloss_text)
    for text_node in list_item_node.children:
        if not isinstance(text_node, str) or len(text_node.strip()) == 0:
            continue
        pos_data = {}
        for token in text_node.split():
            if token in FORM_OF_POS_STRINGS:
                pos_data = FORM_OF_POS_STRINGS[token]
            elif token in POS_SECTIONS:
                pos_data = POS_SECTIONS[token]
            elif token in GRAMMATICAL_TAGS:
                mapped_tag = GRAMMATICAL_TAGS[token]
                if isinstance(mapped_tag, str):
                    sense.tags.append(mapped_tag)
                elif isinstance(mapped_tag, list):
                    sense.tags.extend(mapped_tag)
        # The POS found in the gloss only fills in a still-unknown POS.
        if len(pos_data) > 0 and word_entry.pos == "unknown":
            word_entry.pos = pos_data["pos"]
            word_entry.tags.extend(pos_data.get("tags", []))

    if "form-of" not in word_entry.tags:
        word_entry.tags.append("form-of")
    word_entry.senses.append(sense)