Coverage for src/wiktextract/extractor/de/gloss.py: 87%
120 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .models import AltForm, Sense, WordEntry
6from .tags import GRAMMATICAL_TAGS, translate_raw_tags
7from .utils import extract_sense_index
def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the senses found under a gloss section.

    Every direct child list of ``level_node`` is handed to
    ``process_gloss_list_item``; the sense it returns is passed on as the
    parent sense for the next list.  If the section holds no list node at
    all, the cleaned text of the whole section is stored as one gloss.

    Results are appended to ``word_entry.senses``; nothing is returned.
    """
    current_sense = Sense()
    title = clean_node(wxr, None, level_node.largs)
    for gloss_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, title
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # No list anywhere in the section: use the section text as the gloss.
    section_text = clean_node(wxr, current_sense, level_node.children)
    if section_text != "":
        current_sense.glosses.append(section_text)
        word_entry.senses.append(current_sense)
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Extract senses from the items of one gloss list.

    Items are dispatched on their list marker (``sarg``):

    * marker ending in ``*``: each template in the item replaces
      ``parent_sense`` with a fresh ``Sense`` carrying that template's text
      (trailing ``:`` removed) as a raw tag.  If the entry is already tagged
      ``form-of`` or the section is "Grammatische Merkmale", the item is
      passed to ``process_form_of_list_item``; otherwise, when no template
      was found, the item text becomes a standalone sense.
    * marker ending in ``:``: a deep copy of ``parent_sense`` is filled with
      raw tags and gloss text from the item, appended to
      ``word_entry.senses`` when the gloss is non-empty, and used as the
      parent for nested lists.
    * anything else is reported through ``wxr.wtp.debug``.

    Returns the (possibly replaced) ``parent_sense`` so the caller can pass
    it on to the next list.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            # or form-of word
            if (
                "form-of" in word_entry.tags
                or section_title == "Grammatische Merkmale"
            ):
                process_form_of_list_item(wxr, word_entry, list_item_node)
            elif not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if len(gloss_text) > 0:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            # Work on a copy so sibling items do not see each other's tags.
            sense_data = parent_sense.model_copy(deep=True)
            gloss_nodes = []
            for gloss_node in list_item_node.children:
                if isinstance(gloss_node, TemplateNode):
                    if gloss_node.template_name == "K":
                        for (
                            k_arg,
                            k_arg_value,
                        ) in gloss_node.template_parameters.items():
                            if k_arg == "ft":
                                # "ft" value is kept as gloss text,
                                # followed by a colon
                                gloss_nodes.append(
                                    clean_node(wxr, None, k_arg_value)
                                )
                                gloss_nodes.append(":")
                            elif isinstance(k_arg, int):
                                # positional arguments are raw tags
                                raw_tag = clean_node(wxr, None, k_arg_value)
                                if raw_tag != "von":
                                    sense_data.raw_tags.append(raw_tag)
                        # the returned text is discarded; the call is made
                        # only for what clean_node records on sense_data
                        clean_node(wxr, sense_data, gloss_node)
                    elif gloss_node.template_name.endswith("."):
                        raw_tag = clean_node(
                            wxr, sense_data, gloss_node
                        ).removesuffix(":")
                        sense_data.raw_tags.append(raw_tag)
                    elif gloss_node.template_name in (
                        "QS Herkunft",
                        "QS Bedeutungen",
                    ):
                        # these templates contribute nothing to the gloss
                        continue
                    else:
                        gloss_nodes.append(gloss_node)
                elif (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.ITALIC
                ):
                    italic_text = clean_node(wxr, None, gloss_node)
                    if italic_text.endswith(":"):
                        # italic text ending in ":" is a comma-separated
                        # list of raw tags, not gloss text
                        for raw_tag in italic_text.removesuffix(":").split(
                            ", "
                        ):
                            raw_tag = raw_tag.strip()
                            if len(raw_tag) > 0:
                                sense_data.raw_tags.append(raw_tag)
                    else:
                        gloss_nodes.append(italic_text)
                elif not (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.LIST
                ):
                    # nested lists are handled by the recursion below
                    gloss_nodes.append(gloss_node)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            if sense_idx != "":
                # An index that does not start with a digit is prefixed
                # with the parent's index.  The original compared
                # ``len(...)`` with "" (int vs. str), which is always true.
                if (
                    not sense_idx[0].isnumeric()
                    and parent_sense is not None
                    and parent_sense.sense_index != ""
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif len(gloss_text.strip()) > 0:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if len(gloss_text) > 0:
                sense_data.glosses.append(gloss_text.removeprefix(", "))
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )

        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
            continue
    return parent_sense
# Part-of-speech words as they appear in plain text inside form-of glosses
# (usually in the genitive case), mapped to the POS data applied to the
# word entry.
FORM_OF_POS_STRINGS = {
    "Adjektivs": {"pos": "adj"},
    "Verbs": {"pos": "verb"},
    "Suffixes": {"pos": "suffix", "tags": ["morpheme"]},
    "Substantivs": {"pos": "noun"},
}
def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Turn one list item into a form-of sense on ``word_entry``.

    The form-of target is taken from the first bold node or "Ü" template
    (its second positional argument) that yields non-empty text.  If the
    item's cleaned text is empty nothing is recorded.  Otherwise the text is
    stored as the gloss, the item's plain-string children are scanned word by
    word for POS words and grammatical tags, the entry is tagged "form-of"
    and the sense is appended to ``word_entry.senses``.
    """
    from .section_titles import POS_SECTIONS

    sense = Sense()
    gloss_text = clean_node(wxr, None, list_item_node.children)

    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
        elif child.kind == NodeKind.BOLD:
            target = clean_node(wxr, None, child)
        else:
            continue
        if target != "":
            sense.form_of.append(AltForm(word=target))
            break

    if gloss_text == "":
        return

    sense.glosses.append(gloss_text)
    for text_node in list_item_node.children:
        if not isinstance(text_node, str) or len(text_node.strip()) == 0:
            continue
        matched_pos = {}
        for word in text_node.split():
            if word in FORM_OF_POS_STRINGS:
                matched_pos = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                matched_pos = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                translated = GRAMMATICAL_TAGS[word]
                if isinstance(translated, str):
                    sense.tags.append(translated)
                elif isinstance(translated, list):
                    sense.tags.extend(translated)
        # The last POS word matched in this string wins, and only when the
        # entry's POS is still "unknown".
        if len(matched_pos) > 0 and word_entry.pos == "unknown":
            word_entry.pos = matched_pos["pos"]
            word_entry.tags.extend(matched_pos.get("tags", []))

    if "form-of" not in word_entry.tags:
        word_entry.tags.append("form-of")
    word_entry.senses.append(sense)