Coverage for src/wiktextract/extractor/de/gloss.py: 89%
123 statements
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
« prev ^ index » next coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1import re
3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import AltForm, Sense, WordEntry
8from .tags import GRAMMATICAL_TAGS, translate_raw_tags
9from .utils import extract_sense_index
def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Extract the senses of one gloss section into ``word_entry.senses``.

    Every list found directly under ``level_node`` is handed to
    ``process_gloss_list_item``. When the section contains no list node,
    the cleaned text of the whole section is stored as a single gloss.
    """
    heading = clean_node(wxr, None, level_node.largs)
    current_sense = Sense()
    for gloss_list in level_node.find_child(NodeKind.LIST):
        # Each call returns the parent sense the next list should inherit.
        current_sense = process_gloss_list_item(
            wxr, word_entry, gloss_list, current_sense, heading
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # No list in this section: fall back to the section body as one gloss.
    section_text = clean_node(wxr, current_sense, level_node.children)
    if section_text:
        current_sense.glosses.append(section_text)
        word_entry.senses.append(current_sense)
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Convert the items of one gloss list into senses on ``word_entry``.

    Each list item is dispatched on the entry state and on its list marker
    (``list_item_node.sarg``):

    * the entry is already tagged ``"form-of"`` or the section is
      "Grammatische Merkmale": handled by ``process_form_of_list_item``;
    * marker ends with ``*``: the item either carries modifier templates,
      each of which starts a fresh parent sense holding that raw tag, or
      plain text, which is stored as a sense of its own;
    * marker ends with ``:``: a regular gloss, built from a deep copy of the
      current parent sense; nested lists are processed recursively with
      the new sense as their parent;
    * anything else is reported through ``wxr.wtp.debug``.

    Returns the parent sense in effect after the last item, so the caller
    can pass it on to the next sibling list.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if (
            "form-of" in word_entry.tags
            or section_title == "Grammatische Merkmale"
        ):
            process_form_of_list_item(wxr, word_entry, list_item_node)
        elif item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                # NOTE(review): clean_node receives the previous parent
                # sense here, which is replaced by a fresh Sense right
                # after; confirm that this is intended.
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            if not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if len(gloss_text) > 0:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            sense_data = parent_sense.model_copy(deep=True)
            gloss_nodes = _collect_gloss_nodes(wxr, list_item_node, sense_data)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            if sense_idx != "":
                # An index that does not start with a digit is prefixed
                # with the parent's index when the parent has one. The
                # string itself is compared: the former
                # `len(...) != ""` test compared an int with a str and
                # was therefore always true.
                if (
                    not sense_idx[0].isnumeric()
                    and parent_sense.sense_index != ""
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif len(gloss_text) > 0:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if len(gloss_text) > 0:
                sense_data.glosses.append(gloss_text)
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            # Sub-lists inherit from the sense just built, even when it had
            # no gloss text of its own and was not appended above.
            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )
        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
    return parent_sense


def _collect_gloss_nodes(
    wxr: WiktextractContext, list_item_node: WikiNode, sense_data: Sense
) -> list:
    """Split a gloss list item's children into gloss nodes and raw tags.

    Raw tags found in templates and italic labels are appended to
    ``sense_data.raw_tags``; everything that belongs to the gloss text is
    returned as a list of nodes/strings. Nested lists are left out because
    the caller processes them separately.
    """
    gloss_nodes = []
    for gloss_node in list_item_node.children:
        if isinstance(gloss_node, TemplateNode):
            if gloss_node.template_name == "K":
                for (
                    k_arg,
                    k_arg_value,
                ) in gloss_node.template_parameters.items():
                    if k_arg == "ft":
                        # "ft" text stays in the gloss, followed by ":"
                        gloss_nodes.append(clean_node(wxr, None, k_arg_value))
                        gloss_nodes.append(":")
                    elif isinstance(k_arg, int):
                        # positional arguments are tags, except "von"
                        raw_tag = clean_node(wxr, None, k_arg_value)
                        if raw_tag != "von":
                            sense_data.raw_tags.append(raw_tag)
                # Return value is discarded; the call is kept for whatever
                # clean_node records on sense_data (presumably categories
                # -- TODO confirm).
                clean_node(wxr, sense_data, gloss_node)
            elif gloss_node.template_name.endswith("."):
                # templates whose name ends with "." expand to a tag label
                raw_tag = clean_node(wxr, sense_data, gloss_node).removesuffix(
                    ":"
                )
                sense_data.raw_tags.append(raw_tag)
            elif gloss_node.template_name in (
                "QS Herkunft",
                "QS Bedeutungen",
            ):
                continue
            else:
                gloss_nodes.append(gloss_node)
        elif (
            isinstance(gloss_node, WikiNode)
            and gloss_node.kind == NodeKind.ITALIC
        ):
            italic_text = clean_node(wxr, None, gloss_node)
            if italic_text.endswith(":") or (
                italic_text.startswith("(") and italic_text.endswith(")")
            ):
                # Italic text ending in ":" or wrapped in parentheses is a
                # label: split it on ":" and "," into separate raw tags.
                if not italic_text.endswith(":"):
                    italic_text = italic_text.strip("() ")
                for raw_tag in re.split(r":|,", italic_text.strip(": ")):
                    raw_tag = raw_tag.strip()
                    if len(raw_tag) > 0:
                        sense_data.raw_tags.append(raw_tag)
            else:
                gloss_nodes.append(italic_text)
        elif not (
            isinstance(gloss_node, WikiNode)
            and gloss_node.kind == NodeKind.LIST
        ):
            gloss_nodes.append(gloss_node)
    return gloss_nodes
# Plain-text part-of-speech words that appear in form-of glosses, usually in
# the genitive case. process_form_of_list_item looks up each
# whitespace-separated word of a gloss string here; a hit supplies the "pos"
# value (and optional extra "tags") copied onto a word entry whose pos is
# still "unknown".
FORM_OF_POS_STRINGS = {
    "Adjektivs": {"pos": "adj"},
    "Verbs": {"pos": "verb"},
    "Suffixes": {"pos": "suffix", "tags": ["morpheme"]},
    "Substantivs": {"pos": "noun"},
}
def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Record one form-of list item as a sense on ``word_entry``.

    The first bold node or "Ü" template that yields non-empty text becomes
    the sense's ``form_of`` target. If the item has any gloss text, plain
    string children are scanned word by word for part-of-speech names and
    grammatical tags, the entry is tagged "form-of", and the sense is
    appended. An item without gloss text adds nothing.
    """
    from .section_titles import POS_SECTIONS

    form_sense = Sense()
    gloss_text = clean_node(wxr, None, list_item_node.children)

    candidates = list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE)
    for candidate in candidates:
        if (
            isinstance(candidate, TemplateNode)
            and candidate.template_name == "Ü"
        ):
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target = clean_node(
                wxr, None, candidate.template_parameters.get(2, "")
            )
        elif candidate.kind == NodeKind.BOLD:
            target = clean_node(wxr, None, candidate)
        else:
            continue
        # Only the first non-empty target is kept.
        if target:
            form_sense.form_of.append(AltForm(word=target))
            break

    if gloss_text == "":
        return

    form_sense.glosses.append(gloss_text)
    for child in list_item_node.children:
        if not isinstance(child, str) or not child.strip():
            continue
        matched_pos = {}
        for word in child.split():
            # A later POS word in the same string overrides an earlier one.
            if word in FORM_OF_POS_STRINGS:
                matched_pos = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                matched_pos = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                translated = GRAMMATICAL_TAGS[word]
                if isinstance(translated, str):
                    form_sense.tags.append(translated)
                elif isinstance(translated, list):
                    form_sense.tags.extend(translated)
        # An already known part of speech is never overwritten.
        if matched_pos and word_entry.pos == "unknown":
            word_entry.pos = matched_pos["pos"]
            word_entry.tags.extend(matched_pos.get("tags", []))

    if "form-of" not in word_entry.tags:
        word_entry.tags.append("form-of")
    word_entry.senses.append(form_sense)