Coverage for src/wiktextract/extractor/de/gloss.py: 89%
126 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
3from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import AltForm, Sense, WordEntry
8from .tags import GRAMMATICAL_TAGS, translate_raw_tags
9from .utils import extract_sense_index
def extract_glosses(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
) -> None:
    """Collect the senses of one section into ``word_entry.senses``.

    Every ``LIST`` node yielded by ``level_node.find_child`` is handed to
    ``process_gloss_list_item``; the sense it returns is threaded into the
    next call as the parent sense.  When ``level_node.contain_node`` reports
    no ``LIST`` node at all, the cleaned text of the section's children is
    stored as a single gloss instead.
    """
    current_sense = Sense()
    title = clean_node(wxr, None, level_node.largs)
    for child_list in level_node.find_child(NodeKind.LIST):
        current_sense = process_gloss_list_item(
            wxr, word_entry, child_list, current_sense, title
        )

    if level_node.contain_node(NodeKind.LIST):
        return

    # No list in this section: fall back to the section text as the gloss.
    fallback_gloss = clean_node(wxr, current_sense, level_node.children)
    if fallback_gloss:
        current_sense.glosses.append(fallback_gloss)
        word_entry.senses.append(current_sense)
def process_gloss_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_node: WikiNode,
    parent_sense: Sense,
    section_title: str,
) -> Sense:
    """Turn the items of one gloss list into senses on ``word_entry``.

    Each ``LIST_ITEM`` child of ``list_node`` is dispatched as follows:

    * if ``word_entry`` is already tagged ``"form-of"`` or the section title
      is "Grammatische Merkmale", the item is passed to
      ``process_form_of_list_item``;
    * if the list marker (``sarg``) ends with ``*``, every template in the
      item replaces ``parent_sense`` with a fresh ``Sense`` carrying the
      template text as raw tag (so the last template wins); an item without
      any template becomes a sense of its own;
    * if the marker ends with ``:``, a gloss sense is built from a deep copy
      of ``parent_sense`` and nested lists are processed recursively with
      that new sense as their parent;
    * anything else is reported through ``wxr.wtp.debug``.

    Args:
        wxr: Extraction context, used for node cleaning and debug messages.
        word_entry: Entry whose ``senses`` list receives the new senses.
        list_node: The list node whose items are processed.
        parent_sense: Sense inherited by the ``:`` items of this list.
        section_title: Cleaned title of the enclosing section.

    Returns:
        The parent sense in effect after the last item, which differs from
        the argument only when a ``*`` item with templates was seen.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        item_type = list_item_node.sarg
        if (
            "form-of" in word_entry.tags
            or section_title == "Grammatische Merkmale"
        ):
            process_form_of_list_item(wxr, word_entry, list_item_node)
        elif item_type.endswith("*"):
            # only contains modifier template
            has_tag_template = False
            for template in list_item_node.find_child(NodeKind.TEMPLATE):
                raw_tag = clean_node(wxr, parent_sense, template).removesuffix(
                    ":"
                )
                # The following ":" items inherit from this new parent.
                parent_sense = Sense()
                parent_sense.raw_tags.append(raw_tag)
                has_tag_template = True
            if not has_tag_template:
                new_sense = Sense()
                gloss_text = clean_node(wxr, new_sense, list_item_node.children)
                if gloss_text:
                    new_sense.glosses.append(gloss_text)
                    word_entry.senses.append(new_sense)
        elif item_type.endswith(":"):
            sense_data = parent_sense.model_copy(deep=True)
            gloss_nodes = []
            for gloss_node in list_item_node.children:
                if isinstance(gloss_node, TemplateNode):
                    if gloss_node.template_name == "K":
                        extract_k_template(wxr, sense_data, gloss_node)
                    elif gloss_node.template_name.endswith("."):
                        # Templates whose name ends with "." are stored as
                        # raw tags instead of gloss text.
                        raw_tag = clean_node(
                            wxr, sense_data, gloss_node
                        ).removesuffix(":")
                        sense_data.raw_tags.append(raw_tag)
                    elif gloss_node.template_name in (
                        "QS Herkunft",
                        "QS Bedeutungen",
                    ):
                        # These templates contribute nothing to the gloss.
                        continue
                    else:
                        gloss_nodes.append(gloss_node)
                elif (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.ITALIC
                ):
                    italic_text = clean_node(wxr, None, gloss_node)
                    if italic_text.endswith(":") or (
                        italic_text.startswith("(")
                        and italic_text.endswith(")")
                    ):
                        # Italic text shaped like "tag:" or "(tag, tag)" is
                        # split into individual raw tags.
                        italic_text = italic_text.strip(": ")
                        if italic_text.startswith("(") and italic_text.endswith(
                            ")"
                        ):
                            italic_text = italic_text.strip("() ")
                        for raw_tag in re.split(r":|,", italic_text):
                            raw_tag = raw_tag.strip()
                            if raw_tag:
                                sense_data.raw_tags.append(raw_tag)
                    else:
                        # Ordinary italic text stays part of the gloss.
                        gloss_nodes.append(italic_text)
                elif not (
                    isinstance(gloss_node, WikiNode)
                    and gloss_node.kind == NodeKind.LIST
                ):
                    # Nested lists are handled by the recursion below.
                    gloss_nodes.append(gloss_node)

            gloss_text = clean_node(wxr, sense_data, gloss_nodes)
            sense_idx, gloss_text = extract_sense_index(gloss_text)
            gloss_text = gloss_text.replace("()", "").strip(":, \n")
            if sense_idx != "":
                # An index that does not start with a digit is appended to
                # the parent's index.  Compare the string itself: comparing
                # its length with "" is always true.
                if (
                    not sense_idx[0].isnumeric()
                    and parent_sense.sense_index != ""
                ):
                    sense_idx = parent_sense.sense_index + sense_idx
                sense_data.sense_index = sense_idx
            elif gloss_text:
                wxr.wtp.debug(
                    "Failed to extract sense number from gloss node",
                    sortid="extractor/de/glosses/extract_glosses/28",
                )

            if gloss_text:
                sense_data.glosses.append(gloss_text)
                translate_raw_tags(sense_data)
                word_entry.senses.append(sense_data)

            for sub_list_node in list_item_node.find_child(NodeKind.LIST):
                process_gloss_list_item(
                    wxr,
                    word_entry,
                    sub_list_node,
                    sense_data,
                    section_title,
                )
        else:
            wxr.wtp.debug(
                f"Unexpected list item in glosses: {list_item_node}",
                sortid="extractor/de/glosses/extract_glosses/29",
            )
    return parent_sense
# Part-of-speech words, usually in the genitive case, that appear as plain
# text inside form-of glosses.  Each entry maps the word to the "pos" value
# (and optional extra "tags") applied to the word entry.
FORM_OF_POS_STRINGS: dict[str, dict] = {
    "Adjektivs": {
        "pos": "adj",
    },
    "Verbs": {
        "pos": "verb",
    },
    "Suffixes": {
        "pos": "suffix",
        "tags": ["morpheme"],
    },
    "Substantivs": {
        "pos": "noun",
    },
}
def process_form_of_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item_node: WikiNode
) -> None:
    """Record one list item as a "form-of" sense of ``word_entry``.

    The first bold node, or "Ü" template with a non-empty second parameter,
    supplies the word this form refers to.  The plain-text pieces of the item
    are scanned word by word for part-of-speech names and grammatical tags.
    Nothing is stored when the cleaned text of the item is empty.
    """
    from .section_titles import POS_SECTIONS

    sense = Sense()
    gloss_text = clean_node(wxr, None, list_item_node.children)

    # Look for the first usable reference to the lemma.
    for child in list_item_node.find_child(NodeKind.BOLD | NodeKind.TEMPLATE):
        if isinstance(child, TemplateNode) and child.template_name == "Ü":
            # https://de.wiktionary.org/wiki/Vorlage:Ü
            target = clean_node(
                wxr, None, child.template_parameters.get(2, "")
            )
        elif (
            not (isinstance(child, TemplateNode) and child.template_name == "Ü")
            and child.kind == NodeKind.BOLD
        ):
            target = clean_node(wxr, None, child)
        else:
            continue
        if target != "":
            sense.form_of.append(AltForm(word=target))
            break

    if gloss_text == "":
        return

    sense.glosses.append(gloss_text)
    for text_node in list_item_node.children:
        if not isinstance(text_node, str) or len(text_node.strip()) == 0:
            continue
        pos_data = {}
        for word in text_node.split():
            if word in FORM_OF_POS_STRINGS:
                pos_data = FORM_OF_POS_STRINGS[word]
            elif word in POS_SECTIONS:
                pos_data = POS_SECTIONS[word]
            elif word in GRAMMATICAL_TAGS:
                translated = GRAMMATICAL_TAGS[word]
                if isinstance(translated, str):
                    sense.tags.append(translated)
                elif isinstance(translated, list):
                    sense.tags.extend(translated)
        # Only fill in the part of speech when none has been determined yet.
        if len(pos_data) > 0 and word_entry.pos == "unknown":
            word_entry.pos = pos_data["pos"]
            word_entry.tags.extend(pos_data.get("tags", []))

    # Mark both the entry and the sense as form-of, without duplicates.
    for tagged in (word_entry, sense):
        if "form-of" not in tagged.tags:
            tagged.tags.append("form-of")
    word_entry.senses.append(sense)
def extract_k_template(
    wxr: WiktextractContext, sense: Sense, t_node: TemplateNode
):
    """Add the arguments of a "K" template to ``sense.raw_tags``.

    Positional arguments and the ``ft`` argument are cleaned and appended as
    raw tags; empty values and the word "von" are skipped.  The template is
    then cleaned once more with ``sense`` passed to ``clean_node``.
    """
    # https://de.wiktionary.org/wiki/Vorlage:K
    ignored_values = ["von", ""]
    for key, value in t_node.template_parameters.items():
        if not isinstance(key, int) and key != "ft":
            continue
        tag_text = clean_node(wxr, None, value)
        if tag_text in ignored_values:
            continue
        sense.raw_tags.append(tag_text)
    clean_node(wxr, sense, t_node)