Coverage for src/wiktextract/extractor/de/page.py: 75% (176 statements)
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from typing import Any
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ...wxr_logging import logger
9from .etymology import extract_etymology_section
10from .example import extract_examples
11from .form import extracrt_form_section, extract_transcription_section
12from .gloss import extract_glosses
13from .inflection import extract_inf_table_template, process_noun_table
14from .linkage import extract_descendant_section, extract_linkages
15from .models import AltForm, Hyphenation, Sense, WordEntry
16from .pronunciation import extract_pronunciation_section
17from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
18from .tags import translate_raw_tags
19from .translation import extract_translation
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch a heading node to the extractor for that section.

    Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
    Level 3 headings are used to start POS sections like
    === {{Wortart|Verb|Deutsch}} ===
    title templates:
    https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
    """
    if level_node.kind == NodeKind.LEVEL3:
        process_pos_section(wxr, page_data, base_data, level_node)
    # Level 4 headings were introduced by overriding the default templates.
    # See overrides/de.json for details.
    elif level_node.kind == NodeKind.LEVEL4:
        section_name = clean_node(wxr, None, level_node.largs)
        wxr.wtp.start_subsection(section_name)
        # All section extractors update the most recent word entry; fall
        # back to base_data when no POS section has been created yet.
        word_entry = page_data[-1] if len(page_data) > 0 else base_data
        if section_name in ("Bedeutungen", "Grammatische Merkmale"):
            extract_glosses(wxr, word_entry, level_node)
        elif wxr.config.capture_pronunciation and section_name == "Aussprache":
            extract_pronunciation_section(wxr, word_entry, level_node)
        elif wxr.config.capture_examples and section_name == "Beispiele":
            extract_examples(wxr, word_entry, level_node)
        elif (
            wxr.config.capture_translations and section_name == "Übersetzungen"
        ):
            extract_translation(wxr, word_entry, level_node)
        elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
            extract_linkages(
                wxr, word_entry, level_node, LINKAGE_TITLES[section_name]
            )
        elif wxr.config.capture_etymologies and section_name == "Herkunft":
            extract_etymology_section(wxr, word_entry, level_node)
        elif section_name in FORM_TITLES:
            extracrt_form_section(
                wxr, word_entry, level_node, FORM_TITLES[section_name]
            )
        elif section_name == "Worttrennung":
            extract_hyphenation_section(wxr, word_entry, level_node)
        elif section_name == "Anmerkung":
            extract_note_section(wxr, word_entry, level_node)
        elif section_name == "Umschrift":
            extract_transcription_section(wxr, word_entry, level_node)
        elif section_name == "Entlehnungen":
            extract_descendant_section(wxr, word_entry, level_node)
        elif section_name not in [
            "Referenzen",
            "Ähnliche Wörter",
            "Bekannte Namensträger",
        ]:
            # Known-but-unhandled titles above are silently skipped; anything
            # else is logged so new section types surface during extraction.
            wxr.wtp.debug(
                f"Unknown section: {section_name}",
                sortid="extractor/de/page/parse_section/107",
            )
# "Wortart" template arguments that mark an inflected or derived form
# rather than a lemma; process_pos_section() gives these entries
# pos="unknown" plus a "form-of" tag.
FORM_POS = {
    "Konjugierte Form",
    "Deklinierte Form",
    "Dekliniertes Gerundivum",
    "Komparativ",
    "Superlativ",
    "Supinum",
    "Partizip",
    "Partizip I",
    "Partizip II",
    "Erweiterter Infinitiv",
    "Adverbialpartizip",
    "Exzessiv",
    "Gerundium",
}

# "Wortart" arguments that are not usable POS values; these are skipped
# entirely in process_pos_section().
IGNORE_POS = {"Albanisch", "Pseudopartizip", "Ajami"}

# Gender-marker template names that can follow the POS templates in a
# level-3 heading, mapped to the wiktextract tags they contribute.
GENDER_TEMPLATES = {
    "n": ["neuter"],
    "m": ["masculine"],
    "f": ["feminine"],
    "mn.": ["masculine", "neuter"],
    "nm": ["masculine", "neuter"],
    "nf": ["neuter", "feminine"],
    "fn": ["neuter", "feminine"],
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    "u": ["common-gender"],
    "un": ["common-gender", "neuter"],
}
def process_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Create a new word entry from a level-3 POS heading.

    Parses the "Wortart" templates in the heading to pick the entry's
    POS and tags, then processes the gender templates, child level-4
    sections, inflection tables, and (when there are no level-4 child
    sections) inline glosses.
    """
    pos_data_list = []
    pos_titles = []
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        if template_node.template_name == "Wortart":
            # First positional argument is the German POS name.
            pos_argument = template_node.template_parameters.get(1, "").strip()
            pos_titles.append(pos_argument)
            # NOTE(review): the title is appended before the IGNORE_POS
            # check, so when an ignored argument occurs pos_titles and
            # pos_data_list indices no longer line up — confirm intended.
            if pos_argument in IGNORE_POS:
                continue
            elif pos_argument in FORM_POS:
                # Inflected/derived form: no real POS, tag it "form-of".
                pos_data_list.append({"pos": "unknown", "tags": ["form-of"]})
            elif pos_argument in POS_SECTIONS:
                pos_data_list.append(POS_SECTIONS[pos_argument])
            elif pos_argument == "Gebundenes Lexem":
                # Bound lexeme: classify by the hyphen placement in the
                # page title (-x-: infix, x-: prefix, -x: suffix).
                if wxr.wtp.title.startswith("-") and wxr.wtp.title.endswith(
                    "-"
                ):
                    pos_data_list.append({"pos": "infix", "tags": ["morpheme"]})
                elif wxr.wtp.title.endswith("-"):
                    pos_data_list.append(
                        {"pos": "prefix", "tags": ["morpheme"]}
                    )
                elif wxr.wtp.title.startswith("-"):
                    pos_data_list.append(
                        {"pos": "suffix", "tags": ["morpheme"]}
                    )
            else:
                wxr.wtp.debug(
                    f"Unknown Wortart template POS argument: {pos_argument}",
                    sortid="extractor/de/page/process_pos_section/55",
                )
                pos_data_list.append({"pos": "unknown"})

    if len(pos_data_list) == 0:
        return
    # Start a new entry for this POS section from the language-level data.
    page_data.append(base_data.model_copy(deep=True))
    for pos_data in pos_data_list:
        for tag in pos_data.get("tags", []):
            if tag not in page_data[-1].tags:
                page_data[-1].tags.append(tag)
    # With several "Wortart" templates, prefer the last known POS;
    # otherwise fall back to the first.
    if len(pos_data_list) > 1 and pos_data_list[-1]["pos"] != "unknown":
        page_data[-1].pos = pos_data_list[-1]["pos"]
        page_data[-1].pos_title = pos_titles[-1]
    else:
        page_data[-1].pos = pos_data_list[0]["pos"]
        page_data[-1].pos_title = pos_titles[0]
    # Non-selected POS values are kept as extra tags on the entry.
    for pos_data in pos_data_list:
        if (
            pos_data["pos"] not in [page_data[-1].pos, "unknown"]
            and pos_data["pos"] not in page_data[-1].tags
        ):
            page_data[-1].tags.append(pos_data["pos"])

    # Gender templates ("m", "f", ...) and italicized raw tags may also
    # appear in the heading content.
    for node in level_node.find_content(NodeKind.TEMPLATE | NodeKind.ITALIC):
        if (
            isinstance(node, TemplateNode)
            and node.template_name in GENDER_TEMPLATES
        ):
            page_data[-1].tags.extend(GENDER_TEMPLATES[node.template_name])
        elif node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag != "":
                page_data[-1].raw_tags.append(raw_tag)

    wxr.wtp.start_subsection(clean_node(wxr, page_data[-1], level_node.largs))

    for level_4_node in level_node.find_child(NodeKind.LEVEL4):
        parse_section(wxr, page_data, base_data, level_4_node)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        extract_inf_table_template(wxr, page_data[-1], t_node)
        if t_node.template_name in ["Alte Schreibweise", "Alte Schreibung"]:
            extract_old_spell_template(wxr, page_data[-1], t_node)

    for table_node in level_node.find_child(NodeKind.TABLE):
        # page "beide"
        process_noun_table(wxr, page_data[-1], table_node)

    # No level-4 subsections: glosses are directly under this heading.
    if not level_node.contain_node(NodeKind.LEVEL4):
        extract_glosses(wxr, page_data[-1], level_node)
    translate_raw_tags(page_data[-1])
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one de.wiktionary.org page into a list of word-entry dicts.

    Iterates the level-2 language sections, builds a base entry per
    language, and delegates level-3 POS sections to parse_section().
    Returns each WordEntry dumped with defaults excluded.
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # The language sections are marked with
            # == <title> ({{Sprache|<lang>}}) ==
            # where <title> is the title of the page and <lang> is the
            # German name of the language of the section.
            if subtitle_template.template_name == "Sprache":
                lang_name = subtitle_template.template_parameters.get(1, "")
                lang_code = name_to_code(lang_name, "de")
                if lang_code == "":
                    lang_code = "unknown"
                    # "Umschrift" (transcription) sections legitimately
                    # have no language code; don't warn about those.
                    if lang_name != "Umschrift":
                        wxr.wtp.warning(
                            f"Unknown language: {lang_name}",
                            sortid="extractor/de/page/parse_page/76",
                        )
                # Skip languages not requested by the configuration.
                if (
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                base_data = WordEntry(
                    lang=lang_name,
                    lang_code=lang_code,
                    word=page_title,
                    pos="unknown",
                )
                # Collect categories from the language heading itself.
                clean_node(wxr, base_data, subtitle_template)
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, base_data, level3_node)
                # Soft-redirect templates placed directly under the
                # language heading (no POS section).
                for t_node in level2_node.find_child(NodeKind.TEMPLATE):
                    if t_node.template_name == "Ähnlichkeiten Umschrift":
                        process_umschrift_template(
                            wxr, page_data, base_data, t_node
                        )
                    elif t_node.template_name in [
                        "Alte Schreibweise",
                        "Alte Schreibung",
                    ]:
                        extract_old_spell_template(wxr, base_data, t_node)
                        page_data.append(base_data)

    # Guarantee at least one sense per entry so downstream consumers can
    # rely on the field.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]
def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Turn a transcription-similarity template into a soft redirect.

    https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    soft-redirect template, similar to en edition's "zh-see"
    """
    entry = base_data.model_copy(deep=True)
    entry.pos = "soft-redirect"
    for arg_name, arg_value in template_node.template_parameters.items():
        # Only numbered arguments carry redirect targets.
        if not isinstance(arg_name, int):
            continue
        target = clean_node(wxr, None, arg_value)
        # An optional "link<N>" argument overrides the displayed target.
        link_arg = template_node.template_parameters.get(f"link{arg_name}", "")
        link_text = clean_node(wxr, None, link_arg)
        if len(link_text) > 0:
            target = link_text
        if len(target) > 0:
            entry.redirects.append(target)
    if len(entry.redirects) > 0:
        page_data.append(entry)
def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Read the "Worttrennung" list and record hyphenation parts.

    Plain-string fragments of each list item are concatenated; text
    before the first comma (if any) replaces the accumulated value for
    that item. Parts are split on the middle dot "·".
    """
    hyphen_text = ""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            head, comma, _rest = child.partition(",")
            if comma != "":
                hyphen_text = head.strip()
                break
            hyphen_text += child.strip()
    if hyphen_text not in ("?", ""):
        word_entry.hyphenations.append(
            Hyphenation(parts=hyphen_text.split("·"))
        )
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect each "Anmerkung" list item (minus nested lists) as a note."""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        item_children = list(
            list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
        )
        note_text = clean_node(wxr, None, item_children)
        if len(note_text) > 0:
            word_entry.notes.append(note_text)
def extract_old_spell_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Record an "Alte Schreibweise" (old spelling) soft link.

    https://de.wiktionary.org/wiki/Vorlage:Alte_Schreibweise
    """
    modern_spelling = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if modern_spelling != "":
        word_entry.senses.append(Sense(alt_of=[AltForm(word=modern_spelling)]))
        for new_tag in ("alt-of", "obsolete", "no-gloss"):
            if new_tag not in word_entry.tags:
                word_entry.tags.append(new_tag)
    # Expand the template once more to pick up its categories.
    clean_node(wxr, word_entry, t_node)