Coverage for src/wiktextract/extractor/de/page.py: 77%
134 statements
from typing import Any

from mediawiki_langcodes import name_to_code
from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .etymology import extract_etymology
from .example import extract_examples
from .form import extracrt_form_section
from .gloss import extract_glosses
from .inflection import extract_inf_table_template
from .linkage import extract_linkages
from .models import Sense, WordEntry
from .pronunciation import extract_pronunciation_section
from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
from .translation import extract_translation
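
# parse_section() below dispatches on heading level: level-3 headings open a
# new POS entry, level-4 headings carry named subsections whose German titles
# are matched against the literals and the FORM_TITLES/LINKAGE_TITLES maps
# imported from section_titles.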
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    # Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
    # Level 3 headings are used to start POS sections like
    # === {{Wortart|Verb|Deutsch}} ===
    # title templates:
    # https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
    if level_node.kind == NodeKind.LEVEL3:
        process_pos_section(wxr, page_data, base_data, level_node)
    # Level 4 headings were introduced by overriding the default templates.
    # See overrides/de.json for details.
    elif level_node.kind == NodeKind.LEVEL4:
        section_name = clean_node(wxr, None, level_node.largs)
        wxr.wtp.start_subsection(section_name)
        if section_name in ("Bedeutungen", "Grammatische Merkmale"):
            extract_glosses(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
        elif wxr.config.capture_pronunciation and section_name == "Aussprache":
            extract_pronunciation_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
        elif wxr.config.capture_examples and section_name == "Beispiele":
            extract_examples(wxr, page_data, level_node)
        elif (
            wxr.config.capture_translations and section_name == "Übersetzungen"
        ):
            extract_translation(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
        elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
            extract_linkages(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                LINKAGE_TITLES[section_name],
            )
        elif wxr.config.capture_etymologies and section_name == "Herkunft":
            extract_etymology(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
        elif section_name in FORM_TITLES:
            extracrt_form_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
                FORM_TITLES[section_name],
            )
        elif section_name == "Worttrennung":
            extract_hyphenation_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
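
# "Wortart" template arguments that mark an inflected or derived form rather
# than a lemma; process_pos_section() maps these to pos="unknown" with a
# "form-of" tag.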
FORM_POS = {
    "Konjugierte Form",
    "Deklinierte Form",
    "Dekliniertes Gerundivum",
    "Komparativ",
    "Superlativ",
    "Supinum",
    "Partizip",
    "Partizip I",
    "Partizip II",
    "Erweiterter Infinitiv",
    "Adverbialpartizip",
    "Exzessiv",
    "Gerundium",
}
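
# "Wortart" arguments that do not name a usable part of speech; these
# templates are skipped by process_pos_section().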
IGNORE_POS = {"Albanisch", "Pseudopartizip", "Ajami"}
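
# Gender templates that may appear next to {{Wortart}} in a POS heading,
# mapped to the gender tags appended to the entry.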
GENDER_TEMPLATES = {
    "n": ["neuter"],
    "m": ["masculine"],
    "f": ["feminine"],
    "mn.": ["masculine", "neuter"],
    "nm": ["masculine", "neuter"],
    "nf": ["neuter", "feminine"],
    "fn": ["neuter", "feminine"],
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    "u": ["common-gender"],
    "un": ["common-gender", "neuter"],
}
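
# Handle a level-3 POS heading such as "=== {{Wortart|Verb|Deutsch}} ===":
# collect the POS and gender templates from the heading, copy base_data into
# page_data, then recurse into the level-4 subsections.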
def process_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    pos_data_list = []
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        if template_node.template_name == "Wortart":
            pos_argument = template_node.template_parameters.get(1, "").strip()
            if pos_argument in IGNORE_POS:
                continue
            elif pos_argument in FORM_POS:
                pos_data_list.append({"pos": "unknown", "tags": ["form-of"]})
            elif pos_argument in POS_SECTIONS:
                pos_data_list.append(POS_SECTIONS[pos_argument])
            elif pos_argument == "Gebundenes Lexem":
                if wxr.wtp.title.startswith("-") and wxr.wtp.title.endswith(
                    "-"
                ):
                    pos_data_list.append({"pos": "infix", "tags": ["morpheme"]})
                elif wxr.wtp.title.endswith("-"):
                    pos_data_list.append(
                        {"pos": "prefix", "tags": ["morpheme"]}
                    )
                elif wxr.wtp.title.startswith("-"):
                    pos_data_list.append(
                        {"pos": "suffix", "tags": ["morpheme"]}
                    )
            else:
                wxr.wtp.debug(
                    f"Unknown Wortart template POS argument: {pos_argument}",
                    sortid="extractor/de/page/process_pos_section/55",
                )
        elif template_node.template_name in GENDER_TEMPLATES:
            base_data.tags.extend(GENDER_TEMPLATES[template_node.template_name])

    if len(pos_data_list) == 0:
        return
    for pos_index, pos_data in enumerate(pos_data_list):
        pos = pos_data["pos"]
        pos_tags = pos_data.get("tags", [])
        base_data.tags.extend(pos_tags)
        if pos_index == 0:
            base_data.pos = pos
        elif pos != base_data.pos:
            base_data.other_pos.append(pos)
    page_data.append(base_data.model_copy(deep=True))
    wxr.wtp.start_subsection(clean_node(wxr, page_data[-1], level_node.largs))

    for level_4_node in level_node.find_child(NodeKind.LEVEL4):
        parse_section(wxr, page_data, base_data, level_4_node)

    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name.endswith("Übersicht"):
            extract_inf_table_template(wxr, page_data[-1], template_node)

    if not level_node.contain_node(NodeKind.LEVEL4):
        extract_glosses(wxr, page_data[-1], level_node)
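
# Parse one page of the German edition and return one dict per extracted word
# entry, with default values stripped via model_dump().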
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # The language sections are marked with
            # == <title> ({{Sprache|<lang>}}) ==
            # where <title> is the title of the page and <lang> is the
            # German name of the language of the section.
            if subtitle_template.template_name == "Sprache":
                lang_name = subtitle_template.template_parameters.get(1, "")
                lang_code = name_to_code(lang_name, "de")
                if lang_code == "":
                    lang_code = "unknown"
                    if lang_name != "Umschrift":
                        wxr.wtp.warning(
                            f"Unknown language: {lang_name}",
                            sortid="extractor/de/page/parse_page/76",
                        )
                if (
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                base_data = WordEntry(
                    lang=lang_name, lang_code=lang_code, word=page_title
                )
                clean_node(wxr, base_data, subtitle_template)
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, base_data, level3_node)
                for template_node in level2_node.find_child(NodeKind.TEMPLATE):
                    if template_node.template_name == "Ähnlichkeiten Umschrift":
                        process_umschrift_template(
                            wxr, page_data, base_data, template_node
                        )

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]
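
# The template's numbered parameters name the transliteration target pages; an
# optional matching "link<n>" parameter overrides the target (see the
# parameter loop below).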
def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    # https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    # soft-redirect template, similar to en edition's "zh-see"
    data = base_data.model_copy(deep=True)
    data.pos = "soft-redirect"
    for key, value in template_node.template_parameters.items():
        if isinstance(key, int):
            redirect_page = clean_node(wxr, None, value)
            link_arg = template_node.template_parameters.get(f"link{key}", "")
            link_text = clean_node(wxr, None, link_arg)
            if len(link_text) > 0:
                redirect_page = link_text
            if len(redirect_page) > 0:
                data.redirects.append(redirect_page)
    if len(data.redirects) > 0:
        page_data.append(data)
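
# "Worttrennung" list items look roughly like ":Bei·spiel, {{Pl.}} Bei·spie·le"
# (illustrative example); only the text before the first comma is kept, and a
# bare "?" placeholder is discarded.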
def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for node in list_item.children:
            if isinstance(node, str):
                if "," in node:
                    word_entry.hyphenation = node[: node.index(",")].strip()
                    break
                else:
                    word_entry.hyphenation += node.strip()
    if word_entry.hyphenation == "?":
        word_entry.hyphenation = ""