Coverage for src/wiktextract/extractor/de/page.py: 73%

174 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from typing import Any 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ...wxr_logging import logger 

9from .etymology import extract_etymology_section 

10from .example import extract_examples 

11from .form import extracrt_form_section, extract_transcription_section 

12from .gloss import extract_glosses 

13from .inflection import extract_inf_table_template, process_noun_table 

14from .linkage import extract_descendant_section, extract_linkages 

15from .models import AltForm, Sense, WordEntry 

16from .pronunciation import extract_pronunciation_section 

17from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS 

18from .tags import translate_raw_tags 

19from .translation import extract_translation 

20 

21 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch one heading section of a German Wiktionary page.

    Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
    Level 3 headings are used to start POS sections like
        === {{Wortart|Verb|Deutsch}} ===
    Level 4 headings were introduced by overriding the default templates;
    see overrides/de.json for details.
    Title templates:
    https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
    """
    if level_node.kind == NodeKind.LEVEL3:
        process_pos_section(wxr, page_data, base_data, level_node)
    elif level_node.kind == NodeKind.LEVEL4:
        section_name = clean_node(wxr, None, level_node.largs)
        wxr.wtp.start_subsection(section_name)
        # Sections appearing before the first POS entry attach to
        # ``base_data``; afterwards they attach to the newest entry.
        # (Hoisted: this conditional was repeated in every branch below;
        # nothing mutates ``page_data`` between here and the branch.)
        word_entry = page_data[-1] if len(page_data) > 0 else base_data
        if section_name in ("Bedeutungen", "Grammatische Merkmale"):
            extract_glosses(wxr, word_entry, level_node)
        elif wxr.config.capture_pronunciation and section_name == "Aussprache":
            extract_pronunciation_section(wxr, word_entry, level_node)
        elif wxr.config.capture_examples and section_name == "Beispiele":
            extract_examples(wxr, word_entry, level_node)
        elif (
            wxr.config.capture_translations and section_name == "Übersetzungen"
        ):
            extract_translation(wxr, word_entry, level_node)
        elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
            extract_linkages(
                wxr,
                word_entry,
                level_node,
                LINKAGE_TITLES[section_name],
            )
        elif wxr.config.capture_etymologies and section_name == "Herkunft":
            extract_etymology_section(wxr, word_entry, level_node)
        elif section_name in FORM_TITLES:
            # NOTE: "extracrt" typo is the real name exported by .form
            extracrt_form_section(
                wxr,
                word_entry,
                level_node,
                FORM_TITLES[section_name],
            )
        elif section_name == "Worttrennung":
            extract_hyphenation_section(wxr, word_entry, level_node)
        elif section_name == "Anmerkung":
            extract_note_section(wxr, word_entry, level_node)
        elif section_name == "Umschrift":
            extract_transcription_section(wxr, word_entry, level_node)
        elif section_name == "Entlehnungen":
            extract_descendant_section(wxr, word_entry, level_node)
        elif section_name not in [
            "Referenzen",
            "Ähnliche Wörter",
            "Bekannte Namensträger",
        ]:
            wxr.wtp.debug(
                f"Unknown section: {section_name}",
                sortid="extractor/de/page/parse_section/107",
            )

119 

120 

# "Wortart" template arguments that mark an inflected or derived form
# rather than a lemma; such entries get pos="unknown" plus a "form-of" tag
# in process_pos_section().
FORM_POS = {
    "Konjugierte Form",
    "Deklinierte Form",
    "Dekliniertes Gerundivum",
    "Komparativ",
    "Superlativ",
    "Supinum",
    "Partizip",
    "Partizip I",
    "Partizip II",
    "Erweiterter Infinitiv",
    "Adverbialpartizip",
    "Exzessiv",
    "Gerundium",
}

# "Wortart" arguments that are not usable POS values and are skipped.
IGNORE_POS = {"Albanisch", "Pseudopartizip", "Ajami"}

# Gender-marker template names in POS headings mapped to tag lists.
# NOTE(review): some orderings look asymmetric ("fn" -> neuter/feminine,
# "mf" -> feminine/masculine, same as "fm"); presumably intentional
# normalization — confirm against the tag conventions before changing.
GENDER_TEMPLATES = {
    "n": ["neuter"],
    "m": ["masculine"],
    "f": ["feminine"],
    "mn.": ["masculine", "neuter"],
    "nm": ["masculine", "neuter"],
    "nf": ["neuter", "feminine"],
    "fn": ["neuter", "feminine"],
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    "u": ["common-gender"],
    "un": ["common-gender", "neuter"],
}

152 

153 

def process_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Process a level-3 POS section headed by {{Wortart|...}} templates.

    A heading may contain several "Wortart" templates: the first usable one
    sets ``pos``/``pos_title`` on a fresh copy of ``base_data``; the rest
    land in ``other_pos``.  Gender templates and italic text in the heading
    become tags / raw tags, then the child level-4 sections, inflection
    tables, and (when no level-4 sections exist) inline glosses are parsed.
    """
    pos_data_list = []
    pos_title = ""
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        if template_node.template_name == "Wortart":
            pos_argument = template_node.template_parameters.get(1, "").strip()
            if pos_title == "":
                # The first argument becomes the displayed section title,
                # even when the argument itself is skipped below.
                pos_title = pos_argument
            if pos_argument in IGNORE_POS:
                continue
            elif pos_argument in FORM_POS:
                pos_data_list.append({"pos": "unknown", "tags": ["form-of"]})
            elif pos_argument in POS_SECTIONS:
                pos_data_list.append(POS_SECTIONS[pos_argument])
            elif pos_argument == "Gebundenes Lexem":
                # Bound lexeme: classify by hyphen position in the title.
                if wxr.wtp.title.startswith("-") and wxr.wtp.title.endswith(
                    "-"
                ):
                    pos_data_list.append({"pos": "infix", "tags": ["morpheme"]})
                elif wxr.wtp.title.endswith("-"):
                    pos_data_list.append(
                        {"pos": "prefix", "tags": ["morpheme"]}
                    )
                elif wxr.wtp.title.startswith("-"):
                    pos_data_list.append(
                        {"pos": "suffix", "tags": ["morpheme"]}
                    )
            else:
                wxr.wtp.debug(
                    f"Unknown Wortart template POS argument: {pos_argument}",
                    sortid="extractor/de/page/process_pos_section/55",
                )
                pos_data_list.append({"pos": "unknown"})

    if len(pos_data_list) == 0:
        return
    page_data.append(base_data.model_copy(deep=True))
    # All updates below target the entry just appended.  (Hoisted from the
    # repeated ``page_data[-1]``; the LEVEL4 path of parse_section() never
    # appends to page_data, so the tail entry stays the same object.)
    entry = page_data[-1]
    for pos_index, pos_data in enumerate(pos_data_list):
        pos = pos_data["pos"]
        for tag in pos_data.get("tags", []):
            if tag not in entry.tags:
                entry.tags.append(tag)
        if pos_index == 0:
            entry.pos = pos
            entry.pos_title = pos_title
        elif pos != entry.pos and pos not in entry.other_pos:
            entry.other_pos.append(pos)

    for node in level_node.find_content(NodeKind.TEMPLATE | NodeKind.ITALIC):
        if (
            isinstance(node, TemplateNode)
            and node.template_name in GENDER_TEMPLATES
        ):
            entry.tags.extend(GENDER_TEMPLATES[node.template_name])
        elif node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag != "":
                entry.raw_tags.append(raw_tag)

    wxr.wtp.start_subsection(clean_node(wxr, entry, level_node.largs))

    for level_4_node in level_node.find_child(NodeKind.LEVEL4):
        parse_section(wxr, page_data, base_data, level_4_node)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        extract_inf_table_template(wxr, entry, t_node)
        if t_node.template_name in ["Alte Schreibweise", "Alte Schreibung"]:
            extract_old_spell_template(wxr, entry, t_node)

    for table_node in level_node.find_child(NodeKind.TABLE):
        # e.g. page "beide"
        process_noun_table(wxr, entry, table_node)

    if not level_node.contain_node(NodeKind.LEVEL4):
        # No level-4 subsections: glosses sit directly under this heading.
        extract_glosses(wxr, entry, level_node)
    translate_raw_tags(entry)

235 

236 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one German Wiktionary page into a list of word-entry dicts.

    Each level-2 section headed by a {{Sprache|<lang>}} template starts a
    new language; its level-3 children become POS entries via
    parse_section().  Entries without senses get a "no-gloss" placeholder
    sense before serialization.
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # The language sections are marked with
            # == <title> ({{Sprache|<lang>}}) ==
            # where <title> is the title of the page and <lang> is the
            # German name of the language of the section.
            if subtitle_template.template_name == "Sprache":
                lang_name = subtitle_template.template_parameters.get(1, "")
                lang_code = name_to_code(lang_name, "de")
                if lang_code == "":
                    lang_code = "unknown"
                    # "Umschrift" (transliteration) pseudo-language sections
                    # are expected to have no code; don't warn about them.
                    if lang_name != "Umschrift":
                        wxr.wtp.warning(
                            f"Unknown language: {lang_name}",
                            sortid="extractor/de/page/parse_page/76",
                        )
                if (
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                base_data = WordEntry(
                    lang=lang_name,
                    lang_code=lang_code,
                    word=page_title,
                    pos="unknown",
                )
                # Collect categories attached to the language heading.
                clean_node(wxr, base_data, subtitle_template)
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, base_data, level3_node)
                for t_node in level2_node.find_child(NodeKind.TEMPLATE):
                    # Soft-redirect templates directly under the language
                    # heading (no POS section of their own).
                    if t_node.template_name == "Ähnlichkeiten Umschrift":
                        process_umschrift_template(
                            wxr, page_data, base_data, t_node
                        )
                    elif t_node.template_name in [
                        "Alte Schreibweise",
                        "Alte Schreibung",
                    ]:
                        extract_old_spell_template(wxr, base_data, t_node)
                        page_data.append(base_data)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]

294 

295 

def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Create a soft-redirect entry from an "Ähnlichkeiten Umschrift" template.

    https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    This is a soft-redirect template, similar to the English edition's
    "zh-see".  Each positional parameter names a redirect target; an
    optional "link<N>" parameter overrides the Nth positional value.
    """
    redirect_entry = base_data.model_copy(deep=True)
    redirect_entry.pos = "soft-redirect"
    for param_key, param_value in template_node.template_parameters.items():
        if not isinstance(param_key, int):
            continue  # only positional parameters name targets
        target = clean_node(wxr, None, param_value)
        link_value = template_node.template_parameters.get(
            f"link{param_key}", ""
        )
        link_target = clean_node(wxr, None, link_value)
        if len(link_target) > 0:
            target = link_target
        if len(target) > 0:
            redirect_entry.redirects.append(target)
    # Only emit the entry when at least one target was found.
    if len(redirect_entry.redirects) > 0:
        page_data.append(redirect_entry)

317 

318 

def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Extract the "Worttrennung" (hyphenation) section.

    Concatenates the plain-string children of each list item; anything
    after the first comma in an item (e.g. inflected forms) is dropped.
    A lone "?" placeholder is discarded.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            before_comma, comma, _ = child.partition(",")
            if comma != "":
                word_entry.hyphenation = before_comma.strip()
                break
            word_entry.hyphenation += child.strip()
    if word_entry.hyphenation == "?":
        word_entry.hyphenation = ""

332 

333 

def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect "Anmerkung" (note) list items into ``word_entry.notes``."""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Nested sub-lists are excluded from the note text.
        non_list_children = list(list_item.invert_find_child(NodeKind.LIST))
        note_text = clean_node(wxr, None, non_list_children)
        if len(note_text) > 0:
            word_entry.notes.append(note_text)

343 

344 

def extract_old_spell_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Handle "Alte Schreibweise"/"Alte Schreibung" templates.

    https://de.wiktionary.org/wiki/Vorlage:Alte_Schreibweise
    Marks the entry as an obsolete alternative spelling of the word given
    in the first template parameter.
    """
    modern_spelling = clean_node(
        wxr, None, t_node.template_parameters.get(1, "")
    )
    if modern_spelling != "":
        word_entry.senses.append(
            Sense(alt_of=[AltForm(word=modern_spelling)])
        )
        for new_tag in ("alt-of", "obsolete", "no-gloss"):
            if new_tag not in word_entry.tags:
                word_entry.tags.append(new_tag)
    # Also collect any categories the template adds.
    clean_node(wxr, word_entry, t_node)