Coverage for src/wiktextract/extractor/de/page.py: 75% (176 statements)
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1from typing import Any
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ...wxr_logging import logger
9from .etymology import extract_etymology_section
10from .example import extract_examples
11from .form import extracrt_form_section, extract_transcription_section
12from .gloss import extract_glosses
13from .inflection import extract_inf_table_template, process_noun_table
14from .linkage import extract_descendant_section, extract_linkages
15from .models import AltForm, Hyphenation, Sense, WordEntry
16from .pronunciation import extract_pronunciation_section
17from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS
18from .tags import translate_raw_tags
19from .translation import extract_translation
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Dispatch a heading node to the extractor for that section.

    Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
    Level 3 headings are used to start POS sections like
    === {{Wortart|Verb|Deutsch}} ===
    title templates:
    https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
    """
    if level_node.kind == NodeKind.LEVEL3:
        process_pos_section(wxr, page_data, base_data, level_node)
    # Level 4 headings were introduced by overriding the default templates.
    # See overrides/de.json for details.
    elif level_node.kind == NodeKind.LEVEL4:
        section_name = clean_node(wxr, None, level_node.largs)
        wxr.wtp.start_subsection(section_name)
        # All section extractors update the most recent word entry; fall
        # back to base_data when no POS section has been created yet.
        word_entry = page_data[-1] if len(page_data) > 0 else base_data
        if section_name in ("Bedeutungen", "Grammatische Merkmale"):
            extract_glosses(wxr, word_entry, level_node)
        elif wxr.config.capture_pronunciation and section_name == "Aussprache":
            extract_pronunciation_section(wxr, word_entry, level_node)
        elif wxr.config.capture_examples and section_name == "Beispiele":
            extract_examples(wxr, word_entry, level_node)
        elif (
            wxr.config.capture_translations and section_name == "Übersetzungen"
        ):
            extract_translation(wxr, word_entry, level_node)
        elif wxr.config.capture_linkages and section_name in LINKAGE_TITLES:
            extract_linkages(
                wxr, word_entry, level_node, LINKAGE_TITLES[section_name]
            )
        elif wxr.config.capture_etymologies and section_name == "Herkunft":
            extract_etymology_section(wxr, word_entry, level_node)
        elif section_name in FORM_TITLES:
            extracrt_form_section(
                wxr, word_entry, level_node, FORM_TITLES[section_name]
            )
        elif section_name == "Worttrennung":
            extract_hyphenation_section(wxr, word_entry, level_node)
        elif section_name == "Anmerkung":
            extract_note_section(wxr, word_entry, level_node)
        elif section_name == "Umschrift":
            extract_transcription_section(wxr, word_entry, level_node)
        elif section_name == "Entlehnungen":
            extract_descendant_section(wxr, word_entry, level_node)
        elif section_name not in [
            "Referenzen",
            "Ähnliche Wörter",
            "Bekannte Namensträger",
        ]:
            # Known-but-unhandled titles above are silently skipped; anything
            # else is logged so new section types surface during extraction.
            wxr.wtp.debug(
                f"Unknown section: {section_name}",
                sortid="extractor/de/page/parse_section/107",
            )
# "Wortart" template arguments that mark an inflected or derived form
# rather than a lemma; process_pos_section() gives these entries
# pos="unknown" plus a "form-of" tag.
FORM_POS = {
    "Konjugierte Form",
    "Deklinierte Form",
    "Dekliniertes Gerundivum",
    "Komparativ",
    "Superlativ",
    "Supinum",
    "Partizip",
    "Partizip I",
    "Partizip II",
    "Erweiterter Infinitiv",
    "Adverbialpartizip",
    "Exzessiv",
    "Gerundium",
}

# "Wortart" arguments that are not usable POS values; these are skipped
# entirely in process_pos_section().
IGNORE_POS = {"Albanisch", "Pseudopartizip", "Ajami"}

# Gender-marker template names that can follow the POS templates in a
# level-3 heading, mapped to the wiktextract tags they contribute.
GENDER_TEMPLATES = {
    "n": ["neuter"],
    "m": ["masculine"],
    "f": ["feminine"],
    "mn.": ["masculine", "neuter"],
    "nm": ["masculine", "neuter"],
    "nf": ["neuter", "feminine"],
    "fn": ["neuter", "feminine"],
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    "u": ["common-gender"],
    "un": ["common-gender", "neuter"],
}
def process_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Create a new word entry from a level-3 POS heading.

    Parses the "Wortart" templates in the heading to pick the entry's
    POS and tags, then processes the gender templates, child level-4
    sections, inflection tables, and (when there are no level-4 child
    sections) inline glosses.
    """
    pos_data_list = []
    pos_titles = []
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        if template_node.template_name == "Wortart":
            # First positional argument is the German POS name.
            pos_argument = template_node.template_parameters.get(1, "").strip()
            pos_titles.append(pos_argument)
            # NOTE(review): the title is appended before the IGNORE_POS
            # check, so when an ignored argument occurs pos_titles and
            # pos_data_list indices no longer line up — confirm intended.
            if pos_argument in IGNORE_POS:
                continue
            elif pos_argument in FORM_POS:
                # Inflected/derived form: no real POS, tag it "form-of".
                pos_data_list.append({"pos": "unknown", "tags": ["form-of"]})
            elif pos_argument in POS_SECTIONS:
                pos_data_list.append(POS_SECTIONS[pos_argument])
            elif pos_argument == "Gebundenes Lexem":
                # Bound lexeme: classify by the hyphen placement in the
                # page title (-x-: infix, x-: prefix, -x: suffix).
                if wxr.wtp.title.startswith("-") and wxr.wtp.title.endswith(
                    "-"
                ):
                    pos_data_list.append({"pos": "infix", "tags": ["morpheme"]})
                elif wxr.wtp.title.endswith("-"):
                    pos_data_list.append(
                        {"pos": "prefix", "tags": ["morpheme"]}
                    )
                elif wxr.wtp.title.startswith("-"):
                    pos_data_list.append(
                        {"pos": "suffix", "tags": ["morpheme"]}
                    )
            else:
                wxr.wtp.debug(
                    f"Unknown Wortart template POS argument: {pos_argument}",
                    sortid="extractor/de/page/process_pos_section/55",
                )
                pos_data_list.append({"pos": "unknown"})

    if len(pos_data_list) == 0:
        return
    # Start a new entry for this POS section from the language-level data.
    page_data.append(base_data.model_copy(deep=True))
    for pos_data in pos_data_list:
        for tag in pos_data.get("tags", []):
            if tag not in page_data[-1].tags:
                page_data[-1].tags.append(tag)
    # With several "Wortart" templates, prefer the last known POS;
    # otherwise fall back to the first.
    if len(pos_data_list) > 1 and pos_data_list[-1]["pos"] != "unknown":
        page_data[-1].pos = pos_data_list[-1]["pos"]
        page_data[-1].pos_title = pos_titles[-1]
    else:
        page_data[-1].pos = pos_data_list[0]["pos"]
        page_data[-1].pos_title = pos_titles[0]
    # Non-selected POS values are kept as extra tags on the entry.
    for pos_data in pos_data_list:
        if (
            pos_data["pos"] not in [page_data[-1].pos, "unknown"]
            and pos_data["pos"] not in page_data[-1].tags
        ):
            page_data[-1].tags.append(pos_data["pos"])

    # Gender templates ("m", "f", ...) and italicized raw tags may also
    # appear in the heading content.
    for node in level_node.find_content(NodeKind.TEMPLATE | NodeKind.ITALIC):
        if (
            isinstance(node, TemplateNode)
            and node.template_name in GENDER_TEMPLATES
        ):
            page_data[-1].tags.extend(GENDER_TEMPLATES[node.template_name])
        elif node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            if raw_tag != "":
                page_data[-1].raw_tags.append(raw_tag)

    wxr.wtp.start_subsection(clean_node(wxr, page_data[-1], level_node.largs))

    for level_4_node in level_node.find_child(NodeKind.LEVEL4):
        parse_section(wxr, page_data, base_data, level_4_node)

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        extract_inf_table_template(wxr, page_data[-1], t_node)
        if t_node.template_name in ["Alte Schreibweise", "Alte Schreibung"]:
            extract_old_spell_template(wxr, page_data[-1], t_node)

    for table_node in level_node.find_child(NodeKind.TABLE):
        # page "beide"
        process_noun_table(wxr, page_data[-1], table_node)

    # No level-4 subsections: glosses are directly under this heading.
    if not level_node.contain_node(NodeKind.LEVEL4):
        extract_glosses(wxr, page_data[-1], level_node)
    translate_raw_tags(page_data[-1])
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one de.wiktionary.org page into a list of word-entry dicts.

    Iterates the level-2 language sections, builds a base entry per
    language, and delegates level-3 POS sections to parse_section().
    Returns each WordEntry dumped with defaults excluded.
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # The language sections are marked with
            # == <title> ({{Sprache|<lang>}}) ==
            # where <title> is the title of the page and <lang> is the
            # German name of the language of the section.
            if subtitle_template.template_name == "Sprache":
                lang_name = subtitle_template.template_parameters.get(1, "")
                lang_code = name_to_code(lang_name, "de")
                if lang_code == "":
                    lang_code = "unknown"
                    # "Umschrift" (transcription) sections legitimately
                    # have no language code; don't warn about those.
                    if lang_name != "Umschrift":
                        wxr.wtp.warning(
                            f"Unknown language: {lang_name}",
                            sortid="extractor/de/page/parse_page/76",
                        )
                # Skip languages not requested by the configuration.
                if (
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                base_data = WordEntry(
                    lang=lang_name,
                    lang_code=lang_code,
                    word=page_title,
                    pos="unknown",
                )
                # Collect categories from the language heading itself.
                clean_node(wxr, base_data, subtitle_template)
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    parse_section(wxr, page_data, base_data, level3_node)
                # Soft-redirect templates placed directly under the
                # language heading (no POS section).
                for t_node in level2_node.find_child(NodeKind.TEMPLATE):
                    if t_node.template_name == "Ähnlichkeiten Umschrift":
                        process_umschrift_template(
                            wxr, page_data, base_data, t_node
                        )
                    elif t_node.template_name in [
                        "Alte Schreibweise",
                        "Alte Schreibung",
                    ]:
                        extract_old_spell_template(wxr, base_data, t_node)
                        page_data.append(base_data)

    # Guarantee at least one sense per entry so downstream consumers can
    # rely on the field.
    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [d.model_dump(exclude_defaults=True) for d in page_data]
def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Turn a transcription-similarity template into a soft redirect.

    https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    soft-redirect template, similar to en edition's "zh-see"
    """
    entry = base_data.model_copy(deep=True)
    entry.pos = "soft-redirect"
    for arg_name, arg_value in template_node.template_parameters.items():
        # Only numbered arguments carry redirect targets.
        if not isinstance(arg_name, int):
            continue
        target = clean_node(wxr, None, arg_value)
        # An optional "link<N>" argument overrides the displayed target.
        link_arg = template_node.template_parameters.get(f"link{arg_name}", "")
        link_text = clean_node(wxr, None, link_arg)
        if len(link_text) > 0:
            target = link_text
        if len(target) > 0:
            entry.redirects.append(target)
    if len(entry.redirects) > 0:
        page_data.append(entry)
def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Read the "Worttrennung" list and record hyphenation parts.

    Plain-string fragments of each list item are concatenated; text
    before the first comma (if any) replaces the accumulated value for
    that item. Parts are split on the middle dot "·".
    """
    hyphen_text = ""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            head, comma, _rest = child.partition(",")
            if comma != "":
                hyphen_text = head.strip()
                break
            hyphen_text += child.strip()
    if hyphen_text not in ("?", ""):
        word_entry.hyphenations.append(
            Hyphenation(parts=hyphen_text.split("·"))
        )
def extract_note_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Collect each "Anmerkung" list item (minus nested lists) as a note."""
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        item_children = list(
            list_item.invert_find_child(NodeKind.LIST, include_empty_str=True)
        )
        note_text = clean_node(wxr, None, item_children)
        if len(note_text) > 0:
            word_entry.notes.append(note_text)
def extract_old_spell_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Record an "Alte Schreibweise" (old spelling) soft link.

    https://de.wiktionary.org/wiki/Vorlage:Alte_Schreibweise
    """
    modern_spelling = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    if modern_spelling != "":
        word_entry.senses.append(Sense(alt_of=[AltForm(word=modern_spelling)]))
        for new_tag in ("alt-of", "obsolete", "no-gloss"):
            if new_tag not in word_entry.tags:
                word_entry.tags.append(new_tag)
    # Expand the template once more to pick up its categories.
    clean_node(wxr, word_entry, t_node)