Coverage for src/wiktextract/extractor/de/page.py: 77%

134 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from typing import Any 

2 

3from mediawiki_langcodes import name_to_code 

4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ...wxr_logging import logger 

9from .etymology import extract_etymology 

10from .example import extract_examples 

11from .form import extracrt_form_section 

12from .gloss import extract_glosses 

13from .inflection import extract_inf_table_template 

14from .linkage import extract_linkages 

15from .models import Sense, WordEntry 

16from .pronunciation import extract_pronunciation_section 

17from .section_titles import FORM_TITLES, LINKAGE_TITLES, POS_SECTIONS 

18from .translation import extract_translation 

19 

20 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Route a level 3 or level 4 heading node to the matching extractor.

    Page structure: https://de.wiktionary.org/wiki/Hilfe:Formatvorlage
    Level 3 headings start POS sections, for example
    ``=== {{Wortart|Verb|Deutsch}} ===``; the heading templates are listed at
    https://de.wiktionary.org/wiki/Kategorie:Wiktionary:Textbausteine
    Level 4 headings were introduced by overriding the default templates,
    see overrides/de.json for details.

    Most extractors receive the newest entry of ``page_data``, or
    ``base_data`` while ``page_data`` is still empty. Nodes of any other
    level and level 4 sections with an unhandled title are ignored.
    """
    if level_node.kind == NodeKind.LEVEL3:
        process_pos_section(wxr, page_data, base_data, level_node)
        return
    if level_node.kind != NodeKind.LEVEL4:
        return

    title = clean_node(wxr, None, level_node.largs)
    wxr.wtp.start_subsection(title)
    # Nothing below appends to page_data before an extractor runs, so the
    # entry to fill can be resolved once for every branch.
    target = page_data[-1] if page_data else base_data

    if title == "Bedeutungen" or title == "Grammatische Merkmale":
        extract_glosses(wxr, target, level_node)
    elif wxr.config.capture_pronunciation and title == "Aussprache":
        extract_pronunciation_section(wxr, target, level_node)
    elif wxr.config.capture_examples and title == "Beispiele":
        # The example extractor is handed the whole entry list.
        extract_examples(wxr, page_data, level_node)
    elif wxr.config.capture_translations and title == "Übersetzungen":
        extract_translation(wxr, target, level_node)
    elif wxr.config.capture_linkages and title in LINKAGE_TITLES:
        extract_linkages(wxr, target, level_node, LINKAGE_TITLES[title])
    elif wxr.config.capture_etymologies and title == "Herkunft":
        extract_etymology(wxr, target, level_node)
    elif title in FORM_TITLES:
        extracrt_form_section(wxr, target, level_node, FORM_TITLES[title])
    elif title == "Worttrennung":
        extract_hyphenation_section(wxr, target, level_node)

87 

88 

# First arguments of the "Wortart" heading template that denote an inflected
# or derived form; such sections are recorded with POS "unknown" and the
# "form-of" tag.
FORM_POS = {
    # generic inflected forms
    "Konjugierte Form",
    "Deklinierte Form",
    "Dekliniertes Gerundivum",
    # degrees of comparison
    "Komparativ",
    "Superlativ",
    # participles
    "Partizip",
    "Partizip I",
    "Partizip II",
    "Adverbialpartizip",
    # remaining form labels
    "Supinum",
    "Erweiterter Infinitiv",
    "Exzessiv",
    "Gerundium",
}

# First arguments of the "Wortart" heading template that are skipped.
IGNORE_POS = {"Albanisch", "Pseudopartizip", "Ajami"}

# Heading template names that mark grammatical gender, mapped to the tags
# added to the word entry.
GENDER_TEMPLATES = {
    # single gender
    "n": ["neuter"],
    "m": ["masculine"],
    "f": ["feminine"],
    # two genders; both spellings of a pair give the same tags
    "mn.": ["masculine", "neuter"],
    "nm": ["masculine", "neuter"],
    "nf": ["neuter", "feminine"],
    "fn": ["neuter", "feminine"],
    "fm": ["feminine", "masculine"],
    "mf": ["feminine", "masculine"],
    # common gender
    "u": ["common-gender"],
    "un": ["common-gender", "neuter"],
}

120 

121 

def process_pos_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> None:
    """Create a word entry for a level 3 POS section and parse its content.

    The heading templates are scanned first: every ``Wortart`` template
    contributes a POS (taken from its first argument) and gender templates
    contribute gender tags. If no POS is found, nothing is added to
    ``page_data``. Otherwise a new entry is appended and filled from the
    level 4 subsections, from direct child templates whose name ends with
    "Übersicht", and - when the section has no level 4 heading at all - from
    glosses placed directly in the section.

    ``base_data`` is only read: the new entry is built on a deep copy, so
    gender tags, POS tags and ``other_pos`` values collected for one section
    do not carry over into later sections that share the same ``base_data``.
    """
    pos_data_list = []
    gender_tags: list[str] = []
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        if template_node.template_name == "Wortart":
            pos_argument = template_node.template_parameters.get(1, "").strip()
            if pos_argument in IGNORE_POS:
                continue
            elif pos_argument in FORM_POS:
                pos_data_list.append({"pos": "unknown", "tags": ["form-of"]})
            elif pos_argument in POS_SECTIONS:
                pos_data_list.append(POS_SECTIONS[pos_argument])
            elif pos_argument == "Gebundenes Lexem":
                # Bound morphemes: the hyphens in the page title tell
                # whether it is an infix, a prefix or a suffix.
                if wxr.wtp.title.startswith("-") and wxr.wtp.title.endswith(
                    "-"
                ):
                    pos_data_list.append({"pos": "infix", "tags": ["morpheme"]})
                elif wxr.wtp.title.endswith("-"):
                    pos_data_list.append(
                        {"pos": "prefix", "tags": ["morpheme"]}
                    )
                elif wxr.wtp.title.startswith("-"):
                    pos_data_list.append(
                        {"pos": "suffix", "tags": ["morpheme"]}
                    )
            else:
                wxr.wtp.debug(
                    f"Unknown Wortart template POS argument: {pos_argument}",
                    sortid="extractor/de/page/process_pos_section/55",
                )
        elif template_node.template_name in GENDER_TEMPLATES:
            gender_tags.extend(GENDER_TEMPLATES[template_node.template_name])

    if len(pos_data_list) == 0:
        return

    # Work on a private copy so that the shared base entry stays clean for
    # the next POS section of the same language.
    entry = base_data.model_copy(deep=True)
    entry.tags.extend(gender_tags)
    for pos_index, pos_data in enumerate(pos_data_list):
        pos = pos_data["pos"]
        entry.tags.extend(pos_data.get("tags", []))
        if pos_index == 0:
            entry.pos = pos
        elif pos != entry.pos:
            entry.other_pos.append(pos)
    page_data.append(entry)
    wxr.wtp.start_subsection(clean_node(wxr, page_data[-1], level_node.largs))

    for level_4_node in level_node.find_child(NodeKind.LEVEL4):
        parse_section(wxr, page_data, base_data, level_4_node)

    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name.endswith("Übersicht"):
            extract_inf_table_template(wxr, page_data[-1], template_node)

    if not level_node.contain_node(NodeKind.LEVEL4):
        extract_glosses(wxr, page_data[-1], level_node)

181 

182 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dictionaries.

    Language sections are marked with ``== <title> ({{Sprache|<lang>}}) ==``
    where <title> is the title of the page and <lang> is the German name of
    the language of the section. For each such heading the level 3
    subsections are parsed, followed by any "Ähnlichkeiten Umschrift"
    template placed directly in the language section.

    Entries that end up without senses receive a single sense tagged
    "no-gloss". Each entry is dumped with ``exclude_defaults=True``.
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")

    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text, pre_expand=True)

    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for heading_template in level2_node.find_content(NodeKind.TEMPLATE):
            if heading_template.template_name != "Sprache":
                continue

            lang_name = heading_template.template_parameters.get(1, "")
            lang_code = name_to_code(lang_name, "de")
            if lang_code == "":
                lang_code = "unknown"
                # "Umschrift" sections have no language code; no warning
                # is emitted for them.
                if lang_name != "Umschrift":
                    wxr.wtp.warning(
                        f"Unknown language: {lang_name}",
                        sortid="extractor/de/page/parse_page/76",
                    )

            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue

            base_data = WordEntry(
                lang=lang_name, lang_code=lang_code, word=page_title
            )
            clean_node(wxr, base_data, heading_template)
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, base_data, level3_node)
            for child_template in level2_node.find_child(NodeKind.TEMPLATE):
                if child_template.template_name == "Ähnlichkeiten Umschrift":
                    process_umschrift_template(
                        wxr, page_data, base_data, child_template
                    )

    dumped_entries = []
    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
        dumped_entries.append(entry.model_dump(exclude_defaults=True))
    return dumped_entries

231 

232 

def process_umschrift_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Turn an "Ähnlichkeiten Umschrift" template into a soft-redirect entry.

    https://de.wiktionary.org/wiki/Vorlage:Ähnlichkeiten_Umschrift
    This is a soft-redirect template, similar to the en edition's "zh-see".

    Every positional argument names a redirect target; a non-empty
    ``link<N>`` argument replaces the target of positional argument N. The
    entry is a deep copy of ``base_data`` with POS "soft-redirect" and is
    appended to ``page_data`` only when at least one target was found.
    """
    redirect_entry = base_data.model_copy(deep=True)
    redirect_entry.pos = "soft-redirect"
    params = template_node.template_parameters
    for arg_name, arg_value in params.items():
        if not isinstance(arg_name, int):
            continue
        page_name = clean_node(wxr, None, arg_value)
        link_override = clean_node(wxr, None, params.get(f"link{arg_name}", ""))
        target = link_override if link_override else page_name
        if target:
            redirect_entry.redirects.append(target)

    if redirect_entry.redirects:
        page_data.append(redirect_entry)

254 

255 

def extract_hyphenation_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Fill ``word_entry.hyphenation`` from a "Worttrennung" section.

    Only the plain-text children of each list item are read. Text without a
    comma is stripped and appended to the current value. The first text
    piece containing a comma replaces the value with the stripped text
    before that comma and ends the scan of that list item. A resulting
    value of "?" is reset to the empty string.
    """
    for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        for child in list_item.children:
            if not isinstance(child, str):
                continue
            before_comma, comma, _ = child.partition(",")
            if comma:
                word_entry.hyphenation = before_comma.strip()
                break
            word_entry.hyphenation += child.strip()

    if word_entry.hyphenation == "?":
        word_entry.hyphenation = ""