Coverage for src/wiktextract/extractor/ru/page.py: 72%

230 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1import re 

2from typing import Any, Optional 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...config import POSSubtitleData 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ...wxr_logging import logger 

15from .etymology import extract_etymology 

16from .gloss import extract_gloss, process_meaning_template 

17from .inflection import parse_adj_forms_table, parse_wikitext_forms_table 

18from .linkage import ( 

19 extract_linkages, 

20 extract_phrase_section, 

21 process_related_block_template, 

22) 

23from .models import AltForm, Form, Sense, Sound, WordEntry 

24from .pronunciation import ( 

25 extract_homophone_section, 

26 extract_pronunciation_section, 

27) 

28from .section_titles import LINKAGE_TITLES, POS_TEMPLATE_NAMES, POS_TITLES 

29from .tags import MORPHOLOGICAL_TEMPLATE_TAGS 

30from .translation import extract_translations 

31 

32 

def process_semantic_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    semantic_level_node: WikiNode,
):
    """Add senses found in the "значение" templates of a semantic section.

    Every template named "значение" nested anywhere inside a list that is a
    direct child of ``semantic_level_node`` is handed to
    ``process_meaning_template``. The resulting sense is appended to the
    last entry of ``page_data`` only when it has at least one gloss.
    """
    for list_node in semantic_level_node.find_child(NodeKind.LIST):
        for meaning_template in list_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if meaning_template.template_name != "значение":
                continue
            sense = process_meaning_template(
                wxr, None, page_data[-1], meaning_template
            )
            # Senses without any gloss text are dropped.
            if sense.glosses:
                page_data[-1].senses.append(sense)

48 

49 

# Morpheme kind -> aliases accepted for it, listed in the order in which the
# aliases end up as keys of MORPH_TEMPLATE_ARGS. "suffix" appears several
# times because postfix ("po"/"postfix") and ending ("e"/"ending") aliases
# are mapped to "suffix" as well.
_MORPH_KIND_ALIASES = (
    ("prefix", ("p", "prefix")),
    ("interfix", ("i", "interfix")),
    ("infix", ("in", "infix")),
    ("suffix", ("s", "suffix")),
    ("transfix", ("t", "transfix")),
    ("suffix", ("po", "postfix")),
    ("circumfix", ("c", "confix", "circumfix")),
    ("root", ("r",)),
    ("suffix", ("e", "ending")),
)

# Maps the value of the "тип" argument of the "morph" template to the POS
# string stored on the word entry (see get_pos_from_template).
MORPH_TEMPLATE_ARGS = {
    alias: kind for kind, aliases in _MORPH_KIND_ALIASES for alias in aliases
}

70 

71 

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> Optional[POSSubtitleData]:
    """Derive part-of-speech data from a single template.

    Three sources are tried in order:

    1. For the "morph" template, the "тип" argument is looked up in
       ``MORPH_TEMPLATE_ARGS`` and reported with the "morpheme" tag.
    2. For the heading templates "заголовок" / "з" that have a first
       positional argument, the first word of that argument (with
       surrounding parentheses removed) is looked up in ``POS_TITLES``.
    3. Otherwise, or when the lookups above find nothing, the lowercased
       template name is split on whitespace and then on "-", and each
       piece is looked up in ``POS_TEMPLATE_NAMES``.

    Returns ``None`` when nothing matches, or when a heading template's
    argument contains no text at all.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        # All keys of MORPH_TEMPLATE_ARGS are strings, so only a string can
        # match. NOTE(review): parameter values are presumably not always
        # plain strings (they are passed through clean_node elsewhere in
        # this file); the isinstance check keeps an unhashable value from
        # raising TypeError in the dict lookup.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_text = clean_node(
            wxr, None, template_node.template_parameters[1]
        ).strip("()")
        # split() yields no words for an empty string and also for text
        # that is only whitespace once the parentheses are removed (for
        # example "( )"); indexing [0] unconditionally would raise
        # IndexError in the latter case.
        words = pos_text.split()
        if not words:
            return None
        if words[0] in POS_TITLES:
            return POS_TITLES[words[0]]

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None

102 

103 

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> Optional[POSSubtitleData]:
    """Find part-of-speech data for a section.

    Templates that are direct children of ``level_node`` are checked first,
    then templates in the level node's content, each via
    ``get_pos_from_template``. If no template yields a result, the cleaned
    text of all children that are not themselves level nodes is lowercased
    and searched for any key of ``POS_TITLES`` as a substring; the first
    key found (in ``POS_TITLES`` iteration order) wins.

    Returns ``None`` when no part of speech can be determined.
    """
    # POS text could also be in the level node content, so that is searched
    # after the child templates. The finders are called one at a time, so
    # the content is only scanned when the children gave no result.
    for find_templates in (level_node.find_child, level_node.find_content):
        for template_node in find_templates(NodeKind.TEMPLATE):
            pos_data = get_pos_from_template(wxr, template_node)
            if pos_data is not None:
                return pos_data

    # Search for POS in section text. The text is lowercased once here
    # rather than once per POS title inside the loop.
    text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for pos_string, pos_data in POS_TITLES.items():
        if pos_string in text:
            return pos_data
    return None

124 

125 

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, tags and forms of the last entry from a morphology section.

    The POS found by ``get_pos`` (if any) is written to ``page_data[-1]``.
    Each template that is a direct child of ``level_node`` is then expanded
    and used in three ways:

    * templates whose name starts with "прил", "сущ" or "гл" have their
      expansion passed to the matching forms-table parser;
    * the cleaned text of every top-level node of the expansion is split on
      commas, and pieces found in ``MORPHOLOGICAL_TEMPLATE_TAGS`` add their
      tag(s) to the entry;
    * the "степень" and "соотв" template parameters, when present, are
      split on commas and each non-empty value is added as a form.
    """
    # Template parameter name -> tag given to the forms listed in it.
    form_param_tags = {
        "степень": "comparative",  # Шаблон:inflection/ru/adj
        "соотв": "perfective",  # Шаблон:Гл-блок
    }

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))

    for template in level_node.find_child(NodeKind.TEMPLATE):
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(template), expand_all=True
        )
        name = template.template_name
        if name.startswith("прил"):
            parse_adj_forms_table(wxr, page_data[-1], expanded)
        elif name.startswith(("сущ", "гл")):
            parse_wikitext_forms_table(wxr, page_data[-1], expanded)

        for expanded_child in expanded.children:
            child_text = clean_node(wxr, page_data[-1], expanded_child)
            for piece in child_text.split(","):
                key = piece.strip()
                if key not in MORPHOLOGICAL_TEMPLATE_TAGS:
                    continue
                # A table value is either one tag or a list of tags.
                tr_tag = MORPHOLOGICAL_TEMPLATE_TAGS[key]
                if isinstance(tr_tag, str):
                    page_data[-1].tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    page_data[-1].tags.extend(tr_tag)

        for param, tag in form_param_tags.items():
            if param not in template.template_parameters:
                continue
            forms_text = clean_node(
                wxr, None, template.template_parameters[param]
            )
            for raw_form in forms_text.split(","):
                form = raw_form.strip()
                if form == "":
                    continue
                page_data[-1].forms.append(Form(form=form, tags=[tag]))

167 

168 

def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The lowercased, cleaned section title selects the handler. Handlers for
    pronunciation, linkages, etymology and translations only run when the
    matching ``wxr.config.capture_*`` flag is set; when the flag is off the
    title keeps falling through the remaining checks. Titles that match
    nothing are reported with ``wxr.wtp.debug``. Afterwards every child
    level node is parsed recursively and the section-end templates of
    ``level_node`` are processed into the last entry of ``page_data``.
    """
    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    config = wxr.config

    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    # Bibliography and "other" sections are deliberately skipped.
    ignored_titles = ("библиография", "прочее")

    if section_title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "произношение" and config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif section_title in ("значение", "значения"):
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "родственные слова" and config.capture_linkages:
        # Word family
        for template_node in level_node.find_child(NodeKind.TEMPLATE):
            if template_node.template_name == "родств-блок":
                process_related_block_template(
                    wxr, page_data[-1], template_node
                )
    elif section_title == "этимология" and config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif section_title in phrase_titles and config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
    elif section_title in translation_titles and config.capture_translations:
        extract_translations(wxr, page_data[-1], level_node)
    elif section_title in LINKAGE_TITLES and config.capture_linkages:
        extract_linkages(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
    elif section_title in ignored_titles:
        pass
    elif section_title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif section_title == "омофоны" and config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    else:
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, child_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

240 

241 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Russian Wiktionary page into a list of word-entry dicts.

    Each level-1 node is one language section. Its language code is the
    name of the first template in the heading (stripped of spaces and
    hyphens), or "unknown" when there is none; sections whose code is not
    in ``wxr.config.capture_language_codes`` (when that is set) are
    skipped. A base entry is built per language and copied for each
    level-2 node, whose level-3 sections are parsed into the copy. Copies
    that are still equal to the base entry, or that had no level-3
    section, are dropped and the node is scanned by
    ``extract_low_quality_page`` instead. Entries left without senses get
    a single sense tagged "no-gloss".

    Returns the entries dumped with ``model_dump(exclude_defaults=True)``.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # Only the first heading template is consulted for the code.
        first_template_name = next(
            (
                heading_template.template_name.strip(" -")
                for heading_template in level1_node.find_content(
                    NodeKind.TEMPLATE
                )
            ),
            "",
        )
        lang_code = first_template_name if first_template_name != "" else "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        heading_categories = {"categories": []}
        lang_name = clean_node(wxr, heading_categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(heading_categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        pos_data = get_pos(wxr, level1_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            seen_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                seen_level3 = True
            # The copy gained nothing from its sections: discard it and
            # look for "Форма-" templates in this node instead.
            if page_data[-1] == base_data or not seen_level3:
                page_data.pop()
                extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_level_nodes = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for position, other_level_node in enumerate(other_level_nodes):
            # Before the first non-level-2 section, make sure the last
            # entry belongs to the current language.
            if position == 0 and (
                not page_data
                or page_data[-1].lang_code != base_data.lang_code
            ):
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_level_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

316 

317 

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates found outside of subsections.

    Only the children of ``level_node`` that are not level nodes are
    inspected. A child that is itself a template whose name starts with
    "Форма-" is processed directly; any other wiki node is searched
    recursively for such templates. Each match is passed to
    ``process_form_template``.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            form_templates = (node,)
        elif isinstance(node, WikiNode):
            # Lazy on purpose: the subtree is walked while the matches are
            # being processed, not beforehand.
            form_templates = (
                nested
                for nested in node.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)

335 

336 

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a word entry from a "Форма-" template and append it.

    ``base_data`` gets the POS derived from the template (if any) and is
    then deep-copied. Every non-empty list item in the expanded template
    becomes a sense of the copy; when the template names a base word
    (parameter "база", else the first positional parameter) the sense is
    marked as a form of it. The "МФА" parameter, when non-empty, is added
    as an IPA sound. The copy is appended to ``page_data`` only if it ended
    up with at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    form_of = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    current_data = base_data.model_copy(deep=True)
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if form_of:
            sense.form_of.append(AltForm(word=form_of))
            sense.tags.append("form-of")
        current_data.senses.append(sense)

    if ipa:
        current_data.sounds.append(Sound(ipa=ipa))
    if current_data.senses or current_data.sounds:
        # The return value is unused; the call is made with the new entry
        # as its target (presumably so template categories are recorded
        # on it, as parse_page does with its categories dict).
        clean_node(wxr, current_data, template_node)
        page_data.append(current_data)

378 

379 

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add the links of a Latin-script section as "romanization" forms.

    The cleaned text of every link that is a direct child of
    ``level_node`` is appended to ``word_entry.forms``; links whose text
    is empty are skipped.
    """
    link_texts = (
        clean_node(wxr, None, link_node)
        for link_node in level_node.find_child(NodeKind.LINK)
    )
    for form_text in link_texts:
        if form_text == "":
            continue
        word_entry.forms.append(Form(form=form_text, tags=["romanization"]))

388 

389 

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle category and "zh-forms" templates directly under a section.

    Templates that are direct children of ``level_node`` and whose name is
    one of the known category-link templates are passed to ``clean_node``
    with ``word_entry`` as the target; a "zh-forms" template is passed to
    ``extract_zh_forms_template``. Other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = frozenset(
        (
            "-ание",
            "-атель",
            "-ация",
            "-ение",
            "-ка",
            "длина слова",
            "Категория",
            "Омонимы",
            "forms",
            "multilang",
        )
    )
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        name = template_node.template_name
        if name in category_template_names:
            clean_node(wxr, word_entry, template_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template_node)

411 

412 

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read Chinese forms and the literal meaning from a "zh-forms" template.

    Only named (string-keyed) parameters are considered. Parameters named
    "s" or "t", optionally followed by digits, add a form tagged
    "Simplified Chinese" or "Traditional Chinese" respectively, unless the
    cleaned value is empty or equal to the page title. The "lit" parameter
    sets ``base_data.literal_meaning``.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_params = (
        (r"s\d*", "Simplified Chinese"),
        (r"t\d*", "Traditional Chinese"),
    )
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        for name_pattern, script_tag in script_params:
            if re.fullmatch(name_pattern, p_name) is None:
                continue
            form_data = Form(
                form=clean_node(wxr, None, p_value), tags=[script_tag]
            )
            if form_data.form not in ["", wxr.wtp.title]:
                base_data.forms.append(form_data)
            break
        else:
            # Reached only when the name matched neither script pattern.
            if p_name == "lit":
                base_data.literal_meaning = clean_node(wxr, None, p_value)