Coverage for src/wiktextract/extractor/ru/page.py: 77%

246 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-18 10:14 +0000

1import re 

2from typing import Any 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...config import POSSubtitleData 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .etymology import extract_etymology 

17from .gloss import extract_gloss, process_meaning_template 

18from .inflection import ( 

19 extract_прил_ru_comparative_forms, 

20 parse_html_forms_table, 

21 parse_wikitext_forms_table, 

22) 

23from .linkage import ( 

24 extract_alt_form_section, 

25 extract_linkage_section, 

26 extract_phrase_section, 

27) 

28from .models import AltForm, Form, Hyphenation, Sense, Sound, WordEntry 

29from .pronunciation import ( 

30 extract_homophone_section, 

31 extract_pronunciation_section, 

32 extract_rhyme_section, 

33) 

34from .section_titles import ( 

35 ALT_FORM_SECTIONS, 

36 LINKAGE_TITLES, 

37 POS_TEMPLATE_NAMES, 

38 POS_TITLES, 

39) 

40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS 

41from .translation import extract_translations 

42 

43 

def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Add senses found in the lists of a semantic-properties section.

    Every template named "значение" found anywhere inside the list children
    of `level_node` is passed to `process_meaning_template`. The sense it
    returns is appended to the last entry of `page_data`, but only when it
    carries at least one gloss.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for t_node in list_node.find_child_recursively(NodeKind.TEMPLATE):
            if t_node.template_name != "значение":
                continue
            sense = process_meaning_template(wxr, None, page_data[-1], t_node)
            if sense.glosses:
                page_data[-1].senses.append(sense)

57 

58 

# Maps a value of the "тип" argument of the "morph" template to the POS
# string stored on the word entry. Each row lists every accepted spelling
# (short and long) of one morpheme kind; rows are flattened in order, so the
# resulting dict keeps the same key order as a hand-written literal would.
MORPH_TEMPLATE_ARGS = {
    alias: pos
    for aliases, pos in (
        (("p", "prefix"), "prefix"),
        (("i", "interfix"), "interfix"),
        (("in", "infix"), "infix"),
        (("s", "suffix"), "suffix"),
        (("t", "transfix"), "transfix"),
        (("po", "postfix"), "suffix"),
        (("c", "confix", "circumfix"), "circumfix"),
        (("r",), "root"),
        (("e", "ending"), "suffix"),
    )
    for alias in aliases
}

79 

80 

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Try to derive part-of-speech data from a single template.

    The lower-cased template name is checked in this order:

    1. "morph": the "тип" argument is looked up in `MORPH_TEMPLATE_ARGS`
       and, on a hit, returned with the "morpheme" tag.
    2. "заголовок" / "з" with a first positional argument: the first word
       of that argument (outer parentheses stripped) is looked up in
       `POS_TITLES`. An argument with no words at all yields `None`.
    3. Names starting with "прил ru": the "часть речи" argument is searched
       for any `POS_TITLES` key; without that argument the POS is "adj".
    4. Otherwise the name is split on whitespace and "-" and each piece is
       looked up in `POS_TEMPLATE_NAMES`.

    Returns `None` when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    params = template_node.template_parameters
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        # Go through clean_node like every other argument read in this file:
        # a raw parameter value is not guaranteed to be a plain string
        # (presumably it can be a node or a list of nodes), and an
        # unhashable value would make the dict lookup raise TypeError.
        pos_type = clean_node(wxr, None, params.get("тип", ""))
        if pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif template_name in {"заголовок", "з"} and 1 in params:
        words = clean_node(wxr, None, params[1]).strip("()").split()
        # Covers both an empty argument and one that only holds
        # parentheses/whitespace such as "( )"; indexing the first word
        # without this check raised IndexError for the latter.
        if not words:
            return None
        if words[0] in POS_TITLES:
            return POS_TITLES[words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(wxr, None, params.get("часть речи", "")).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for pos_string, pos_data in POS_TITLES.items():
            if pos_string in pos_arg:
                return pos_data

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None

122 

123 

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section node.

    Three sources are tried in order and the first hit wins:

    1. templates that are direct children of `level_node`,
    2. templates returned by `level_node.find_content`,
    3. the cleaned, lower-cased text of all children that are not level
       nodes, searched for any key of `POS_TITLES` as a substring.

    Returns `None` when none of them yields a POS.
    """
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        pos_data = get_pos_from_template(wxr, template_node)
        if pos_data is not None:
            return pos_data
    # The POS may also be given by a template in the level node's content
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        pos_data = get_pos_from_template(wxr, template_node)
        if pos_data is not None:
            return pos_data

    # Search for POS in section text; lower-case once, not once per title
    section_text = clean_node(
        wxr,
        None,
        list(
            level_node.invert_find_child(
                LEVEL_KIND_FLAGS, include_empty_str=True
            )
        ),
    ).lower()
    for pos_string, pos_data in POS_TITLES.items():
        if pos_string in section_text:
            return pos_data
    return None

150 

151 

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Extract POS, forms, hyphenation and tags from a morphology section.

    The POS found by `get_pos` (if any) is stored on the last entry of
    `page_data`. Each template that is a direct child of `level_node` is then
    expanded and processed:

    - the expansion is cleaned against the entry to collect category links;
    - for the inflection templates listed below, wikitext and HTML tables in
      the expansion are parsed for forms and the "слоги" argument is split
      on "-" into a hyphenation;
    - for "прил ru" templates the comparative forms are extracted;
    - the text of every top-level node of the expansion is split on commas
      and each piece found in `MORPHOLOGICAL_TEMPLATE_TAGS` adds its tag(s).
    """
    # Name prefixes of templates whose expansion is searched for form tables
    inflection_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )

    pos_info = get_pos(wxr, level_node)
    if pos_info is not None:
        page_data[-1].pos = pos_info["pos"]
        page_data[-1].tags.extend(pos_info.get("tags", []))

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        entry = page_data[-1]
        clean_node(wxr, entry, expanded)  # add category links

        if t_name.startswith(inflection_prefixes):
            for table in expanded.find_child_recursively(NodeKind.TABLE):
                parse_wikitext_forms_table(wxr, entry, table)
            for html_table in expanded.find_html("table"):
                parse_html_forms_table(wxr, entry, html_table)
            syllables = clean_node(
                wxr, None, t_node.template_parameters.get("слоги", "")
            )
            if syllables != "":
                entry.hyphenations.append(
                    Hyphenation(parts=syllables.split("-"))
                )

        if t_name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(wxr, entry, expanded)

        for child in expanded.children:
            for piece in clean_node(wxr, entry, child).split(","):
                # Pieces without a mapping give None and are skipped below
                tag_data = MORPHOLOGICAL_TEMPLATE_TAGS.get(piece.strip())
                if isinstance(tag_data, str):
                    entry.tags.append(tag_data)
                elif isinstance(tag_data, list):
                    entry.tags.extend(tag_data)

208 

209 

def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The lower-cased, cleaned section title selects the extractor. Several
    extractors only run when the matching `wxr.config.capture_*` flag is
    set; a title that ends up matching no branch is reported through
    `wxr.wtp.debug` unless it is one of the "see also" titles. Afterwards
    every child level node is parsed recursively and the section's trailing
    templates are processed for the last entry of `page_data`.
    """
    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    gloss_titles = (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    skipped_titles = ("библиография", "прочее")
    see_also_titles = ("см. также", "смотреть также", "смотрите также")

    title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(title)
    config = wxr.config

    if title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif title in POS_TITLES:
        pos_info = POS_TITLES[title]
        page_data[-1].pos = pos_info["pos"]
        page_data[-1].tags.extend(pos_info.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "произношение" and config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif title in gloss_titles:
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "этимология" and config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif title in phrase_titles and config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, title)
    elif title in translation_titles and config.capture_translations:
        extract_translations(wxr, page_data[-1], level_node)
    elif title in LINKAGE_TITLES and config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[title], level_node
        )
    elif title in skipped_titles:
        # Known sections that are deliberately not extracted
        pass
    elif title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif title == "омофоны" and config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    elif title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[title]
        )
    elif title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
    elif title not in see_also_titles:
        wxr.wtp.debug(
            f"Unprocessed section {title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for sub_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, sub_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

285 

286 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-1 node is treated as one language section. Its language code
    is the name of the first template in the heading (stripped of spaces and
    "-"), or "unknown"; sections whose code is not in
    `wxr.config.capture_language_codes` (when that is set) are skipped.
    A base entry is built per language, copied for every level-2 node and
    filled by `parse_section` from the level-3 nodes below it. Level nodes
    directly under the language node that are not level 2 are parsed into a
    shared entry. Entries left without senses get a "no-gloss" sense.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    def apply_pos(entry: WordEntry, pos_info: POSSubtitleData | None) -> None:
        # Store POS and its optional tags on the entry; no-op without data
        if pos_info is not None:
            entry.pos = pos_info["pos"]
            entry.tags.extend(pos_info.get("tags", []))

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # Only the first template of the heading names the language
        heading_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = ""
        if heading_template is not None:
            lang_code = heading_template.template_name.strip(" -")
        lang_code = lang_code or "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        apply_pos(base_data, get_pos(wxr, level1_node))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                apply_pos(base_data, get_pos(wxr, level2_node))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True
            # Drop the copy when nothing was added or there was no subsection
            if page_data[-1] == base_data or not has_level3:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        is_first = True
        for other_level_node in level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        ):
            # Before the first such section make sure the last entry belongs
            # to this language
            if is_first and (
                not page_data
                or page_data[-1].lang_code != base_data.lang_code
            ):
                page_data.append(base_data.model_copy(deep=True))
            is_first = False
            parse_section(wxr, page_data, other_level_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

362 

363 

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates that sit outside of any subsection.

    Looks at every child of `level_node` that is not a level node. A child
    that is itself a template whose name starts with "Форма-" is processed
    directly; any other wiki node is searched recursively for such
    templates. Plain strings are ignored.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if not isinstance(node, WikiNode):
            continue
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            form_templates = [node]
        else:
            # Lazy on purpose: templates are processed while walking the tree
            form_templates = (
                t_node
                for t_node in node.find_child_recursively(NodeKind.TEMPLATE)
                if t_node.template_name.startswith(form_prefix)
            )
        for t_node in form_templates:
            process_form_template(wxr, page_data, base_data, t_node)

381 

382 

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-" template.

    The POS derived from the template is written to `base_data` itself, so
    it also applies to copies made later. A deep copy of `base_data` then
    receives one sense per non-empty list item of the expanded template
    (marked "form-of" when a base word is given via "база" or the first
    positional argument) and a sound for the "МФА" argument. The copy is
    appended to `page_data` only if it has at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_info = get_pos_from_template(wxr, template_node)
    if pos_info is not None:
        base_data.pos = pos_info["pos"]
        base_data.tags.extend(pos_info.get("tags", []))

    params = template_node.template_parameters
    base_word = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )

    entry = base_data.model_copy(deep=True)
    for list_item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        gloss = clean_node(wxr, None, list_item.children)
        if not gloss:
            continue
        sense = Sense(glosses=[gloss])
        if base_word:
            sense.form_of.append(AltForm(word=base_word))
            sense.tags.append("form-of")
        entry.senses.append(sense)

    if ipa:
        entry.sounds.append(Sound(ipa=ipa))
    if entry.senses or entry.sounds:
        # Cleaning against the entry presumably collects category links
        clean_node(wxr, entry, template_node)
        page_data.append(entry)

424 

425 

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add each non-empty link text of the section as a romanization form.

    Only links that are direct children of `level_node` are considered.
    """
    for link_node in level_node.find_child(NodeKind.LINK):
        romanized = clean_node(wxr, None, link_node)
        if romanized == "":
            continue
        word_entry.forms.append(Form(form=romanized, tags=["romanization"]))

434 

435 

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle the known trailing templates that are direct children of a section.

    Templates from the fixed list below are cleaned against `word_entry`
    (see the category-template link); "zh-forms" is passed to
    `extract_zh_forms_template`. All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_templates = frozenset(
        (
            "-ание",
            "-атель",
            "-ация",
            "-ение",
            "-ка",
            "длина слова",
            "Категория",
            "Омонимы",
            "forms",
            "multilang",
        )
    )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, t_node)
        elif name in category_templates:
            clean_node(wxr, word_entry, t_node)

457 

458 

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read Chinese script variants and literal meaning from "zh-forms".

    Named arguments "s", "s2", ... become forms tagged "Simplified-Chinese"
    and "t", "t2", ... forms tagged "Traditional-Chinese"; values that are
    empty or equal to the page title are skipped. The "lit" argument is
    stored as the literal meaning. Positional arguments are ignored.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = (
        (r"s\d*", "Simplified-Chinese"),
        (r"t\d*", "Traditional-Chinese"),
    )
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        script_tag = next(
            (
                tag
                for pattern, tag in script_tags
                if re.fullmatch(pattern, p_name)
            ),
            None,
        )
        if script_tag is not None:
            variant = Form(
                form=clean_node(wxr, None, p_value), tags=[script_tag]
            )
            if variant.form not in ["", wxr.wtp.title]:
                base_data.forms.append(variant)
        elif p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)

485 

486 

def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling given by a heading template as a form.

    For each "заголовок" / "з" template in the heading content, the
    "ударение" argument is cleaned and cut at the first "(" if there is one.
    The result is added with the "stressed" tag unless it is empty or equal
    to the page title.
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ["заголовок", "з"]:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        before_paren, paren, _ = stressed.partition("(")
        if paren:
            # Keep only the text in front of the parenthesised part
            stressed = before_paren.strip()
        if stressed in ["", wxr.wtp.title]:
            continue
        word_entry.forms.append(Form(form=stressed, tags=["stressed"]))