Coverage for src/wiktextract/extractor/ru/page.py: 76%

246 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2from typing import Any 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...config import POSSubtitleData 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .etymology import extract_etymology 

17from .gloss import extract_gloss, process_meaning_template 

18from .inflection import ( 

19 extract_прил_ru_comparative_forms, 

20 parse_html_forms_table, 

21 parse_wikitext_forms_table, 

22) 

23from .linkage import ( 

24 extract_alt_form_section, 

25 extract_linkage_section, 

26 extract_phrase_section, 

27) 

28from .models import AltForm, Form, Hyphenation, Sense, Sound, WordEntry 

29from .pronunciation import ( 

30 extract_homophone_section, 

31 extract_pronunciation_section, 

32 extract_rhyme_section, 

33) 

34from .section_titles import ( 

35 ALT_FORM_SECTIONS, 

36 LINKAGE_TITLES, 

37 POS_TEMPLATE_NAMES, 

38 POS_TITLES, 

39) 

40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS 

41from .translation import extract_translations 

42 

43 

def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Collect senses from "значение" templates inside the section's lists.

    Every template named "значение" found anywhere below a list child of
    ``level_node`` is handed to ``process_meaning_template``.  The sense it
    returns is appended to the last entry of ``page_data`` when it carries
    at least one gloss.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for t_node in list_node.find_child_recursively(NodeKind.TEMPLATE):
            if t_node.template_name != "значение":
                continue
            entry = page_data[-1]
            sense = process_meaning_template(wxr, None, entry, t_node)
            if sense.glosses:
                entry.senses.append(sense)

57 

58 

# Maps a value of the "тип" parameter of the "morph" template to the POS
# value stored on the entry.  Each row lists one POS value followed by the
# parameter spellings that select it; "postfix" and "ending" spellings are
# folded into "suffix".
MORPH_TEMPLATE_ARGS = {
    alias: pos
    for pos, aliases in (
        ("prefix", ("p", "prefix")),
        ("interfix", ("i", "interfix")),
        ("infix", ("in", "infix")),
        ("suffix", ("s", "suffix")),
        ("transfix", ("t", "transfix")),
        ("suffix", ("po", "postfix")),
        ("circumfix", ("c", "confix", "circumfix")),
        ("root", ("r",)),
        ("suffix", ("e", "ending")),
    )
    for alias in aliases
}

79 

80 

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a single template node.

    The lookups are tried in this order:

    1. a "morph" template whose "тип" parameter is a key of
       ``MORPH_TEMPLATE_ARGS`` (the result is tagged "morpheme");
    2. a "заголовок"/"з" template whose first positional argument, after
       removing surrounding parentheses, starts with a word found in
       ``POS_TITLES``;
    3. a template whose name starts with "прил ru": the "часть речи"
       parameter is searched for any ``POS_TITLES`` key, and when that
       parameter is empty the result is ``{"pos": "adj"}``;
    4. any space- or hyphen-separated piece of the lower-cased template
       name that is a key of ``POS_TEMPLATE_NAMES``.

    Returns ``None`` when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        # Only a string can be a key of MORPH_TEMPLATE_ARGS.  The isinstance
        # check keeps a non-string parameter value (presumably possible when
        # the argument contains wiki markup - TODO confirm) from raising
        # TypeError in the dict lookup.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_text = clean_node(
            wxr, None, template_node.template_parameters[1]
        ).strip("()")
        # split() yields an empty list for both "" and whitespace-only text
        # (e.g. an argument of "( )"), so test the list instead of indexing
        # it blindly.
        words = pos_text.split()
        if not words:
            return None
        if words[0] in POS_TITLES:
            return POS_TITLES[words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(
            wxr, None, template_node.template_parameters.get("часть речи", "")
        ).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for pos_string, pos_data in POS_TITLES.items():
            if pos_string in pos_arg:
                return pos_data
        # No title matched the explicit argument: fall through to the
        # generic template-name lookup below.

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None

122 

123 

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section node.

    Templates among the node's children are tried first, then templates in
    the node's own content, each through ``get_pos_from_template``.  As a
    last resort the cleaned, lower-cased text of the non-level children is
    searched for any key of ``POS_TITLES``.  Returns ``None`` when nothing
    matches.
    """
    for child_template in level_node.find_child(NodeKind.TEMPLATE):
        if (found := get_pos_from_template(wxr, child_template)) is not None:
            return found
    # POS text could also be in the level node content
    for content_template in level_node.find_content(NodeKind.TEMPLATE):
        if (found := get_pos_from_template(wxr, content_template)) is not None:
            return found

    # Search for POS in section text
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for title, title_data in POS_TITLES.items():
        if title in section_text:
            return title_data
    return None

144 

145 

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill the last entry of ``page_data`` from a morphology section.

    Sets POS and tags from ``get_pos``.  Then, for every template child of
    ``level_node``: expands it, collects category links, parses form tables
    and the "слоги" hyphenation parameter for the known inflection
    templates, extracts comparative forms for "прил ru" templates, and adds
    tags for comma-separated text pieces found in
    ``MORPHOLOGICAL_TEMPLATE_TAGS``.
    """
    # Template name prefixes whose expansion is searched for form tables.
    inflection_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )
    entry = page_data[-1]
    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        entry.pos = pos_data["pos"]
        entry.tags.extend(pos_data.get("tags", []))

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        clean_node(wxr, entry, expanded)  # add category links
        name = t_node.template_name

        if name.startswith(inflection_prefixes):
            for wiki_table in expanded.find_child_recursively(NodeKind.TABLE):
                parse_wikitext_forms_table(wxr, entry, wiki_table)
            for html_table in expanded.find_html("table"):
                parse_html_forms_table(wxr, entry, html_table)
            syllables = clean_node(
                wxr, None, t_node.template_parameters.get("слоги", "")
            )
            if syllables != "":
                entry.hyphenations.append(
                    Hyphenation(parts=syllables.split("-"))
                )

        if name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(wxr, entry, expanded)

        for expanded_child in expanded.children:
            child_text = clean_node(wxr, entry, expanded_child)
            for piece in map(str.strip, child_text.split(",")):
                mapped = MORPHOLOGICAL_TEMPLATE_TAGS.get(piece)
                if isinstance(mapped, str):
                    entry.tags.append(mapped)
                elif isinstance(mapped, list):
                    entry.tags.extend(mapped)

202 

203 

_MORPHOLOGY_SECTION_TITLES = (
    # Morphological and syntactic properties
    "морфологические и синтаксические свойства",
    # Type and syntactic properties of the word combination
    "тип и синтаксические свойства сочетания",
    "тип и свойства сочетания",
)
_GLOSS_SECTION_TITLES = (
    "значение",
    "значения",
    "как самостоятельный глагол",
    "в значении вспомогательного глагола или связки",
)
_PHRASE_SECTION_TITLES = (
    "фразеологизмы и устойчивые сочетания",
    "типичные сочетания",
    "фразеологизмы",
    "пословицы и поговорки",
)
_TRANSLATION_SECTION_TITLES = ("перевод", "иноязычные аналоги")
_ROMAN_SECTION_TITLES = ("латиница (latinça)", "латиница (latinca)")
_SEE_ALSO_SECTION_TITLES = ("см. также", "смотреть также", "смотрите также")


def _dispatch_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_title: str,
) -> None:
    """Run the extractor that matches ``section_title``, if there is one.

    The checks are evaluated top to bottom and the first match wins.  A
    title whose extractor is disabled by a ``wxr.config.capture_*`` flag
    keeps falling through the remaining checks.
    """
    if section_title in _MORPHOLOGY_SECTION_TITLES:
        extract_morphological_section(wxr, page_data, level_node)
        return
    if section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
        return
    if section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
        return
    if section_title in _GLOSS_SECTION_TITLES:
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
        return
    if (
        section_title in _PHRASE_SECTION_TITLES
        and wxr.config.capture_linkages
    ):
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
        return
    if (
        section_title in _TRANSLATION_SECTION_TITLES
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
        return
    if section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
        return
    if section_title == "библиография":
        return  # recognised, nothing is extracted
    if section_title in _ROMAN_SECTION_TITLES:
        parse_roman_section(wxr, page_data[-1], level_node)
        return
    if section_title == "прочее":
        return  # recognised, nothing is extracted
    if section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
        return
    if section_title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[section_title]
        )
        return
    if section_title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
        return
    if section_title not in _SEE_ALSO_SECTION_TITLES:
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )


def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Process one section node and, recursively, its sub-sections.

    The lower-cased, cleaned heading selects the extractor to run on the
    last entry of ``page_data``.  Afterwards every nested level node is
    parsed the same way, and finally the section's category-style templates
    are handled by ``extract_section_end_templates``.
    """
    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    _dispatch_section(wxr, page_data, level_node, section_title)

    for sub_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, sub_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

279 

280 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-1 node is one language section; its language code is the
    name of the first template in the heading, stripped of spaces and
    hyphens, or "unknown" when there is none.  Sections whose code is not
    in ``wxr.config.capture_language_codes`` (when that is set) are skipped.
    A new entry is started for every level-2 node; other level nodes
    directly under the language node are parsed into the current entry.
    Entries left without senses receive a single "no-gloss" sense.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for lang_node in tree.find_child(NodeKind.LEVEL1):
        heading_template = next(
            iter(lang_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = ""
        if heading_template is not None:
            lang_code = heading_template.template_name.strip(" -")
        if lang_code == "":
            lang_code = "unknown"
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        heading_categories = {"categories": []}
        lang_name = clean_node(wxr, heading_categories, lang_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(heading_categories["categories"])
        extract_section_end_templates(wxr, base_data, lang_node)
        pos_data = get_pos(wxr, lang_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in lang_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            level3_count = 0
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                level3_count += 1
            # Drop the entry again when nothing was added to it or there
            # was no level-3 section, then try the form templates instead.
            if page_data[-1] == base_data or level3_count == 0:
                page_data.pop()
                extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_levels = lang_node.find_child(LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2)
        for position, other_node in enumerate(other_levels):
            if position == 0 and (
                not page_data
                or page_data[-1].lang_code != base_data.lang_code
            ):
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, lang_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

356 

357 

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates among the non-level children of a node.

    A child that is itself a template whose name starts with "Форма-" is
    processed directly.  Any other wiki node child is searched recursively
    for such templates.  Children that are not wiki nodes are ignored.
    """
    form_prefix = "Форма-"
    for child in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(child, TemplateNode) and child.template_name.startswith(
            form_prefix
        ):
            form_templates = [child]
        elif isinstance(child, WikiNode):
            form_templates = (
                nested
                for nested in child.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)

375 

376 

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-" template.

    POS data found by ``get_pos_from_template`` is written to ``base_data``
    itself.  A deep copy of ``base_data`` then receives one sense per
    non-empty list item of the expanded template, marked as form-of the
    word in the "база" (or first positional) parameter when that is
    non-empty, and a sound for the "МФА" parameter.  The copy is appended
    to ``page_data`` only when it has at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    base_word = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa_text = clean_node(wxr, None, params.get("МФА", ""))
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    entry = base_data.model_copy(deep=True)
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        gloss = clean_node(wxr, None, item.children)
        if gloss == "":
            continue
        sense = Sense(glosses=[gloss])
        if base_word != "":
            sense.form_of.append(AltForm(word=base_word))
            sense.tags.append("form-of")
        entry.senses.append(sense)

    if ipa_text != "":
        entry.sounds.append(Sound(ipa=ipa_text))
    if not entry.senses and not entry.sounds:
        return
    clean_node(wxr, entry, template_node)
    page_data.append(entry)

418 

419 

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add a "romanization" form for each non-empty link child of the section."""
    link_texts = (
        clean_node(wxr, None, link)
        for link in level_node.find_child(NodeKind.LINK)
    )
    word_entry.forms.extend(
        Form(form=text, tags=["romanization"])
        for text in link_texts
        if text != ""
    )

428 

429 

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle category-link and "zh-forms" templates directly under a node.

    Templates with one of the listed names are passed to ``clean_node``
    together with ``word_entry``; a "zh-forms" template is passed to
    ``extract_zh_forms_template``.  Other templates are left alone.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = (
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name in category_template_names:
            clean_node(wxr, word_entry, t_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, t_node)

451 

452 

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    Named parameters "s"/"t" optionally followed by digits become forms
    tagged "Simplified-Chinese"/"Traditional-Chinese", unless the cleaned
    value is empty or equals the page title.  The "lit" parameter sets
    ``base_data.literal_meaning``.  Positional parameters are ignored.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = {"s": "Simplified-Chinese", "t": "Traditional-Chinese"}
    for arg_name, arg_value in template_node.template_parameters.items():
        if not isinstance(arg_name, str):
            continue
        if arg_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, arg_value)
            continue
        matched = re.fullmatch(r"([st])\d*", arg_name)
        if matched is None:
            continue
        form_data = Form(
            form=clean_node(wxr, None, arg_value),
            tags=[script_tags[matched.group(1)]],
        )
        if form_data.form not in ["", wxr.wtp.title]:
            base_data.forms.append(form_data)

479 

480 

def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling given in a level-2 heading template.

    For each "заголовок"/"з" template in the heading, the "ударение"
    parameter is cleaned and cut at the first "(".  The result is added as
    a form tagged "stressed" unless it is empty or equals the page title.
    """
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name not in ("заголовок", "з"):
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed = clean_node(
            wxr, None, heading_template.template_parameters.get("ударение", "")
        )
        paren_at = stressed.find("(")
        if paren_at != -1:
            stressed = stressed[:paren_at].strip()
        if stressed in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=stressed, tags=["stressed"]))

497 )