Coverage for src/wiktextract/extractor/ru/page.py: 76%

244 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1import re 

2from typing import Any 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...config import POSSubtitleData 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .etymology import extract_etymology 

17from .gloss import extract_gloss, process_meaning_template 

18from .inflection import ( 

19 extract_прил_ru_comparative_forms, 

20 parse_html_forms_table, 

21 parse_wikitext_forms_table, 

22) 

23from .linkage import ( 

24 extract_alt_form_section, 

25 extract_linkage_section, 

26 extract_phrase_section, 

27) 

28from .models import AltForm, Form, Sense, Sound, WordEntry 

29from .pronunciation import ( 

30 extract_homophone_section, 

31 extract_pronunciation_section, 

32 extract_rhyme_section, 

33) 

34from .section_titles import ( 

35 ALT_FORM_SECTIONS, 

36 LINKAGE_TITLES, 

37 POS_TEMPLATE_NAMES, 

38 POS_TITLES, 

39) 

40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS 

41from .translation import extract_translations 

42 

43 

def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
):
    """Add senses found in `значение` templates to the latest word entry.

    Every template nested anywhere inside the list nodes directly under
    `level_node` is inspected; only templates named `значение` are handed to
    `process_meaning_template`, and the resulting sense is kept only when it
    has at least one gloss.
    """
    # Lazily walk list -> templates so the traversal order matches two
    # nested loops.
    nested_templates = (
        t_node
        for list_node in level_node.find_child(NodeKind.LIST)
        for t_node in list_node.find_child_recursively(NodeKind.TEMPLATE)
    )
    for t_node in nested_templates:
        if t_node.template_name != "значение":
            continue
        sense = process_meaning_template(wxr, None, page_data[-1], t_node)
        if sense.glosses:
            page_data[-1].senses.append(sense)

57 

58 

# Accepted values of the `тип` argument of the `morph` template, grouped by
# the POS value each alias maps to.  Groups are listed in the order the
# aliases should appear in the resulting dict.
_MORPH_TYPE_GROUPS = (
    ("prefix", ("p", "prefix")),
    ("interfix", ("i", "interfix")),
    ("infix", ("in", "infix")),
    ("suffix", ("s", "suffix")),
    ("transfix", ("t", "transfix")),
    ("suffix", ("po", "postfix")),
    ("circumfix", ("c", "confix", "circumfix")),
    ("root", ("r",)),
    ("suffix", ("e", "ending")),
)

# Flat alias -> POS lookup table used when reading the `morph` template.
MORPH_TEMPLATE_ARGS = {
    alias: pos for pos, aliases in _MORPH_TYPE_GROUPS for alias in aliases
}

79 

80 

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Try to derive part-of-speech data from a single template.

    The checks run in this order:

    1. `morph` template: the `тип` argument is looked up in
       `MORPH_TEMPLATE_ARGS` and tagged as a morpheme.
    2. `заголовок` / `з` template with a first positional argument: the
       first word of that argument (parentheses stripped) is looked up in
       `POS_TITLES`.
    3. Templates whose name starts with `прил ru`: the `часть речи`
       argument is searched for any `POS_TITLES` key; without that
       argument the POS is `adj`.
    4. Any template: the space- and hyphen-separated pieces of the
       lower-cased template name are looked up in `POS_TEMPLATE_NAMES`.

    Returns the matching POS data, or None when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    params = template_node.template_parameters
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = params.get("тип", "")
        # Only plain strings can be dict keys here; a non-str value
        # (presumably parsed nodes - TODO confirm) can never match and an
        # unhashable one would raise TypeError on the lookup.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif template_name in {"заголовок", "з"} and 1 in params:
        pos_words = clean_node(wxr, None, params[1]).strip("()").split()
        # Test the split result rather than the raw text: an argument that
        # holds only parentheses and whitespace is non-empty as a string
        # but has no first word, which used to raise IndexError.
        if not pos_words:
            return None
        if pos_words[0] in POS_TITLES:
            return POS_TITLES[pos_words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(wxr, None, params.get("часть речи", "")).lower()
        if pos_arg == "":
            return {"pos": "adj"}
        for pos_string, pos_data in POS_TITLES.items():
            if pos_string in pos_arg:
                return pos_data
        # An unrecognised `часть речи` value falls through to the
        # name-based lookup below.

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None

122 

123 

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a level node.

    Templates returned by `find_child` are tried first, then templates
    returned by `find_content`; the first one that yields POS data wins.
    If no template matches, the cleaned text of all non-level children is
    lower-cased and searched for any `POS_TITLES` key.  Returns None when
    nothing is found.
    """
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        if (found := get_pos_from_template(wxr, t_node)) is not None:
            return found
    # POS text could also be in the level node content
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if (found := get_pos_from_template(wxr, t_node)) is not None:
            return found

    # Search for POS in section text
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    return next(
        (
            pos_data
            for pos_string, pos_data in POS_TITLES.items()
            if pos_string in section_text
        ),
        None,
    )

144 

145 

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, tags, forms and hyphenation from a morphology section.

    The latest entry in `page_data` is updated in place:

    - POS and tags come from `get_pos` when it finds something.
    - Every template directly under `level_node` is expanded; cleaning the
      expansion records its category links on the entry.
    - For the known inflection templates, wikitext and HTML tables in the
      expansion are parsed into forms and the `слоги` argument is stored
      as hyphenation.
    - `прил ru*` templates additionally go through
      `extract_прил_ru_comparative_forms`.
    - Comma-separated pieces of the expanded text that are keys of
      `MORPHOLOGICAL_TEMPLATE_TAGS` are translated into tags.
    """
    # Template name prefixes whose expansion contains an inflection table.
    inflection_template_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )
    word_entry = page_data[-1]

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        word_entry.pos = pos_data["pos"]
        word_entry.tags.extend(pos_data.get("tags", []))

    for child_node in level_node.find_child(NodeKind.TEMPLATE):
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(child_node), expand_all=True
        )
        clean_node(wxr, word_entry, expanded_template)  # add category links
        if child_node.template_name.startswith(inflection_template_prefixes):
            for table_node in expanded_template.find_child_recursively(
                NodeKind.TABLE
            ):
                parse_wikitext_forms_table(wxr, word_entry, table_node)
            for table_tag in expanded_template.find_html("table"):
                parse_html_forms_table(wxr, word_entry, table_tag)
            hyphenation = clean_node(
                wxr, None, child_node.template_parameters.get("слоги", "")
            )
            # Only store a real value: a later inflection template without
            # `слоги` must not erase the hyphenation found by an earlier
            # template in the same section.
            if hyphenation != "":
                word_entry.hyphenation = hyphenation

        if child_node.template_name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(
                wxr, word_entry, expanded_template
            )

        for node in expanded_template.children:
            node_text = clean_node(wxr, word_entry, node)
            for text in node_text.split(","):
                tr_tag = MORPHOLOGICAL_TEMPLATE_TAGS.get(text.strip())
                # The table maps a label to either one tag or several.
                if isinstance(tr_tag, str):
                    word_entry.tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    word_entry.tags.extend(tr_tag)

198 

199 

def _dispatch_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_title: str,
) -> None:
    """Run the extractor that matches the lower-cased `section_title`.

    The checks are ordered; the first one that matches handles the section.
    Titles that match nothing and are not a "see also" heading are reported
    through `wxr.wtp.debug`.
    """
    if section_title in (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    ):
        extract_morphological_section(wxr, page_data, level_node)
        return
    if section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
        return
    if section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
        return
    if section_title in (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    ):
        extract_gloss(wxr, page_data[-1], level_node)
        return
    if section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
        return
    if (
        section_title
        in (
            "фразеологизмы и устойчивые сочетания",
            "типичные сочетания",
            "фразеологизмы",
            "пословицы и поговорки",
        )
        and wxr.config.capture_linkages
    ):
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
        return
    if (
        section_title in ("перевод", "иноязычные аналоги")
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
        return
    if section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
        return
    if section_title == "библиография":
        # Recognised but intentionally not extracted.
        return
    if section_title in ("латиница (latinça)", "латиница (latinca)"):
        parse_roman_section(wxr, page_data[-1], level_node)
        return
    if section_title == "прочее":
        # Recognised but intentionally not extracted.
        return
    if section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
        return
    if section_title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[section_title]
        )
        return
    if section_title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
        return
    if section_title not in ("см. также", "смотреть также", "смотрите также"):
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )


def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Process one section and, recursively, all of its sub-sections.

    The section title is cleaned and lower-cased, registered with
    `wxr.wtp.start_subsection`, and dispatched to the matching extractor.
    Nested level nodes are then parsed the same way, and finally the
    section's own templates are passed to `extract_section_end_templates`
    for the latest entry in `page_data`.
    """
    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    _dispatch_section(wxr, page_data, level_node, section_title)

    for sub_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, sub_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

275 

276 

def _apply_pos_data(
    word_entry: WordEntry, pos_data: POSSubtitleData | None
) -> None:
    """Copy POS and tags from `pos_data` onto `word_entry`; no-op for None."""
    if pos_data is None:
        return
    word_entry.pos = pos_data["pos"]
    word_entry.tags.extend(pos_data.get("tags", []))


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dumped dicts.

    Each level-1 node is treated as one language section.  Its language
    code is the name of the first template found by `find_content` (with
    spaces and hyphens stripped from both ends), or `unknown`.  Level-2
    nodes each start a new entry copied from the section's base data;
    their level-3 children are handled by `parse_section`.  Entries that
    gained nothing are dropped and the node is re-read with
    `extract_low_quality_page`.  Entries left without senses receive a
    single `no-gloss` sense before dumping.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in root.find_child(NodeKind.LEVEL1):
        # Only the first template of the node content is consulted.
        first_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        lang_code = (
            "" if first_template is None
            else first_template.template_name.strip(" -")
        )
        if lang_code == "":
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name, lang_code=lang_code, word=page_title, pos="unknown"
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        _apply_pos_data(base_data, get_pos(wxr, level1_node))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                _apply_pos_data(base_data, get_pos(wxr, level2_node))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True
            if page_data[-1] == base_data or not has_level3:
                # Nothing was extracted for this level-2 node: discard the
                # copy and try the form-template fallback instead.
                page_data.pop()
                extract_low_quality_page(wxr, page_data, base_data, level2_node)

        other_level_nodes = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for node_index, other_level_node in enumerate(other_level_nodes):
            # The first such section needs an entry of this language to
            # write into when none exists yet.
            if node_index == 0 and (
                not page_data
                or page_data[-1].lang_code != base_data.lang_code
            ):
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_level_node)

        if page_data and page_data[-1] == base_data:
            page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for word_entry in page_data:
        if not word_entry.senses:
            word_entry.senses.append(Sense(tags=["no-gloss"]))
    return [
        word_entry.model_dump(exclude_defaults=True) for word_entry in page_data
    ]

352 

353 

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process `Форма-*` templates found among the non-level children.

    A child that is itself such a template is processed directly; any other
    `WikiNode` child is searched recursively for such templates.  Children
    that are not nodes are ignored.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            candidates = (node,)
        elif isinstance(node, WikiNode):
            candidates = node.find_child_recursively(NodeKind.TEMPLATE)
        else:
            continue
        for t_node in candidates:
            if t_node.template_name.startswith(form_prefix):
                process_form_template(wxr, page_data, base_data, t_node)

371 

372 

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a `Форма-*` template.

    POS data derived from the template is written to `base_data`.  A deep
    copy of `base_data` then receives one sense per non-empty list item of
    the expanded template (marked `form-of` when the `база` or first
    positional argument is non-empty) and an IPA sound from `МФА`.  The
    copy is appended to `page_data` only if it has a sense or a sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    base_word = clean_node(wxr, None, params.get("база", params.get(1, "")))
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    new_entry = base_data.model_copy(deep=True)
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if base_word:
            sense.form_of.append(AltForm(word=base_word))
            sense.tags.append("form-of")
        new_entry.senses.append(sense)

    if ipa:
        new_entry.sounds.append(Sound(ipa=ipa))
    if new_entry.senses or new_entry.sounds:
        # Cleaning the template with the entry as target records its
        # category links on the entry.
        clean_node(wxr, new_entry, template_node)
        page_data.append(new_entry)

414 

415 

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add every non-empty link directly under the section as a form.

    Each form is tagged `romanization`.
    """
    link_texts = (
        clean_node(wxr, None, link_node)
        for link_node in level_node.find_child(NodeKind.LINK)
    )
    word_entry.forms.extend(
        Form(form=link_text, tags=["romanization"])
        for link_text in link_texts
        if link_text != ""
    )

424 

425 

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle categorising and `zh-forms` templates directly under a node.

    Templates from the fixed name list below are cleaned with `word_entry`
    as the target; `zh-forms` templates are passed to
    `extract_zh_forms_template`.  All other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = frozenset(
        (
            "-ание",
            "-атель",
            "-ация",
            "-ение",
            "-ка",
            "длина слова",
            "Категория",
            "Омонимы",
            "forms",
            "multilang",
        )
    )
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        t_name = t_node.template_name
        if t_name in category_template_names:
            clean_node(wxr, word_entry, t_node)
        elif t_name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, t_node)

447 

448 

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a `zh-forms` template.

    Named arguments `s`, `s1`, `s2`, ... become "Simplified Chinese" forms
    and `t`, `t1`, `t2`, ... become "Traditional Chinese" forms, unless the
    cleaned value is empty or equals the page title.  The `lit` argument is
    stored as the literal meaning.  Positional arguments are skipped.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = {"s": "Simplified Chinese", "t": "Traditional Chinese"}
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        script_match = re.fullmatch(r"([st])\d*", p_name)
        if script_match is not None:
            form_data = Form(
                form=clean_node(wxr, None, p_value),
                tags=[script_tags[script_match.group(1)]],
            )
            if form_data.form not in ["", wxr.wtp.title]:
                base_data.forms.append(form_data)
        elif p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)

475 

476 

def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling given by a `заголовок` / `з` template.

    The `ударение` argument of each such template in the node content is
    cleaned and cut at the first opening parenthesis.  The result is added
    as a form tagged `stressed` unless it is empty or equals the page
    title.
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ("заголовок", "з"):
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed_form = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        if "(" in stressed_form:
            # Keep only the text before the parenthesised part.
            stressed_form = stressed_form.split("(", 1)[0].strip()
        if stressed_form in ("", wxr.wtp.title):
            continue
        word_entry.forms.append(Form(form=stressed_form, tags=["stressed"]))

493 )