Coverage for src/wiktextract/extractor/ru/page.py: 74%

239 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from typing import Any 

3 

4from wikitextprocessor.parser import ( 

5 LEVEL_KIND_FLAGS, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...config import POSSubtitleData 

13from ...page import clean_node 

14from ...wxr_context import WiktextractContext 

15from ...wxr_logging import logger 

16from .etymology import extract_etymology 

17from .gloss import extract_gloss, process_meaning_template 

18from .inflection import parse_adj_forms_table, parse_wikitext_forms_table 

19from .linkage import ( 

20 extract_linkages, 

21 extract_phrase_section, 

22 process_related_block_template, 

23) 

24from .models import AltForm, Form, Sense, Sound, WordEntry 

25from .pronunciation import ( 

26 extract_homophone_section, 

27 extract_pronunciation_section, 

28) 

29from .section_titles import LINKAGE_TITLES, POS_TEMPLATE_NAMES, POS_TITLES 

30from .tags import MORPHOLOGICAL_TEMPLATE_TAGS 

31from .translation import extract_translations 

32 

33 

def process_semantic_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    semantic_level_node: WikiNode,
):
    """Add senses from "значение" templates to the latest word entry.

    Every list directly under ``semantic_level_node`` is searched recursively
    for templates named "значение".  Each one is handed to
    ``process_meaning_template`` and the resulting sense is appended to
    ``page_data[-1].senses`` unless it has no glosses.
    """
    for list_node in semantic_level_node.find_child(NodeKind.LIST):
        for meaning_template in list_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if meaning_template.template_name != "значение":
                continue
            sense = process_meaning_template(
                wxr, None, page_data[-1], meaning_template
            )
            # Senses without any gloss text are dropped.
            if sense.glosses:
                page_data[-1].senses.append(sense)

49 

50 

# Maps values of the "тип" parameter of the "morph" template to the POS value
# used for morpheme entries (see get_pos_from_template).
# Each row is (POS value, accepted parameter spellings); rows are kept in the
# original key order, which is why "suffix" appears more than once.
MORPH_TEMPLATE_ARGS = {
    alias: morpheme_pos
    for morpheme_pos, aliases in (
        ("prefix", ("p", "prefix")),
        ("interfix", ("i", "interfix")),
        ("infix", ("in", "infix")),
        ("suffix", ("s", "suffix")),
        ("transfix", ("t", "transfix")),
        ("suffix", ("po", "postfix")),
        ("circumfix", ("c", "confix", "circumfix")),
        ("root", ("r",)),
        ("suffix", ("e", "ending")),
    )
    for alias in aliases
}

71 

72 

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a single template node.

    Lookup order:

    1. "morph" template: the "тип" parameter is looked up in
       ``MORPH_TEMPLATE_ARGS`` and the result is tagged "morpheme".
    2. "заголовок"/"з" template with a first positional argument: the first
       word of that argument (outer parentheses stripped) is looked up in
       ``POS_TITLES``.
    3. Otherwise, or if the lookup above found nothing, the lower-cased
       template name is split on whitespace and "-" and each piece is looked
       up in ``POS_TEMPLATE_NAMES``.

    Returns ``None`` when nothing matches, or when the heading template's
    first argument contains no word at all.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        # Only strings can be keys of MORPH_TEMPLATE_ARGS.  The isinstance
        # check keeps an unhashable parameter value (e.g. a list of nodes)
        # from raising TypeError in the membership test.
        # NOTE(review): assumes parameter values may be non-str - confirm.
        if isinstance(pos_type, str) and pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_text = clean_node(
            wxr, None, template_node.template_parameters[1]
        ).strip("()")
        # Split before indexing: text such as "( )" is non-empty after
        # stripping the parentheses but holds no word, so a bare
        # ``split()[0]`` would raise IndexError.
        words = pos_text.split()
        if not words:
            return None
        if words[0] in POS_TITLES:
            return POS_TITLES[words[0]]

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]
    return None

103 

104 

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section node.

    Templates that are children of the node are tried first, then templates
    returned by ``level_node.find_content``.  If no template yields POS data,
    the cleaned text of the children returned by
    ``invert_find_child(LEVEL_KIND_FLAGS)`` is lower-cased and searched for
    any key of ``POS_TITLES``; the first key found wins.

    Returns ``None`` when nothing matches.
    """

    def candidate_templates():
        # Lazy on purpose: the content templates are only requested once
        # the child templates have been exhausted without a match.
        yield from level_node.find_child(NodeKind.TEMPLATE)
        # POS text could also be in level node content
        yield from level_node.find_content(NodeKind.TEMPLATE)

    for template_node in candidate_templates():
        pos_data = get_pos_from_template(wxr, template_node)
        if pos_data is not None:
            return pos_data

    # Search for POS in section text
    section_text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    return next(
        (
            pos_data
            for pos_title, pos_data in POS_TITLES.items()
            if pos_title in section_text
        ),
        None,
    )

125 

126 

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Fill POS, tags and forms of the latest entry from a morphology section.

    For each template that is a child of ``level_node``:

    * the template is expanded and, depending on its name prefix, passed to
      the adjective ("прил") or noun/verb ("сущ", "гл") forms-table parser;
    * the cleaned text of each expanded child is split on commas and every
      piece found in ``MORPHOLOGICAL_TEMPLATE_TAGS`` adds its tag(s);
    * the "степень" and "соотв" parameters are split on commas and each
      non-empty value becomes a ``Form`` carrying the mapped tag.
    """
    # Template parameter name -> tag given to the forms listed in it.
    form_param_tags = {
        "степень": "comparative",  # Шаблон:inflection/ru/adj
        "соотв": "perfective",  # Шаблон:Гл-блок
    }

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))

    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(t_node), expand_all=True
        )
        t_name = t_node.template_name
        if t_name.startswith("прил"):
            parse_adj_forms_table(wxr, page_data[-1], expanded)
        elif t_name.startswith(("сущ", "гл")):
            parse_wikitext_forms_table(wxr, page_data[-1], expanded)

        # Tags spelled out in the expanded template text.
        for expanded_child in expanded.children:
            child_text = clean_node(wxr, page_data[-1], expanded_child)
            for raw_piece in child_text.split(","):
                piece = raw_piece.strip()
                if piece not in MORPHOLOGICAL_TEMPLATE_TAGS:
                    continue
                mapped = MORPHOLOGICAL_TEMPLATE_TAGS[piece]
                if isinstance(mapped, str):
                    page_data[-1].tags.append(mapped)
                elif isinstance(mapped, list):
                    page_data[-1].tags.extend(mapped)

        # Forms listed directly in template parameters.
        for param, tag in form_param_tags.items():
            if param not in t_node.template_parameters:
                continue
            forms_text = clean_node(
                wxr, None, t_node.template_parameters[param]
            )
            for raw_form in forms_text.split(","):
                form_text = raw_form.strip()
                if form_text == "":
                    continue
                page_data[-1].forms.append(Form(form=form_text, tags=[tag]))

168 

169 

def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch a section to its extractor based on the lower-cased title.

    The matching extractor updates ``page_data[-1]`` (or ``page_data`` for
    the morphological and semantic sections).  Several branches also require
    the corresponding ``wxr.config.capture_*`` flag; when the flag is off
    the title falls through to the later branches.  Titles nothing handles
    are reported with ``wxr.wtp.debug``.  Afterwards all child level nodes
    are parsed recursively and the section-end templates of ``level_node``
    are processed.
    """
    morphology_titles = (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    )
    phrase_titles = (
        "фразеологизмы и устойчивые сочетания",
        "типичные сочетания",
        "фразеологизмы",
        "пословицы и поговорки",
    )
    translation_titles = ("перевод", "иноязычные аналоги")
    roman_titles = ("латиница (latinça)", "латиница (latinca)")
    # Known sections that are intentionally not extracted.
    ignored_titles = ("библиография", "прочее")

    title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(title)

    if title in morphology_titles:
        extract_morphological_section(wxr, page_data, level_node)
    elif title in POS_TITLES:
        pos_data = POS_TITLES[title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif title in ("значение", "значения"):
        extract_gloss(wxr, page_data[-1], level_node)
    elif title == "родственные слова" and wxr.config.capture_linkages:
        # Word family
        for related_template in level_node.find_child(NodeKind.TEMPLATE):
            if related_template.template_name == "родств-блок":
                process_related_block_template(
                    wxr, page_data[-1], related_template
                )
    elif title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif title in phrase_titles and wxr.config.capture_linkages:
        extract_phrase_section(wxr, page_data[-1], level_node, title)
    elif title in translation_titles and wxr.config.capture_translations:
        extract_translations(wxr, page_data[-1], level_node)
    elif title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkages(
            wxr, page_data[-1], LINKAGE_TITLES[title], level_node
        )
    elif title in ignored_titles:
        pass
    elif title in roman_titles:
        parse_roman_section(wxr, page_data[-1], level_node)
    elif title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    else:
        wxr.wtp.debug(
            f"Unprocessed section {title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, child_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

241 

242 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as plain dicts.

    Each level-1 node is treated as a language section whose language code is
    the name of the first template in its heading ("unknown" if there is
    none).  Sections whose code is not in
    ``wxr.config.capture_language_codes`` (when that is set) are skipped.
    Every level-2 node starts a new entry copied from the language-level base
    data; its level-3 nodes go through ``parse_section``.  Entries that stay
    identical to the base data, or that have no level-3 section, are dropped
    and the node is re-examined by ``extract_low_quality_page``.  Entries
    left without senses get a single "no-gloss" sense.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    def apply_pos(entry: WordEntry, pos_data: POSSubtitleData | None) -> None:
        # Copy POS and its tags onto the entry when POS data was found.
        if pos_data is not None:
            entry.pos = pos_data["pos"]
            entry.tags.extend(pos_data.get("tags", []))

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # The first template in the heading names the language code.
        heading_template = next(
            iter(level1_node.find_content(NodeKind.TEMPLATE)), None
        )
        if heading_template is None:
            lang_code = ""
        else:
            lang_code = heading_template.template_name.strip(" -")
        if lang_code == "":
            lang_code = "unknown"

        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes is not None and lang_code not in wanted_codes:
            continue

        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        apply_pos(base_data, get_pos(wxr, level1_node))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                apply_pos(base_data, get_pos(wxr, level2_node))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)

            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True

            # Nothing was extracted: drop the copy and try the form templates.
            if page_data[-1] == base_data or not has_level3:
                page_data.pop()
                extract_low_quality_page(wxr, page_data, base_data, level2_node)

        # Level nodes directly under the language heading that are not
        # level 2.
        other_levels = level1_node.find_child(
            LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2
        )
        for position, other_level_node in enumerate(other_levels):
            needs_new_entry = position == 0 and (
                len(page_data) == 0
                or page_data[-1].lang_code != base_data.lang_code
            )
            if needs_new_entry:
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, other_level_node)

        if len(page_data) > 0 and page_data[-1] == base_data:
            page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

318 

319 

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-…" templates among the non-level children of a node.

    A child that is itself a template whose name starts with "Форма-" is
    processed directly.  Any other ``WikiNode`` child is searched recursively
    for such templates.  Each match is passed to ``process_form_template``.
    """
    form_prefix = "Форма-"
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            form_prefix
        ):
            form_templates = (node,)
        elif isinstance(node, WikiNode):
            # Lazy, so processing stays interleaved with the tree walk.
            form_templates = (
                nested
                for nested in node.find_child_recursively(NodeKind.TEMPLATE)
                if nested.template_name.startswith(form_prefix)
            )
        else:
            continue
        for form_template in form_templates:
            process_form_template(wxr, page_data, base_data, form_template)

337 

338 

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Create a form-of entry from a "Форма-…" template.

    POS data derived from the template is written to ``base_data``, which is
    mutated in place.  A deep copy of ``base_data`` then receives one sense
    per non-empty list item of the expanded template.  Each sense is marked
    "form-of" and linked to the base word from the "база" parameter (or the
    first positional one) when that is non-empty.  The "МФА" parameter, if
    non-empty, is added as an IPA sound.  The copy is appended to
    ``page_data`` only if it has at least one sense or sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    params = template_node.template_parameters
    base_word_arg = params.get("база", params.get(1, ""))
    form_of = clean_node(wxr, None, base_word_arg)
    ipa = clean_node(wxr, None, params.get("МФА", ""))
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )

    new_entry = base_data.model_copy(deep=True)
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if form_of:
            sense.form_of.append(AltForm(word=form_of))
            sense.tags.append("form-of")
        new_entry.senses.append(sense)

    if ipa:
        new_entry.sounds.append(Sound(ipa=ipa))

    if not new_entry.senses and not new_entry.sounds:
        return
    # The return value is discarded; the call passes the entry to
    # clean_node as its second argument.
    clean_node(wxr, new_entry, template_node)
    page_data.append(new_entry)

380 

381 

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add each non-empty link child of the section as a "romanization" form."""
    for link_node in level_node.find_child(NodeKind.LINK):
        romanized = clean_node(wxr, None, link_node)
        if romanized == "":
            continue
        word_entry.forms.append(Form(form=romanized, tags=["romanization"]))

390 

391 

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle category-link and "zh-forms" templates directly under a section.

    Templates whose name is in the category-link set below are run through
    ``clean_node`` with ``word_entry`` as the target.  "zh-forms" templates
    are passed to ``extract_zh_forms_template``.  Other templates are
    ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = {
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    }
    for t_node in level_node.find_child(NodeKind.TEMPLATE):
        name = t_node.template_name
        if name in category_template_names:
            clean_node(wxr, word_entry, t_node)
        elif name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, t_node)

413 

414 

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    Named parameters matching ``s<digits>`` and ``t<digits>`` become forms
    tagged "Simplified Chinese" and "Traditional Chinese" respectively,
    unless the cleaned value is empty or equals the page title.  The "lit"
    parameter sets ``base_data.literal_meaning``.  Non-string (positional)
    parameter names are skipped.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_patterns = (
        (r"s\d*", "Simplified Chinese"),
        (r"t\d*", "Traditional Chinese"),
    )
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)
            continue
        for pattern, script_tag in script_patterns:
            if not re.fullmatch(pattern, p_name):
                continue
            form_data = Form(
                form=clean_node(wxr, None, p_value), tags=[script_tag]
            )
            if form_data.form not in ["", wxr.wtp.title]:
                base_data.forms.append(form_data)
            # A name can match only one of the two patterns.
            break

441 

442 

def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed spelling from a heading template as a "stressed" form.

    For every "заголовок"/"з" template returned by
    ``level_node.find_content``, the "ударение" parameter is cleaned and
    anything from the first "(" onwards is cut off.  The result is added as
    a form unless it is empty or equal to the page title.
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ["заголовок", "з"]:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed_form = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        before_paren, paren, _ = stressed_form.partition("(")
        if paren:
            # Keep only the text in front of the first parenthesis.
            stressed_form = before_paren.strip()
        if stressed_form in ["", wxr.wtp.title]:
            continue
        word_entry.forms.append(Form(form=stressed_form, tags=["stressed"]))

458 Form(form=stressed_form, tags=["stressed"]) 

459 )