Coverage for src/wiktextract/extractor/ru/page.py: 76%

1import re

2from typing import Any

4from wikitextprocessor.parser import (

5 LEVEL_KIND_FLAGS,

6 LevelNode,

7 NodeKind,

8 TemplateNode,

9 WikiNode,

10)

12from ...config import POSSubtitleData

13from ...page import clean_node

14from ...wxr_context import WiktextractContext

15from ...wxr_logging import logger

16from .etymology import extract_etymology

17from .gloss import extract_gloss, process_meaning_template

18from .inflection import (

19 extract_прил_ru_comparative_forms,

20 parse_html_forms_table,

21 parse_wikitext_forms_table,

22)

23from .linkage import (

24 extract_alt_form_section,

25 extract_linkage_section,

26 extract_phrase_section,

27)

28from .models import AltForm, Form, Sense, Sound, WordEntry

29from .pronunciation import (

30 extract_homophone_section,

31 extract_pronunciation_section,

32 extract_rhyme_section,

33)

34from .section_titles import (

35 ALT_FORM_SECTIONS,

36 LINKAGE_TITLES,

37 POS_TEMPLATE_NAMES,

38 POS_TITLES,

39)

40from .tags import MORPHOLOGICAL_TEMPLATE_TAGS

41from .translation import extract_translations

def process_semantic_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: LevelNode
) -> None:
    """Collect senses from the "значение" templates of a semantic section.

    Every template found recursively inside the list nodes that are direct
    children of ``level_node`` is inspected.  Each template named
    "значение" is handed to ``process_meaning_template``; the sense it
    returns is appended to the last entry of ``page_data`` when it has at
    least one gloss.
    """
    for list_node in level_node.find_child(NodeKind.LIST):
        for template_node in list_node.find_child_recursively(
            NodeKind.TEMPLATE
        ):
            if template_node.template_name != "значение":
                continue
            sense = process_meaning_template(
                wxr, None, page_data[-1], template_node
            )
            # Senses without any gloss are dropped.
            if sense.glosses:
                page_data[-1].senses.append(sense)

# Maps values of the "тип" argument of the "morph" template (short and long
# spellings) to the part-of-speech string stored in the word entry.
MORPH_TEMPLATE_ARGS: dict[str, str] = {
    "p": "prefix",
    "prefix": "prefix",
    "i": "interfix",
    "interfix": "interfix",
    "in": "infix",
    "infix": "infix",
    "s": "suffix",
    "suffix": "suffix",
    "t": "transfix",
    "transfix": "transfix",
    # "postfix" and "ending" are both recorded as "suffix"
    "po": "suffix",
    "postfix": "suffix",
    "c": "circumfix",
    "confix": "circumfix",
    "circumfix": "circumfix",
    "r": "root",
    "e": "suffix",
    "ending": "suffix",
}

def get_pos_from_template(
    wxr: WiktextractContext, template_node: TemplateNode
) -> POSSubtitleData | None:
    """Derive part-of-speech data from a single template node.

    The lower-cased template name is checked in this order:

    1. "morph": the "тип" argument is looked up in ``MORPH_TEMPLATE_ARGS``
       and, on a hit, returned with the "morpheme" tag.
    2. "заголовок" / "з" with a first positional argument: the first word
       of that argument (outer parentheses stripped) is looked up in
       ``POS_TITLES``.
    3. names starting with "прил ru": the "часть речи" argument is searched
       for any ``POS_TITLES`` key; without that argument the result is
       ``{"pos": "adj"}``.
    4. otherwise every space- and hyphen-separated piece of the name is
       looked up in ``POS_TEMPLATE_NAMES``.

    Returns ``None`` when nothing matches.
    """
    # Search for POS in template names
    template_name = template_node.template_name.lower()
    if template_name == "morph":
        # https://ru.wiktionary.org/wiki/Шаблон:morph
        pos_type = template_node.template_parameters.get("тип", "")
        if pos_type in MORPH_TEMPLATE_ARGS:
            return {
                "pos": MORPH_TEMPLATE_ARGS[pos_type],
                "tags": ["morpheme"],
            }
    elif (
        template_name in {"заголовок", "з"}
        and 1 in template_node.template_parameters
    ):
        pos_words = (
            clean_node(wxr, None, template_node.template_parameters[1])
            .strip("()")
            .split()
        )
        # An empty or whitespace-only argument (e.g. "( )") has no first
        # word; bail out instead of indexing into an empty list.
        if not pos_words:
            return None
        if pos_words[0] in POS_TITLES:
            return POS_TITLES[pos_words[0]]

    if template_name.startswith("прил ru"):
        pos_arg = clean_node(
            wxr, None, template_node.template_parameters.get("часть речи", "")
        ).lower()
        if pos_arg != "":
            for pos_string, pos_data in POS_TITLES.items():
                if pos_string in pos_arg:
                    return pos_data
        else:
            return {"pos": "adj"}

    for part in template_name.split(maxsplit=2):
        for subpart in part.split("-", maxsplit=2):
            if subpart in POS_TEMPLATE_NAMES:
                return POS_TEMPLATE_NAMES[subpart]

    return None

122

123

def get_pos(
    wxr: WiktextractContext, level_node: WikiNode
) -> POSSubtitleData | None:
    """Find part-of-speech data for a section node.

    Templates that are direct children of ``level_node`` are tried first,
    then templates yielded by ``level_node.find_content``; the first one
    for which ``get_pos_from_template`` returns data wins.  As a last
    resort the cleaned, lower-cased text of the node's non-heading children
    is searched for any key of ``POS_TITLES``.

    Returns ``None`` when no part of speech can be determined.
    """
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        pos_data = get_pos_from_template(wxr, template_node)
        if pos_data is not None:
            return pos_data
    # POS text could also be in the level node content
    for template_node in level_node.find_content(NodeKind.TEMPLATE):
        pos_data = get_pos_from_template(wxr, template_node)
        if pos_data is not None:
            return pos_data

    # Search for POS in section text; lower-case once, outside the loop.
    text = clean_node(
        wxr, None, list(level_node.invert_find_child(LEVEL_KIND_FLAGS))
    ).lower()
    for pos_string, pos_data in POS_TITLES.items():
        if pos_string in text:
            return pos_data

    return None

144

145

def extract_morphological_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Extract POS, inflection tables, hyphenation and tags from a section.

    The part of speech found by ``get_pos`` (if any) is written to the last
    entry of ``page_data``.  Each template that is a direct child of
    ``level_node`` is then expanded:

    * the expansion is passed through ``clean_node`` with the entry so that
      category links are recorded;
    * for templates whose name starts with one of the known inflection
      prefixes, wikitext and HTML tables are parsed into forms and the
      "слоги" argument is stored as the hyphenation;
    * for "прил ru" templates the comparative forms are extracted;
    * comma-separated pieces of the expanded text that appear in
      ``MORPHOLOGICAL_TEMPLATE_TAGS`` are added as tags.
    """
    inflection_template_prefixes = (
        "прил ru",
        "прил-ru",
        "сущ ",
        "сущ-ru",
        "гл ",
        "мест ru ",
        "числ ru ",
        "числ-",
        "прич ru ",
        "Фам ru ",
        "падежи ",
    )

    pos_data = get_pos(wxr, level_node)
    if pos_data is not None:
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))

    for child_node in level_node.find_child(NodeKind.TEMPLATE):
        expanded_template = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(child_node), expand_all=True
        )
        clean_node(wxr, page_data[-1], expanded_template)  # add category links
        if child_node.template_name.startswith(inflection_template_prefixes):
            for table_node in expanded_template.find_child_recursively(
                NodeKind.TABLE
            ):
                parse_wikitext_forms_table(wxr, page_data[-1], table_node)
            for table_tag in expanded_template.find_html("table"):
                parse_html_forms_table(wxr, page_data[-1], table_tag)
            page_data[-1].hyphenation = clean_node(
                wxr, None, child_node.template_parameters.get("слоги", "")
            )

        if child_node.template_name.startswith("прил ru"):
            extract_прил_ru_comparative_forms(
                wxr, page_data[-1], expanded_template
            )

        for node in expanded_template.children:
            node_text = clean_node(wxr, page_data[-1], node)
            for text in node_text.split(","):
                text = text.strip()
                if text not in MORPHOLOGICAL_TEMPLATE_TAGS:
                    continue
                # A table value is either a single tag or a list of tags.
                tr_tag = MORPHOLOGICAL_TEMPLATE_TAGS[text]
                if isinstance(tr_tag, str):
                    page_data[-1].tags.append(tr_tag)
                elif isinstance(tr_tag, list):
                    page_data[-1].tags.extend(tr_tag)

198

199

def parse_section(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Dispatch one section to its extractor, then recurse into subsections.

    The cleaned, lower-cased heading text selects the extractor.  Several
    branches are additionally gated by a ``wxr.config.capture_*`` flag; when
    the flag is off the title falls through the remaining branches and may
    end up reported as unprocessed.  After dispatching, every child level
    node is parsed recursively and the section's trailing category
    templates are handled by ``extract_section_end_templates``.
    """
    section_title = clean_node(wxr, None, level_node.largs).lower()
    wxr.wtp.start_subsection(section_title)
    if section_title in (
        # Morphological and syntactic properties
        "морфологические и синтаксические свойства",
        # Type and syntactic properties of the word combination
        "тип и синтаксические свойства сочетания",
        "тип и свойства сочетания",
    ):
        extract_morphological_section(wxr, page_data, level_node)
    elif section_title in POS_TITLES:
        pos_data = POS_TITLES[section_title]
        page_data[-1].pos = pos_data["pos"]
        page_data[-1].tags.extend(pos_data.get("tags", []))
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "произношение" and wxr.config.capture_pronunciation:
        extract_pronunciation_section(wxr, page_data[-1], level_node)
    elif section_title == "семантические свойства":  # Semantic properties
        process_semantic_section(wxr, page_data, level_node)
    elif section_title in (
        "значение",
        "значения",
        "как самостоятельный глагол",
        "в значении вспомогательного глагола или связки",
    ):
        extract_gloss(wxr, page_data[-1], level_node)
    elif section_title == "этимология" and wxr.config.capture_etymologies:
        extract_etymology(wxr, page_data[-1], level_node)
    elif (
        section_title
        in (
            "фразеологизмы и устойчивые сочетания",
            "типичные сочетания",
            "фразеологизмы",
            "пословицы и поговорки",
        )
        and wxr.config.capture_linkages
    ):
        extract_phrase_section(wxr, page_data[-1], level_node, section_title)
    elif (
        section_title in ("перевод", "иноязычные аналоги")
        and wxr.config.capture_translations
    ):
        extract_translations(wxr, page_data[-1], level_node)
    elif section_title in LINKAGE_TITLES and wxr.config.capture_linkages:
        extract_linkage_section(
            wxr, page_data[-1], LINKAGE_TITLES[section_title], level_node
        )
    elif section_title == "библиография":
        pass  # deliberately ignored
    elif section_title in ("латиница (latinça)", "латиница (latinca)"):
        parse_roman_section(wxr, page_data[-1], level_node)
    elif section_title == "прочее":
        pass  # deliberately ignored
    elif section_title == "омофоны" and wxr.config.capture_pronunciation:
        extract_homophone_section(wxr, page_data[-1], level_node)
    elif section_title in ALT_FORM_SECTIONS:
        extract_alt_form_section(
            wxr, page_data[-1], level_node, ALT_FORM_SECTIONS[section_title]
        )
    elif section_title == "рифмы":
        extract_rhyme_section(wxr, page_data[-1], level_node)
    elif section_title not in (
        "см. также",
        "смотреть также",
        "смотрите также",
    ):
        # "See also" headings are skipped silently; anything else is logged.
        wxr.wtp.debug(
            f"Unprocessed section {section_title}",
            sortid="wixtextract/extractor/ru/page/parse_section/66",
        )

    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, next_level_node)

    extract_section_end_templates(wxr, page_data[-1], level_node)

275

276

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Russian Wiktionary page into a list of word-entry dicts.

    Each level-1 node is treated as a language section whose code is the
    name of the first template in its heading ("unknown" if there is none).
    Sections whose code is not in ``wxr.config.capture_language_codes`` (when
    that is set) are skipped.  A base entry is built per language; every
    level-2 node gets its own copy that is filled by ``parse_section`` from
    its level-3 children, and any other level node directly below the
    language section is parsed into the most recent entry.  Entries that
    end up identical to the base entry are discarded, and
    ``extract_low_quality_page`` is given the chance to add form-of entries.

    Entries without senses receive a single "no-gloss" sense.  The result
    is each entry dumped with ``model_dump(exclude_defaults=True)``.
    """
    # Help site describing page structure:
    # https://ru.wiktionary.org/wiki/Викисловарь:Правила_оформления_статей

    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level1_node in tree.find_child(NodeKind.LEVEL1):
        # The language code is the name of the first heading template.
        lang_code = ""
        for subtitle_template in level1_node.find_content(NodeKind.TEMPLATE):
            lang_code = subtitle_template.template_name.strip(" -")
            break
        if lang_code == "":
            lang_code = "unknown"
        if (
            wxr.config.capture_language_codes is not None
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        categories = {"categories": []}
        lang_name = clean_node(wxr, categories, level1_node.largs)
        wxr.wtp.start_section(lang_name)
        base_data = WordEntry(
            lang=lang_name,
            lang_code=lang_code,
            word=page_title,
            pos="unknown",
        )
        base_data.categories.extend(categories["categories"])
        extract_section_end_templates(wxr, base_data, level1_node)
        pos_data = get_pos(wxr, level1_node)
        if pos_data is not None:
            base_data.pos = pos_data["pos"]
            base_data.tags.extend(pos_data.get("tags", []))

        for level2_node in level1_node.find_child(NodeKind.LEVEL2):
            if base_data.pos == "unknown":
                pos_data = get_pos(wxr, level2_node)
                if pos_data is not None:
                    base_data.pos = pos_data["pos"]
                    base_data.tags.extend(pos_data.get("tags", []))
            page_data.append(base_data.model_copy(deep=True))
            extract_level2_node_contents(wxr, page_data[-1], level2_node)
            has_level3 = False
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                parse_section(wxr, page_data, level3_node)
                has_level3 = True
            # Drop the copy when nothing was added to it.
            if page_data[-1] == base_data or not has_level3:
                page_data.pop()
            extract_low_quality_page(wxr, page_data, base_data, level2_node)

        for any_level_index, any_level_node in enumerate(
            level1_node.find_child(LEVEL_KIND_FLAGS & ~NodeKind.LEVEL2)
        ):
            # Sections outside any level-2 node need an entry of this
            # language to write into; create one before the first of them.
            if any_level_index == 0 and (
                len(page_data) == 0
                or page_data[-1].lang_code != base_data.lang_code
            ):
                page_data.append(base_data.model_copy(deep=True))
            parse_section(wxr, page_data, any_level_node)

        if len(page_data) > 0 and page_data[-1] == base_data:
            page_data.pop()
        extract_low_quality_page(wxr, page_data, base_data, level1_node)

    for entry in page_data:
        if len(entry.senses) == 0:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

352

353

def extract_low_quality_page(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> None:
    """Process "Форма-" templates found outside of subsections.

    Only the children of ``level_node`` that are not level nodes are
    examined.  A child that is itself a template whose name starts with
    "Форма-" is processed directly; any other wiki node is searched
    recursively for such templates.  Each hit is passed to
    ``process_form_template``.
    """
    for node in level_node.invert_find_child(LEVEL_KIND_FLAGS):
        if isinstance(node, TemplateNode) and node.template_name.startswith(
            "Форма-"
        ):
            process_form_template(wxr, page_data, base_data, node)
        elif isinstance(node, WikiNode):
            for template_node in node.find_child_recursively(NodeKind.TEMPLATE):
                if template_node.template_name.startswith("Форма-"):
                    process_form_template(
                        wxr, page_data, base_data, template_node
                    )

371

372

def process_form_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Build a form-of entry from a "Форма-…" template.

    The part of speech derived from the template is written to
    ``base_data`` itself (so it also affects later copies).  A deep copy of
    ``base_data`` then receives one sense per non-empty list item of the
    expanded template; when the "база" argument (or, failing that, the
    first positional argument) is non-empty each sense is marked as a form
    of that word.  The "МФА" argument, if present, is added as an IPA sound.
    The copy is appended to ``page_data`` only when it has a sense or a
    sound.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:Форма-сущ
    # Шаблон:Форма-гл, "Шаблон:форма-гл en"
    pos_data = get_pos_from_template(wxr, template_node)
    if pos_data is not None:
        base_data.pos = pos_data["pos"]
        base_data.tags.extend(pos_data.get("tags", []))

    form_of = clean_node(
        wxr,
        None,
        template_node.template_parameters.get(
            "база", template_node.template_parameters.get(1, "")
        ),
    )
    ipa = clean_node(
        wxr, None, template_node.template_parameters.get("МФА", "")
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    current_data = base_data.model_copy(deep=True)
    for list_item in expanded_node.find_child_recursively(NodeKind.LIST_ITEM):
        gloss_text = clean_node(wxr, None, list_item.children)
        if not gloss_text:
            continue
        sense = Sense(glosses=[gloss_text])
        if form_of:
            sense.form_of.append(AltForm(word=form_of))
            sense.tags.append("form-of")
        current_data.senses.append(sense)

    if ipa:
        current_data.sounds.append(Sound(ipa=ipa))
    if current_data.senses or current_data.sounds:
        # Run clean_node with the entry so category links are recorded.
        clean_node(wxr, current_data, template_node)
        page_data.append(current_data)

414

415

def parse_roman_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Add the section's links to ``word_entry`` as romanization forms.

    Each link node that is a direct child of ``level_node`` is cleaned to
    text; non-empty results are appended to ``word_entry.forms`` tagged
    "romanization".
    """
    for link_node in level_node.find_child(NodeKind.LINK):
        form_text = clean_node(wxr, None, link_node)
        if form_text == "":
            continue
        word_entry.forms.append(Form(form=form_text, tags=["romanization"]))

424

425

def extract_section_end_templates(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: WikiNode
) -> None:
    """Handle categorising and "zh-forms" templates directly under a section.

    Templates named in the set below are passed through ``clean_node``
    together with ``word_entry``; "zh-forms" templates are delegated to
    ``extract_zh_forms_template``.  Other templates are ignored.
    """
    # category link templates
    # https://ru.wiktionary.org/wiki/Категория:Викисловарь:Шаблоны_категоризации
    category_template_names = {
        "-ание",
        "-атель",
        "-ация",
        "-ение",
        "-ка",
        "длина слова",
        "Категория",
        "Омонимы",
        "forms",
        "multilang",
    }
    for template_node in level_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name in category_template_names:
            clean_node(wxr, word_entry, template_node)
        elif template_node.template_name == "zh-forms":
            extract_zh_forms_template(wxr, word_entry, template_node)

447

448

def extract_zh_forms_template(
    wxr: WiktextractContext,
    base_data: WordEntry,
    template_node: TemplateNode,
) -> None:
    """Read forms and the literal meaning from a "zh-forms" template.

    Named arguments "s", "s1", "s2", … become forms tagged
    "Simplified Chinese" and "t", "t1", … forms tagged
    "Traditional Chinese"; empty values and values equal to the page title
    are skipped.  The "lit" argument is stored as the literal meaning.
    Positional (non-string) argument names are ignored.
    """
    # https://ru.wiktionary.org/wiki/Шаблон:zh-forms
    # https://ru.wiktionary.org/wiki/Модуль:zh-forms
    # similar to en and zh edition template
    script_tags = {
        "s": "Simplified Chinese",
        "t": "Traditional Chinese",
    }
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if p_name == "lit":
            base_data.literal_meaning = clean_node(wxr, None, p_value)
            continue
        # One pattern covers both the "s…" and the "t…" argument families.
        script_match = re.fullmatch(r"([st])\d*", p_name)
        if script_match is None:
            continue
        form_data = Form(
            form=clean_node(wxr, None, p_value),
            tags=[script_tags[script_match.group(1)]],
        )
        if form_data.form not in ["", wxr.wtp.title]:
            base_data.forms.append(form_data)

475

476

def extract_level2_node_contents(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add the stressed form given in a level-2 heading template.

    For each "заголовок" / "з" template yielded by
    ``level_node.find_content`` the "ударение" argument is cleaned and cut
    at the first "(".  If the result is non-empty and differs from the page
    title it is appended to ``word_entry.forms`` tagged "stressed".
    """
    for t_node in level_node.find_content(NodeKind.TEMPLATE):
        if t_node.template_name not in ["заголовок", "з"]:
            continue
        # https://ru.wiktionary.org/wiki/Шаблон:з
        stressed_form = clean_node(
            wxr, None, t_node.template_parameters.get("ударение", "")
        )
        if "(" in stressed_form:
            # Keep only the text before the parenthesised part.
            stressed_form = stressed_form[: stressed_form.index("(")].strip()
        if stressed_form not in ["", wxr.wtp.title]:
            word_entry.forms.append(
                Form(form=stressed_form, tags=["stressed"])
            )

493 )