Coverage for src/wiktextract/wiktionary.py: 70% (300 statements). Generated by coverage.py v7.6.10 at 2024-12-27 08:07 +0000.

# Wiktionary parser for extracting a lexicon and various other information
# from wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and
# https://ylonen.org

import io
import json
import os
import re
import tarfile
import tempfile
import time
import traceback
from multiprocessing import Pool, current_process
from pathlib import Path
from typing import TextIO

from wikitextprocessor import Page
from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
from wikitextprocessor.dumpparser import process_dump

from .import_utils import import_extractor_module
from .page import parse_page
from .thesaurus import (
    emit_words_in_thesaurus,
    extract_thesaurus_data,
    thesaurus_linkage_number,
)
from .wxr_context import WiktextractContext
from .wxr_logging import logger


def page_handler(
    page: Page,
) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
    # Make sure there are no newlines or other strange characters in the
    # title. They could cause security problems at several post-processing
    # steps.
    # We've given the page_handler function an extra wxr attribute previously.
    # This should never cause an exception, and if it does, we want it to.
    wxr: WiktextractContext = page_handler.wxr  # type:ignore[attr-defined]
    # Helps debug extraction hangs. This writes the title of each page being
    # processed into /tmp/wiktextract*/wiktextract-* (one file per worker
    # process). Once a hang has been observed, these files identify the
    # page(s) that hang. They should be checked before aborting the process,
    # as an interrupt might delete them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
        debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(page.title + "\n")

        wxr.wtp.start_page(page.title)
        try:
            title = re.sub(r"[\s\000-\037]+", " ", page.title)
            title = title.strip()
            if page.redirect_to is not None:
                page_data = [
                    {
                        "title": title,
                        "redirect": page.redirect_to,
                        "pos": "hard-redirect",
                    }
                ]
            else:
                # XXX Sign gloss pages?
                start_t = time.time()
                page_data = parse_page(wxr, title, page.body)  # type: ignore[arg-type]
                dur = time.time() - start_t
                if dur > 100:
                    logger.warning(
                        "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
                            dur, title
                        )
                    )

            return page_data, wxr.wtp.to_return()
        except Exception:
            wxr.wtp.error(
                f'=== EXCEPTION while parsing page "{page.title}" '
                f"in process {current_process().name}",
                traceback.format_exc(),
                "page_handler_exception",
            )
            return [], wxr.wtp.to_return()
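
# Return-value contract, restated as a sketch (illustrative only): redirects
# yield a single stub entry, parsed pages yield entries from parse_page(),
# and failures yield an empty list; every variant is paired with the
# collected error data:
#
#     page_data, stats = page_handler(page)
#     # page_data == [] on exception; stats == wxr.wtp.to_return() either way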


def parse_wiktionary(
    wxr: WiktextractContext,
    dump_path: str,
    num_processes: int | None,
    phase1_only: bool,
    namespace_ids: set[int],
    out_f: TextIO,
    human_readable: bool = False,
    override_folders: list[str] | list[Path] | None = None,
    skip_extract_dump: bool = False,
    save_pages_path: str | Path | None = None,
) -> None:
    """Parses Wiktionary from the dump file ``dump_path`` (which should
    point to an "enwiktionary-<date>-pages-articles.xml.bz2" file) and
    writes the extracted data to ``out_f``."""
    capture_language_codes = wxr.config.capture_language_codes
    if capture_language_codes is not None:
        assert isinstance(capture_language_codes, (list, tuple, set))
        for x in capture_language_codes:
            assert isinstance(x, str)

    logger.info("First phase - extracting templates, macros, and pages")
    if override_folders is not None:
        override_folders = [Path(folder) for folder in override_folders]
    if save_pages_path is not None:
        save_pages_path = Path(save_pages_path)

    analyze_template_mod = import_extractor_module(
        wxr.wtp.lang_code, "analyze_template"
    )
    process_dump(
        wxr.wtp,
        dump_path,
        namespace_ids,
        override_folders,
        skip_extract_dump,
        save_pages_path,
        analyze_template_mod.analyze_template
        if analyze_template_mod is not None
        else None,
    )

    if not phase1_only:
        reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
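
# A minimal usage sketch (hypothetical file names and settings): run both
# phases over an English dump, writing one JSON object per line to out_f:
#
#     with open("wiktextract.jsonl", "w", encoding="utf-8") as out_f:
#         parse_wiktionary(
#             wxr,
#             "enwiktionary-20240101-pages-articles.xml.bz2",
#             num_processes=4,
#             phase1_only=False,
#             namespace_ids={0, 10},  # assumed: Main and Template namespaces
#             out_f=out_f,
#         )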


def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
    if out_f is not None:
        if human_readable:
            out_f.write(
                json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
            )
        else:
            out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")
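
# With human_readable=False the output is JSON Lines: one object per line.
# A small reader sketch (hypothetical file name):
#
#     with open("wiktextract.jsonl", encoding="utf-8") as f:
#         entries = [json.loads(line) for line in f]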


def estimate_progress(
    processed_pages: int, all_pages: int, start_time: float, last_time: float
) -> float:
    current_time = time.time()
    processed_pages += 1
    if current_time - last_time > 1:
        remaining_pages = all_pages - processed_pages
        estimate_seconds = (
            (current_time - start_time) / processed_pages * remaining_pages
        )
        logger.info(
            " ... {}/{} pages ({:.1%}) processed, "
            "{:02d}:{:02d}:{:02d} remaining".format(
                processed_pages,
                all_pages,
                processed_pages / all_pages,
                int(estimate_seconds / 3600),
                int(estimate_seconds / 60 % 60),
                int(estimate_seconds % 60),
            )
        )
        last_time = current_time
    return last_time
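
# The estimate extrapolates the average per-page time so far: with 2,000 of
# 10,000 pages done after 100 s, the rate is 0.05 s/page, so the remaining
# 8,000 pages are estimated at 8000 * 0.05 = 400 s, shown as 00:06:40.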


def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
    wxr.reconnect_databases()
    worker_func.wxr = wxr
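
# This is the Pool initializer that gives page_handler its wxr attribute:
# each worker process runs it once at startup. The wiring used by
# reprocess_wiktionary below, in sketch form:
#
#     with Pool(num_processes, init_worker_process, (page_handler, wxr)) as p:
#         ...  # p.imap_unordered(page_handler, pages)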


def check_error(
    wxr: WiktextractContext,
    dt: dict,
    word: str | None,
    lang: str | None,
    pos: str | None,
    msg: str,
    called_from: str | None = None,
) -> None:
    """Formats and outputs an error message about data format checks."""
    if called_from is None:
        called_from = "wiktionary/179/20240425"
    else:
        called_from = "wiktionary/179/20240425" + called_from
    msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
    prefix = word or ""
    if lang:
        prefix += "/" + lang
    if pos:
        prefix += "/" + pos
    if prefix:
        msg = prefix + ": " + msg
    print(msg)
    config = wxr.config
    if len(config.debugs) > 100000:  # Avoid excessive size
        return
    error_data: ErrorMessageData = {
        "msg": msg,
        "trace": "",
        "title": word,
        "section": lang,
        "subsection": pos,
        "called_from": called_from,
        "path": tuple(),
    }
    config.debugs.append(error_data)
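
# Messages are prefixed with "word/lang/pos" when available; e.g. a call
# like the following (illustrative values) prints
# 'dog/English/noun: missing "lang_code" field in data: {...}':
#
#     check_error(wxr, dt, "dog", "English", "noun",
#                 'missing "lang_code" field in data')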


def check_tags(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
) -> None:
    assert isinstance(item, dict)
    tags = item.get("tags")
    if tags is None:
        return
    if not isinstance(tags, (list, tuple)):
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            '"tags" field value must be a list of strings: {}'.format(
                repr(tags)
            ),
        )
        return
    for tag in tags:
        if not isinstance(tag, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"tags" field should only contain strings: {}'.format(
                    repr(tag)
                ),
            )
            continue
        # XXX enable the following later (currently too many bogus tags in
        # non-English editions). Tag values should be standardized across
        # editions, except for uppercase tags (e.g., regional variants).
        if wxr.wtp.lang_code in ("en",):  # Check edition
            from .tags import uppercase_tags, valid_tags

            if tag not in valid_tags and tag not in uppercase_tags:
                if len(tag) > 0 and tag[0].isupper():
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid uppercase tag {tag} not in uppercase_tags",
                        called_from="uppercase_tags",
                    )
                else:
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid tag {tag} not in valid_tags "
                        "or uppercase_tags",
                    )
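
# Illustrative check (assuming an English-edition wxr and that "plural" is
# in valid_tags): a lowercase grammatical tag passes silently, while a tag
# in neither valid_tags nor uppercase_tags is reported via check_error():
#
#     check_tags(wxr, dt, "dog", "English", "noun", {"tags": ["plural"]})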


def check_str_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
    mandatory: bool = False,
    empty_ok: bool = False,
) -> None:
    """Checks that each of the listed fields contains a non-empty string.
    Non-existent fields are ok unless ``mandatory`` is True."""
    assert isinstance(item, dict)
    for field in fields:
        v = item.get(field)
        if v is None:
            if mandatory:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a{} string (it is a "
                    "mandatory field): {}".format(
                        field,
                        "" if empty_ok else " non-empty",
                        json.dumps(item, sort_keys=True),
                    ),
                )
            continue
        if not isinstance(v, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a{} string: {}".format(
                    field,
                    "" if empty_ok else " non-empty",
                    json.dumps(item, sort_keys=True),
                ),
            )
        if not v and not empty_ok:
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should contain a non-empty string: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
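
# Usage sketch (invented item): a mandatory field that is present but empty
# triggers the last branch above:
#
#     check_str_fields(wxr, dt, "dog", "English", "noun", {"word": ""},
#                      ["word"], mandatory=True)
#     # reports: 'word' should contain a non-empty string: {"word": ""}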


def check_dict_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> bool:
    """Checks that each listed field, if present, is a list of dicts."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of dicts: {}".format(
                    field, json.dumps(lst, sort_keys=True)
                ),
            )
            return False
        for x in lst:
            if not isinstance(x, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of dicts: {}".format(
                        field, json.dumps(lst, sort_keys=True)
                    ),
                )
                return False
    return True


def check_str_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> None:
    """Checks that each of the listed fields contains a list of non-empty
    strings or is not present."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of non-empty strings: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
            continue
        for x in lst:
            if not isinstance(x, str) or not x:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of non-empty strings: {}".format(
                        field, json.dumps(item, sort_keys=True)
                    ),
                )
                break
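
# The three validators above compose into the per-entry checks that follow;
# a compact sketch over an invented entry:
#
#     entry = {"senses": [{"glosses": ["a domesticated canid"]}],
#              "categories": ["Mammals"]}
#     check_dict_list_fields(wxr, entry, "dog", "English", "noun",
#                            entry, ["senses"])      # ok -> True
#     check_str_list_fields(wxr, entry, "dog", "English", "noun",
#                           entry, ["categories"])   # ok, no error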


def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
    """Performs some basic checks on the generated data."""
    word = dt.get("word", dt.get("title"))
    if word is None:
        check_error(
            wxr,
            dt,
            None,
            None,
            None,
            'missing "word" or "title" field in data',
        )
        return
    if "title" in dt:
        return  # redirect pages don't have the following fields
    lang = dt.get("lang")
    if not lang:
        check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
        return
    pos = dt.get("pos")
    if not pos:
        check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
        return
    if not dt.get("lang_code"):
        check_error(
            wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
        )
    check_tags(wxr, dt, word, lang, pos, dt)
    check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
    num = dt.get("etymology_number")
    if num is not None and not isinstance(num, int):
        check_error(
            wxr, dt, word, lang, pos, '"etymology_number" must be an int'
        )
    # Check that certain fields, if present, contain lists of dicts
    if not check_dict_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        [
            "forms",
            "senses",
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
            "sounds",
            "translations",
            "descendants",
            "etymology_templates",
            "head_templates",
            "inflection_templates",
        ],
    ):
        return  # Avoid further processing because it would cause type errors
    # Check the "forms" field
    forms = dt.get("forms") or []
    for form in forms:
        check_tags(wxr, dt, word, lang, pos, form)
        tags = dt.get("tags")
        if not isinstance(tags, (list, tuple)) or "table-tags" not in tags:
            check_str_fields(
                wxr, dt, word, lang, pos, form, ["form"], mandatory=True
            )
    check_str_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        ["categories", "topics", "wikidata", "wikipedia"],
    )
    # Check the "senses" field
    senses = dt.get("senses") or []
    if not senses:
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            'missing "senses" in data (must have at least one '
            'sense, add empty sense with "no-gloss" tag if none '
            "otherwise available)",
        )
        return
    for sense in senses:
        check_str_list_fields(
            wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
        )
        # Extra check: should have no-gloss tag if no glosses
        for field in ("glosses", "raw_glosses"):
            glosses = sense.get(field) or []
            if (
                not glosses
                and isinstance(sense.get("tags"), str)
                and "no-gloss" not in sense.get("tags", "").split()
            ):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should have at least one gloss or "
                    '"no-gloss" in "tags"'.format(field),
                )
                continue
        check_tags(wxr, dt, word, lang, pos, sense)
        check_str_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            ["categories", "topics", "wikidata", "wikipedia"],
        )
        check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
        if not check_dict_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            [
                "alt_of",
                "form_of",
                "synonyms",
                "antonyms",
                "hypernyms",
                "holonyms",
                "meronyms",
                "coordinate_terms",
                "derived",
                "related",
            ],
        ):
            continue
        for field in ("alt_of", "form_of"):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
                )

        for field in (
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
        ):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_tags(wxr, dt, word, lang, pos, item)
                check_str_fields(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    item,
                    ["english", "roman", "sense", "taxonomic"],
                    mandatory=False,
                    empty_ok=True,
                )
                check_str_list_fields(
                    wxr, dt, word, lang, pos, item, ["topics"]
                )
    # Check the "sounds" field
    # We will permit having any number of different types (ipa, enpr, etc.)
    # in the same sound entry or in different sound entries.
    sounds = dt.get("sounds") or []
    for item in sounds:
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_list_fields(
            wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
        )
    # Check the "translations" field
    translations = dt.get("translations") or []
    for item in translations:
        check_str_fields(
            wxr, dt, word, lang, pos, item, ["word"], mandatory=True
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            [
                "alt",
                "code",
                "english",
                "lang",
                "note",
                "roman",
                "sense",
                "taxonomic",
            ],
        )
        if not item.get("code") and not item.get("lang"):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"translations" items must contain at least one '
                'of "code" and "lang" (normally both): {}'.format(
                    json.dumps(item, sort_keys=True, ensure_ascii=False)
                ),
            )
    # Check the "etymology_templates", "head_templates", and
    # "inflection_templates" fields
    for field in [
        "etymology_templates",
        "head_templates",
        "inflection_templates",
    ]:
        lst = dt.get(field)
        if lst is None:
            continue
        for item in lst:
            check_str_fields(
                wxr, dt, word, lang, pos, item, ["name"], mandatory=True
            )
            check_str_fields(
                wxr,
                dt,
                word,
                lang,
                pos,
                item,
                ["expansion"],
                # empty_ok=True because there are some templates
                # that generate empty expansions.
                mandatory=False,
                empty_ok=True,
            )
            args = item.get("args")
            if args is None:
                continue
            if not isinstance(args, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    '{!r} item "args" value must be a dict: {}'.format(
                        field, json.dumps(args, sort_keys=True)
                    ),
                )
                continue
            for k, v in args.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        '{!r} item "args" must be a dict with '
                        "string keys and values: {}".format(
                            field, json.dumps(args, sort_keys=True)
                        ),
                    )
                    continue
    # Check the "descendants" field
    descendants = dt.get("descendants") or []
    for item in descendants:
        check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
        depth = item.get("depth")
        if depth is not None and not isinstance(depth, int):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"descendants" field "depth" must be an int',
            )
        check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
        # XXX should check that they are valid templates, perhaps turn the
        # template checking code above into a function
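
# A minimal sanity-check sketch (invented entry) of what passes cleanly:
#
#     check_json_data(wxr, {
#         "word": "dog",
#         "lang": "English",
#         "lang_code": "en",
#         "pos": "noun",
#         "senses": [{"glosses": ["a domesticated canid"]}],
#     })
#     # prints nothing and appends nothing to wxr.config.debugs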


def reprocess_wiktionary(
    wxr: WiktextractContext,
    num_processes: int | None,
    out_f: TextIO,
    human_readable: bool = False,
    search_pattern: str | None = None,
) -> None:
    """Reprocesses the Wiktionary from the sqlite db."""
    logger.info("Second phase - processing pages")

    # Extract thesaurus data. This iterates over thesaurus pages,
    # but is very fast.
    if (
        wxr.config.extract_thesaurus_pages
        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0  # type: ignore[arg-type]
    ):
        extract_thesaurus_data(wxr, num_processes)

    emitted = set()
    process_ns_ids: list[int] = list(
        {
            wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)  # type: ignore[call-overload]
            for ns in wxr.config.extract_ns_names
        }
    )
    start_time = time.time()
    last_time = start_time
    all_page_nums = wxr.wtp.saved_page_nums(
        process_ns_ids, True, "wikitext", search_pattern
    )
    wxr.remove_unpicklable_objects()
    with Pool(num_processes, init_worker_process, (page_handler, wxr)) as pool:
        wxr.reconnect_databases(False)
        for processed_pages, (page_data, wtp_stats) in enumerate(
            pool.imap_unordered(
                page_handler,
                wxr.wtp.get_all_pages(
                    process_ns_ids, True, "wikitext", search_pattern
                ),
            )
        ):
            wxr.config.merge_return(wtp_stats)
            for dt in page_data:
                check_json_data(wxr, dt)
                write_json_data(dt, out_f, human_readable)
                word = dt.get("word")
                lang_code = dt.get("lang_code")
                pos = dt.get("pos")
                if word and lang_code and pos:
                    emitted.add((word, lang_code, pos))
            last_time = estimate_progress(
                processed_pages, all_page_nums, start_time, last_time
            )
    if wxr.config.dump_file_lang_code == "en":
        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
    logger.info("Reprocessing wiktionary complete")
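
# The pipeline above, in miniature: unpicklable state is stripped before the
# workers fork, each worker re-attaches its databases in init_worker_process,
# and results stream back unordered. A hedged sketch of the same pattern:
#
#     wxr.remove_unpicklable_objects()
#     with Pool(4, init_worker_process, (page_handler, wxr)) as pool:
#         for page_data, stats in pool.imap_unordered(page_handler, pages):
#             ...  # merge stats, validate and write each entry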


def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
    text: str = page.body if page.body is not None else page.redirect_to  # type: ignore[assignment]
    title = page.title[page.title.find(":") + 1 :]
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    title = re.sub(r"/$", r"__slash__", title)
    title = ns_name + "/" + title
    return title, text
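
# The escaping keeps tar member names safe: "." and ".." path segments and
# leading, trailing, or doubled slashes are rewritten. For example (invented
# titles), "Module:foo/bar" becomes "Module/foo/bar", and "Module:./x"
# becomes "Module/__dot__/x".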


def extract_namespace(
    wxr: WiktextractContext, namespace: str, path: str
) -> None:
    """Extracts all pages in the given namespace and writes them to a .tar
    file with the given path."""
    logger.info(
        f"Extracting pages from namespace {namespace} to tar file {path}"
    )
    ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id")  # type: ignore[assignment, call-overload]
    t = time.time()
    with tarfile.open(path, "w") as tarf:
        for page in wxr.wtp.get_all_pages([ns_id]):
            title, text = process_ns_page_title(page, namespace)
            text = text.encode("utf-8")
            f = io.BytesIO(text)
            title += ".txt"
            ti = tarfile.TarInfo(name=title)
            ti.size = len(text)
            # According to the documentation, TarInfo.mtime can be int,
            # float, or even None in newer versions, but mypy can't tell
            # because it's not annotated and assumes it can only be int
            ti.mtime = t  # type: ignore[assignment]
            ti.uid = 0
            ti.gid = 0
            ti.type = tarfile.REGTYPE
            tarf.addfile(ti, f)
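
# Reading the archive back (hypothetical file name) is plain tarfile usage;
# each member is named "<namespace>/<escaped title>.txt":
#
#     with tarfile.open("modules.tar") as tarf:
#         for member in tarf.getmembers():
#             body = tarf.extractfile(member).read().decode("utf-8")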