Coverage for src/wiktextract/wiktionary.py: 75%

302 statements  

coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1# Wiktionary parser for extracting a lexicon and various other information 

2# from wiktionary. This file contains code to uncompress the Wiktionary 

3# dump file and to separate it into individual pages. 

4# 

5# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org 

6 

7import atexit 

8import io 

9import json 

10import os 

11import re 

12import tarfile 

13import tempfile 

14import time 

15from concurrent.futures import ProcessPoolExecutor 

16from copy import deepcopy 

17from multiprocessing import current_process, get_context 

18from pathlib import Path 

19from traceback import format_exc 

20from typing import TextIO 

21 

22from wikitextprocessor import Page 

23from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData 

24from wikitextprocessor.dumpparser import process_dump 

25 

26from .import_utils import import_extractor_module 

27from .page import parse_page 

28from .thesaurus import ( 

29 emit_words_in_thesaurus, 

30 extract_thesaurus_data, 

31 thesaurus_linkage_number, 

32) 

33from .wxr_context import WiktextractContext 

34from .wxr_logging import logger 

35 

36 

37def page_handler( 

38 page: Page, 

39) -> tuple[list[dict[str, str]], CollatedErrorReturnData]: 

40 # Make sure there are no newlines or other strange characters in the 

41 # title. They could cause security problems at several post-processing 

42 # steps. 

43 # The wxr context is supplied via the module-global worker_wxr set up in

44 # init_worker(); using it should never raise, and if it does, we want the exception to propagate.

45 

46 # Helps debug extraction hangs. This writes the title of the page being

47 # processed into /tmp/wiktextract*/wiktextract-*. Once a hang

48 # has been observed, these files name the page(s) that hang. They should

49 # be checked before aborting the process, as an interrupt might delete them.

50 with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname: 

51 debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid()) 

52 with open(debug_path, "w", encoding="utf-8") as f: 

53 f.write(page.title + "\n") 

54 

55 worker_wxr.wtp.start_page(page.title) 

56 try: 

57 title = re.sub(r"[\s\000-\037]+", " ", page.title) 

58 title = title.strip() 

59 if page.redirect_to is not None: 

60 page_data = [ 

61 { 

62 "title": title, 

63 "redirect": page.redirect_to, 

64 "pos": "hard-redirect", 

65 } 

66 ] 

67 else: 

68 # XXX Sign gloss pages? 

69 start_t = time.time() 

70 page_data = parse_page(worker_wxr, title, page.body) # type: ignore[arg-type] 

71 dur = time.time() - start_t 

72 if dur > 100:  # coverage: 72 ↛ 73 (condition never true)

73 logger.warning( 

74 "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format( 

75 dur, title 

76 ) 

77 ) 

78 

79 return page_data, worker_wxr.wtp.to_return() 

80 except Exception: 

81 worker_wxr.wtp.error( 

82 f'=== EXCEPTION while parsing page "{page.title}" ' 

83 f"in process {current_process().name}", 

84 format_exc(), 

85 "page_handler_exception", 

86 ) 

87 return [], worker_wxr.wtp.to_return() 

88 
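The temporary-file breadcrumb above also suggests how to read the suspects back. A minimal sketch of a hypothetical helper (not part of wiktextract) that prints what each live worker was last processing:

import glob

def dump_hang_suspects() -> None:
    # page_handler keeps one file per worker, wiktextract-<pid>, inside a
    # tempdir prefixed "wiktextract"; the file holds the current page title.
    for path in glob.glob("/tmp/wiktextract*/wiktextract-*"):
        with open(path, encoding="utf-8") as f:
            print(path, "->", f.read().strip())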

89 

90def parse_wiktionary( 

91 wxr: WiktextractContext, 

92 dump_path: str, 

93 num_processes: int | None, 

94 phase1_only: bool, 

95 namespace_ids: set[int], 

96 out_f: TextIO, 

97 human_readable: bool = False, 

98 override_folders: list[str] | list[Path] | None = None, 

99 skip_extract_dump: bool = False, 

100 save_pages_path: str | Path | None = None, 

101) -> None: 

102 """Parses Wiktionary from the dump file ``path`` (which should point 

103 to a "enwiktionary-<date>-pages-articles.xml.bz2" file. This 

104 calls `word_cb(data)` for all words defined for languages in `languages`.""" 

105 capture_language_codes = wxr.config.capture_language_codes 

106 if capture_language_codes is not None:  # coverage: 106 ↛ 111 (condition always true)

107 assert isinstance(capture_language_codes, (list, tuple, set)) 

108 for x in capture_language_codes: 

109 assert isinstance(x, str) 

110 

111 logger.info("First phase - extracting templates, macros, and pages") 

112 if override_folders is not None:  # coverage: 112 ↛ 113 (condition never true)

113 override_folders = [Path(folder) for folder in override_folders] 

114 if save_pages_path is not None:  # coverage: 114 ↛ 115 (condition never true)

115 save_pages_path = Path(save_pages_path) 

116 

117 analyze_template_mod = import_extractor_module( 

118 wxr.wtp.lang_code, "analyze_template" 

119 ) 

120 process_dump( 

121 wxr.wtp, 

122 dump_path, 

123 namespace_ids, 

124 override_folders, 

125 skip_extract_dump, 

126 save_pages_path, 

127 analyze_template_mod.analyze_template 

128 if analyze_template_mod is not None 

129 else None, 

130 ) 

131 

132 if not phase1_only:  # coverage: 132 ↛ exit (condition always true)

133 reprocess_wiktionary(wxr, num_processes, out_f, human_readable) 

134 
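For orientation, a hedged invocation sketch: `wxr` is assumed to be an already-configured WiktextractContext, `ns_ids` a set of namespace ids, and the dump file name is hypothetical (it just follows the pattern in the docstring):

with open("wiktextract-out.jsonl", "w", encoding="utf-8") as out_f:
    parse_wiktionary(
        wxr,
        "enwiktionary-20240601-pages-articles.xml.bz2",
        num_processes=4,      # None lets the worker pool pick a default
        phase1_only=False,    # also run the page-processing phase
        namespace_ids=ns_ids,
        out_f=out_f,
    )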

135 

136def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None: 

137 if out_f is not None:  # coverage: 137 ↛ exit (condition always true)

138 if human_readable:  # coverage: 138 ↛ 139 (condition never true)

139 out_f.write( 

140 json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False) 

141 ) 

142 else: 

143 out_f.write(json.dumps(data, ensure_ascii=False)) 

144 out_f.write("\n") 

145 
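The stream is JSON Lines: one single-line object per record, with non-ASCII kept verbatim via ensure_ascii=False. A quick self-contained check (io and json are already imported at the top of this file):

buf = io.StringIO()
write_json_data({"word": "häuslich", "pos": "adj"}, buf, human_readable=False)
assert buf.getvalue() == '{"word": "häuslich", "pos": "adj"}\n'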

146 

147def estimate_progress( 

148 processed_pages: int, all_pages: int, start_time: float, last_time: float 

149) -> float: 

150 current_time = time.time() 

151 processed_pages += 1 

152 if current_time - last_time > 1: 

153 remaining_pages = all_pages - processed_pages 

154 estimate_seconds = ( 

155 (current_time - start_time) / processed_pages * remaining_pages 

156 ) 

157 logger.info( 

158 " ... {}/{} pages ({:.1%}) processed, " 

159 "{:02d}:{:02d}:{:02d} remaining".format( 

160 processed_pages, 

161 all_pages, 

162 processed_pages / all_pages, 

163 int(estimate_seconds / 3600), 

164 int(estimate_seconds / 60 % 60), 

165 int(estimate_seconds % 60), 

166 ) 

167 ) 

168 last_time = current_time 

169 return last_time 

170 
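The remaining-time figure is plain linear extrapolation, elapsed / processed * remaining, logged at most once per second. Worked numbers, to make the units concrete:

# If 2,000 of 10,000 pages took 100 s, the other 8,000 are predicted to
# take 100 / 2000 * 8000 = 400 s, which renders as 00:06:40.
elapsed, processed, total = 100.0, 2_000, 10_000
eta = elapsed / processed * (total - processed)
assert eta == 400.0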

171 

172def check_error( 

173 wxr: WiktextractContext, 

174 dt: dict, 

175 word: str | None, 

176 lang: str | None, 

177 pos: str | None, 

178 msg: str, 

179 called_from: str | None = None, 

180) -> None: 

181 """Formats and outputs an error message about data format checks.""" 

182 if called_from is None:  # coverage: 182 ↛ 185 (condition always true)

183 called_from = "wiktionary/179/20240425" 

184 else: 

185 called_from = "wiktionary/179/20240425" + called_from 

186 # msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False) 

187 prefix = word or "" 

188 if lang:  # coverage: 188 ↛ 190 (condition always true)

189 prefix += "/" + lang 

190 if pos:  # coverage: 190 ↛ 192 (condition always true)

191 prefix += "/" + pos 

192 if prefix:  # coverage: 192 ↛ 194 (condition always true)

193 msg = prefix + ": " + msg 

194 print(msg) 

195 config = wxr.config 

196 if len(config.debugs) > 100000:  # Avoid excessive size; coverage: 196 ↛ 197 (condition never true)

197 return 

198 error_data: ErrorMessageData = { 

199 "msg": msg, 

200 "trace": "", 

201 "title": word, 

202 "section": lang, 

203 "subsection": pos, 

204 "called_from": called_from, 

205 "path": tuple(), 

206 } 

207 config.debugs.append(error_data) 

208 

209 

210def check_tags( 

211 wxr: WiktextractContext, 

212 dt: dict, 

213 word: str, 

214 lang: str, 

215 pos: str, 

216 item: dict, 

217) -> None: 

218 assert isinstance(item, dict) 

219 tags = item.get("tags") 

220 if tags is None: 

221 return 

222 if not isinstance(tags, (list, tuple)):  # coverage: 222 ↛ 223 (condition never true)

223 check_error( 

224 wxr, 

225 dt, 

226 word, 

227 lang, 

228 pos, 

229 '"tags" field value must be a list of strings: {}'.format( 

230 repr(tags) 

231 ), 

232 ) 

233 return 

234 for tag in tags: 

235 if not isinstance(tag, str):  # coverage: 235 ↛ 236 (condition never true)

236 check_error( 

237 wxr, 

238 dt, 

239 word, 

240 lang, 

241 pos, 

242 '"tags" field should only contain strings: {}'.format( 

243 repr(tag) 

244 ), 

245 ) 

246 continue 

247 # XXX enable the following later (currently too many bogus tags in 

248 # non-English editions). Tag values should be standardized across 

249 # editions, except for uppercase tags (e.g., regional variants). 

250 if wxr.wtp.lang_code in ("en",):  # Check edition; coverage: 250 ↛ 234 (condition always true)

251 from .tags import uppercase_tags, valid_tags 

252 

253 if tag not in valid_tags and tag not in uppercase_tags:  # coverage: 253 ↛ 254 (condition never true)

254 if len(tag) > 0 and tag[0].isupper(): 

255 check_error( 

256 wxr, 

257 dt, 

258 word, 

259 lang, 

260 pos, 

261 f"invalid uppercase tag {tag} not in or uppercase_tags", 

262 called_from="uppercase_tags", 

263 ) 

264 else: 

265 check_error( 

266 wxr, 

267 dt, 

268 word, 

269 lang, 

270 pos, 

271 f"invalid tag {tag} not in valid_tags " 

272 "or uppercase_tags", 

273 ) 

274 

275 

276def check_str_fields( 

277 wxr: WiktextractContext, 

278 dt: dict, 

279 word: str, 

280 lang: str, 

281 pos: str, 

282 item: dict, 

283 fields: list[str], 

284 mandatory: bool = False, 

285 empty_ok: bool = False, 

286) -> None: 

287 """Checks that each of the listed fields contains a non-empty string. 

288 Non-existent fields are ok unless ``mandatory`` is True.""" 

289 assert isinstance(item, dict) 

290 for field in fields: 

291 v = item.get(field) 

292 if field not in item: 

293 if mandatory: 

294 check_error( 

295 wxr, 

296 dt, 

297 word, 

298 lang, 

299 pos, 

300 "{!r} is missing and should be a{} string: {}".format( 

301 field, 

302 "" if empty_ok else " non-empty", 

303 json.dumps(item, sort_keys=True, ensure_ascii=False), 

304 ), 

305 ) 

306 continue 

307 if not isinstance(v, str):  # coverage: 307 ↛ 308 (condition never true)

308 check_error( 

309 wxr, 

310 dt, 

311 word, 

312 lang, 

313 pos, 

314 "{!r} should be a{} string: {}".format( 

315 field, 

316 "" if empty_ok else " non-empty", 

317 json.dumps(item, sort_keys=True, ensure_ascii=False), 

318 ), 

319 ) 

320 if not v and not empty_ok: 

321 check_error( 

322 wxr, 

323 dt, 

324 word, 

325 lang, 

326 pos, 

327 "{!r} should contain a non-empty string: {}".format( 

328 field, json.dumps(item, sort_keys=True, ensure_ascii=False) 

329 ), 

330 ) 

331 

332 

333def check_dict_list_fields( 

334 wxr: WiktextractContext, 

335 dt: dict, 

336 word: str, 

337 lang: str, 

338 pos: str, 

339 item: dict, 

340 fields: list[str], 

341) -> bool: 

342 """Checks that each listed field, if present, is a list of dicts.""" 

343 assert isinstance(item, dict) 

344 for field in fields: 

345 lst = item.get(field) 

346 if lst is None: 

347 continue 

348 if not isinstance(lst, (list, tuple)):  # coverage: 348 ↛ 349 (condition never true)

349 check_error( 

350 wxr, 

351 dt, 

352 word, 

353 lang, 

354 pos, 

355 "{!r} should be a list of dicts: {}".format( 

356 field, json.dumps(lst, sort_keys=True) 

357 ), 

358 ) 

359 return False 

360 for x in lst: 

361 if not isinstance(x, dict):  # coverage: 361 ↛ 362 (condition never true)

362 check_error( 

363 wxr, 

364 dt, 

365 word, 

366 lang, 

367 pos, 

368 "{!r} should be a list of dicts: {}".format( 

369 field, json.dumps(lst, sort_keys=True) 

370 ), 

371 ) 

372 return False 

373 return True 

374 

375 

376def check_str_list_fields( 

377 wxr: WiktextractContext, 

378 dt: dict, 

379 word: str, 

380 lang: str, 

381 pos: str, 

382 item: dict, 

383 fields: list[str], 

384) -> None: 

385 """Checks that each of the listed fields contains a list of non-empty 

386 strings or is not present.""" 

387 assert isinstance(item, dict) 

388 for field in fields: 

389 lst = item.get(field) 

390 if lst is None: 

391 continue 

392 if not isinstance(lst, (list, tuple)):  # coverage: 392 ↛ 393 (condition never true)

393 check_error( 

394 wxr, 

395 dt, 

396 word, 

397 lang, 

398 pos, 

399 "{!r} should be a list of dicts: {}".format( 

400 field, json.dumps(item, sort_keys=True) 

401 ), 

402 ) 

403 continue 

404 for x in lst: 

405 if not isinstance(x, str) or not x:  # coverage: 405 ↛ 406 (condition never true)

406 check_error( 

407 wxr, 

408 dt, 

409 word, 

410 lang, 

411 pos, 

412 "{!r} should be a list of non-empty strings: {}".format( 

413 field, json.dumps(item, sort_keys=True) 

414 ), 

415 ) 

416 break 

417 
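Together the three validators cover the common shape errors. A hedged demonstration on a deliberately malformed record, assuming `wxr` is a WiktextractContext (each complaint is printed and appended to wxr.config.debugs):

bad = {"word": "", "tags": ["X", 3], "senses": "not-a-list"}
check_str_fields(wxr, bad, "w", "English", "noun", bad, ["word"])          # empty string
check_str_list_fields(wxr, bad, "w", "English", "noun", bad, ["tags"])     # non-string member
check_dict_list_fields(wxr, bad, "w", "English", "noun", bad, ["senses"])  # str, not a list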

418 

419def check_json_data(wxr: WiktextractContext, dt: dict) -> None: 

420 """Performs some basic checks on the generated data.""" 

421 word = dt.get("word", dt.get("title")) 

422 if word is None:  # coverage: 422 ↛ 423 (condition never true)

423 check_error( 

424 wxr, 

425 dt, 

426 None, 

427 None, 

428 None, 

429 'missing "word" or "title" field in data', 

430 ) 

431 return 

432 if "title" in dt: 

433 return  # redirect pages don't have the fields checked below

434 lang = dt.get("lang") 

435 if not lang:  # coverage: 435 ↛ 436 (condition never true)

436 check_error(wxr, dt, word, None, None, 'missing "lang" field in data') 

437 return 

438 pos = dt.get("pos") 

439 if not pos:  # coverage: 439 ↛ 440 (condition never true)

440 check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data') 

441 return 

442 if not dt.get("lang_code"): 442 ↛ 443line 442 didn't jump to line 443 because the condition on line 442 was never true

443 check_error( 

444 wxr, dt, word, lang, pos, 'missing "lang_code" field in data' 

445 ) 

446 check_tags(wxr, dt, word, lang, pos, dt) 

447 check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"]) 

448 num = dt.get("etymology_number") 

449 if num is not None and not isinstance(num, int):  # coverage: 449 ↛ 450 (condition never true)

450 check_error( 

451 wxr, dt, word, lang, pos, '"etymology_number" must be an int' 

452 ) 

453 # Check that certain fields, if present, contain lists of dicts 

454 if not check_dict_list_fields(  # coverage: 454 ↛ 480 (condition never true)

455 wxr, 

456 dt, 

457 word, 

458 lang, 

459 pos, 

460 dt, 

461 [ 

462 "forms", 

463 "senses", 

464 "synonyms", 

465 "antonyms", 

466 "hypernyms", 

467 "holonyms", 

468 "meronyms", 

469 "coordinate_terms", 

470 "derived", 

471 "related", 

472 "sounds", 

473 "translations", 

474 "descendants", 

475 "etymology_templates", 

476 "head_templates", 

477 "inflection_templates", 

478 ], 

479 ): 

480 return # Avoid further processing because it would cause type errors 

481 # Check the "forms" field 

482 forms = dt.get("forms") or [] 

483 for form in forms: 

484 check_tags(wxr, dt, word, lang, pos, form) 

485 tags = dt.get("tags") 

486 if not isinstance(tags, (list, tuple)) or "table-tags" not in tags:  # coverage: 486 ↛ 483 (condition always true)

487 check_str_fields( 

488 wxr, dt, word, lang, pos, form, ["form"], mandatory=True 

489 ) 

490 check_str_list_fields( 

491 wxr, 

492 dt, 

493 word, 

494 lang, 

495 pos, 

496 dt, 

497 ["categories", "topics", "wikidata", "wikipedia"], 

498 ) 

499 # Check the "senses" field 

500 senses = dt.get("senses") or [] 

501 if not senses:  # coverage: 501 ↛ 502 (condition never true)

502 check_error( 

503 wxr, 

504 dt, 

505 word, 

506 lang, 

507 pos, 

508 'missing "senses" in data (must have at least one ' 

509 'sense, add empty sense with "no-gloss" tag if none ' 

510 "otherwise available)", 

511 ) 

512 return 

513 for sense in senses: 

514 check_str_list_fields( 

515 wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"] 

516 ) 

517 # Extra check: should have no-gloss tag if no glosses 

518 for field in ("glosses", "raw_glosses"): 

519 glosses = sense.get(field) or [] 

520 if (  # coverage: 520 ↛ 525 (condition never true)

521 not glosses

522 and isinstance(sense.get("tags", []), (list, tuple))

523 and "no-gloss" not in sense.get("tags", [])

524 ): 

525 check_error( 

526 wxr, 

527 dt, 

528 word, 

529 lang, 

530 pos, 

531 "{!r} should have at least one gloss or " 

532 '"no-gloss" in "tags"'.format(field), 

533 ) 

534 continue 

535 check_tags(wxr, dt, word, lang, pos, sense) 

536 check_str_list_fields( 

537 wxr, 

538 dt, 

539 word, 

540 lang, 

541 pos, 

542 sense, 

543 ["categories", "topics", "wikidata", "wikipedia"], 

544 ) 

545 check_str_fields(wxr, dt, word, lang, pos, sense, ["english"]) 

546 if not check_dict_list_fields(  # coverage: 546 ↛ 566 (condition never true)

547 wxr, 

548 dt, 

549 word, 

550 lang, 

551 pos, 

552 sense, 

553 [ 

554 "alt_of", 

555 "form_of", 

556 "synonyms", 

557 "antonyms", 

558 "hypernyms", 

559 "holonyms", 

560 "meronyms", 

561 "coordinate_terms", 

562 "derived", 

563 "related", 

564 ], 

565 ): 

566 continue 

567 for field in ("alt_of", "form_of"): 

568 lst = sense.get(field) 

569 if lst is None: 

570 continue 

571 for item in lst: 

572 check_str_fields( 

573 wxr, dt, word, lang, pos, item, ["word"], mandatory=True 

574 ) 

575 check_str_fields( 

576 wxr, dt, word, lang, pos, item, ["extra"], mandatory=False 

577 ) 

578 

579 for field in ( 

580 "synonyms", 

581 "antonyms", 

582 "hypernyms", 

583 "holonyms", 

584 "meronyms", 

585 "coordinate_terms", 

586 "derived", 

587 "related", 

588 ): 

589 lst = sense.get(field) 

590 if lst is None: 

591 continue 

592 for item in lst: 

593 check_str_fields( 

594 wxr, dt, word, lang, pos, item, ["word"], mandatory=True 

595 ) 

596 check_tags(wxr, dt, word, lang, pos, item) 

597 check_str_fields( 

598 wxr, 

599 dt, 

600 word, 

601 lang, 

602 pos, 

603 item, 

604 ["english", "roman", "sense", "taxonomic"], 

605 mandatory=False, 

606 empty_ok=True, 

607 ) 

608 check_str_list_fields( 

609 wxr, dt, word, lang, pos, item, ["topics"] 

610 ) 

611 # Check the "sounds" field 

612 # We will permit having any number of different types (ipa, enpr, etc) 

613 # in the same sound entry or in different sound entries. 

614 sounds = dt.get("sounds") or [] 

615 for item in sounds: 

616 check_str_fields( 

617 wxr, 

618 dt, 

619 word, 

620 lang, 

621 pos, 

622 item, 

623 ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"], 

624 ) 

625 check_tags(wxr, dt, word, lang, pos, item) 

626 check_str_list_fields( 

627 wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"] 

628 ) 

629 # Check the "translations" field 

630 translations = dt.get("translations") or [] 

631 for item in translations: 

632 check_str_fields( 

633 wxr, dt, word, lang, pos, item, ["word"], mandatory=True 

634 ) 

635 check_tags(wxr, dt, word, lang, pos, item) 

636 check_str_fields( 

637 wxr, 

638 dt, 

639 word, 

640 lang, 

641 pos, 

642 item, 

643 [ 

644 "alt", 

645 "code", 

646 "english", 

647 "lang", 

648 "note", 

649 "roman", 

650 "sense", 

651 "taxonomic", 

652 ], 

653 ) 

654 if not item.get("code") and not item.get("lang"): 654 ↛ 655line 654 didn't jump to line 655 because the condition on line 654 was never true

655 check_error( 

656 wxr, 

657 dt, 

658 word, 

659 lang, 

660 pos, 

661 '"translations" items must contain at least one ' 

662 'of "code" and "lang" (normally both): {}'.format( 

663 json.dumps(item, sort_keys=True, ensure_ascii=False) 

664 ), 

665 ) 

666 # Check the "etymology_templates", "head_templates", and 

667 # "inflection_templates" fields 

668 for field in [ 

669 "etymology_templates", 

670 "head_templates", 

671 "inflection_templates", 

672 ]: 

673 lst = dt.get(field) 

674 if lst is None: 

675 continue 

676 for item in lst: 

677 check_str_fields( 

678 wxr, dt, word, lang, pos, item, ["name"], mandatory=True 

679 ) 

680 check_str_fields( 

681 wxr, 

682 dt, 

683 word, 

684 lang, 

685 pos, 

686 item, 

687 ["expansion"], 

688 # empty_ok=True because there are some templates 

689 # that generate empty expansions. 

690 mandatory=False, 

691 empty_ok=True, 

692 ) 

693 args = item.get("args") 

694 if args is None:  # coverage: 694 ↛ 695 (condition never true)

695 continue 

696 if not isinstance(args, dict):  # coverage: 696 ↛ 697 (condition never true)

697 check_error( 

698 wxr, 

699 dt, 

700 word, 

701 lang, 

702 pos, 

703 '{!r} item "args" value must be a dict: {}'.format( 

704 field, json.dumps(args, sort_keys=True) 

705 ), 

706 ) 

707 continue 

708 for k, v in args.items(): 

709 if not isinstance(k, str) or not isinstance(v, str):  # coverage: 709 ↛ 710 (condition never true)

710 check_error( 

711 wxr, 

712 dt, 

713 word, 

714 lang, 

715 pos, 

716 '{!r} item "args" must be a dict with ' 

717 "string keys and values: {}".format( 

718 field, json.dumps(args, sort_keys=True) 

719 ), 

720 ) 

721 continue 

722 # Check the "descendants" field 

723 descendants = dt.get("descendants") or [] 

724 for item in descendants: 

725 check_str_fields(wxr, dt, word, lang, pos, item, ["text"]) 

726 depth = item.get("depth") 

727 if depth is not None and not isinstance(depth, int):  # coverage: 727 ↛ 728 (condition never true)

728 check_error( 

729 wxr, 

730 dt, 

731 word, 

732 lang, 

733 pos, 

734 '"descentants" field "depth" must be an int', 

735 ) 

736 check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"]) 

737 # XXX should check that they are valid templates, perhaps turn 

738 # template checking code above into a function 

739 
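Because check_json_data is self-contained per record, it can also be run after the fact over an existing extract. A minimal sketch, assuming `wxr` is a WiktextractContext and the file path is hypothetical (json is already imported above):

with open("wiktextract-out.jsonl", encoding="utf-8") as f:
    for line in f:
        check_json_data(wxr, json.loads(line))
# Problems accumulate in wxr.config.debugs (capped at 100,000 entries).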

740 

741def init_worker(wxr: WiktextractContext) -> None: 

742 global worker_wxr 

743 worker_wxr = wxr 

744 worker_wxr.reconnect_databases() 

745 atexit.register(worker_wxr.remove_unpicklable_objects) 

746 
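init_worker exists because the pool below uses the "spawn" start method: each worker starts from a fresh interpreter, receives the context via initargs, and stashes it in the module global that page_handler reads. The same pattern in isolation, with illustrative names (none of these are wiktextract APIs):

from concurrent.futures import ProcessPoolExecutor
from multiprocessing import get_context

def _init(state: dict) -> None:
    global _worker_state
    _worker_state = state  # one copy per worker process

def _task(x: int) -> int:
    return x * _worker_state["factor"]

if __name__ == "__main__":
    with ProcessPoolExecutor(
        max_workers=2,
        mp_context=get_context("spawn"),
        initializer=_init,
        initargs=({"factor": 10},),
    ) as ex:
        print(list(ex.map(_task, range(4))))  # [0, 10, 20, 30]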

747 

748def reprocess_wiktionary( 

749 wxr: WiktextractContext, 

750 num_processes: int | None, 

751 out_f: TextIO, 

752 human_readable: bool = False, 

753 search_pattern: str | None = None, 

754) -> None: 

755 """Reprocesses the Wiktionary from the sqlite db.""" 

756 logger.info("Second phase - processing pages") 

757 

758 # Extract thesaurus data. This iterates over thesaurus pages, 

759 # but is very fast. 

760 if (  # coverage: 760 ↛ 766 (condition always true)

761 wxr.config.extract_thesaurus_pages 

762 and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0 # type: ignore[arg-type] 

763 ): 

764 extract_thesaurus_data(wxr, num_processes) 

765 

766 emitted = set() 

767 process_ns_ids: list[int] = list( 

768 { 

769 wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0) # type: ignore[call-overload] 

770 for ns in wxr.config.extract_ns_names 

771 } 

772 ) 

773 start_time = time.time() 

774 last_time = start_time 

775 all_page_nums = wxr.wtp.saved_page_nums( 

776 process_ns_ids, True, "wikitext", search_pattern 

777 ) 

778 wxr.remove_unpicklable_objects() 

779 with ProcessPoolExecutor( 

780 max_workers=num_processes, 

781 mp_context=get_context("spawn"), 

782 initializer=init_worker, 

783 initargs=(deepcopy(wxr),), 

784 ) as executor: 

785 wxr.reconnect_databases() 

786 for processed_pages, (page_data, wtp_stats) in enumerate( 

787 executor.map( 

788 page_handler, 

789 wxr.wtp.get_all_pages( 

790 process_ns_ids, True, "wikitext", search_pattern 

791 ), 

792 chunksize=100,  # the default of 1 is too slow

793 ) 

794 ): 

795 wxr.config.merge_return(wtp_stats) 

796 for dt in page_data: 

797 check_json_data(wxr, dt) 

798 write_json_data(dt, out_f, human_readable) 

799 word = dt.get("word") 

800 lang_code = dt.get("lang_code") 

801 pos = dt.get("pos") 

802 if word and lang_code and pos: 

803 emitted.add((word, lang_code, pos)) 

804 last_time = estimate_progress( 

805 processed_pages, all_page_nums, start_time, last_time 

806 ) 

807 

808 if wxr.config.dump_file_lang_code == "en":  # coverage: 808 ↛ 810 (condition always true)

809 emit_words_in_thesaurus(wxr, emitted, out_f, human_readable) 

810 logger.info("Reprocessing wiktionary complete") 

811 

812 

813def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]: 

814 text: str = page.body if page.body is not None else page.redirect_to # type: ignore[assignment] 

815 title = page.title[page.title.find(":") + 1 :] 

816 title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)

817 title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title) 

818 title = title.replace("//", "__slashslash__") 

819 title = re.sub(r"^/", r"__slash__", title) 

820 title = re.sub(r"/$", r"__slash__", title) 

821 title = ns_name + "/" + title 

822 return title, text 

823 
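The substitutions exist to make arbitrary page titles safe as tar member paths ("." and ".." components, doubled slashes, and edge slashes are all neutralized). A standalone trace that reuses the same regexes, so it should mirror the function on a few hypothetical titles:

def _escape(title: str) -> str:  # the body above, minus the namespace prefix
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    return re.sub(r"/$", r"__slash__", title)

assert _escape("foo/..") == "foo/__dotdot__"
assert _escape("foo//bar") == "foo__slashslash__bar"
assert _escape("/foo") == "__slash__foo"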

824 

825def extract_namespace( 

826 wxr: WiktextractContext, namespace: str, path: str 

827) -> None: 

828 """Extracts all pages in the given namespace and writes them to a .tar 

829 file with the given path.""" 

830 logger.info( 

831 f"Extracting pages from namespace {namespace} to tar file {path}" 

832 ) 

833 ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id") # type: ignore[assignment, call-overload] 

834 t = time.time() 

835 with tarfile.open(path, "w") as tarf: 

836 for page in wxr.wtp.get_all_pages([ns_id]): 

837 title, text = process_ns_page_title(page, namespace) 

838 text = text.encode("utf-8") 

839 f = io.BytesIO(text) 

840 title += ".txt" 

841 ti = tarfile.TarInfo(name=title) 

842 ti.size = len(text) 

843 # According to documentation, TarInfo.mtime can be int, float, 

844 # or even None in newer versions, but mypy can't tell because 

845 # it's not annotated and assumes it can only be int 

846 ti.mtime = t # type: ignore[assignment] 

847 ti.uid = 0 

848 ti.gid = 0 

849 ti.type = tarfile.REGTYPE 

850 tarf.addfile(ti, f)
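The in-memory member pattern generalizes beyond this extractor: build a TarInfo by hand, point it at a BytesIO, and pass both to addfile(). A minimal sketch with a hypothetical archive name (tarfile, io, and time are imported at the top of this file):

with tarfile.open("demo.tar", "w") as tarf:
    payload = "page text".encode("utf-8")
    ti = tarfile.TarInfo(name="Module/demo.txt")
    ti.size = len(payload)  # size must be set before addfile() reads the stream
    ti.mtime = int(time.time())
    tarf.addfile(ti, io.BytesIO(payload))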