Coverage for src/wiktextract/wiktionary.py: 72%

295 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

# Wiktionary parser for extracting a lexicon and various other information
# from wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org

import io
import json
import os
import re
import tarfile
import tempfile
import time
import traceback
from multiprocessing import Pool, current_process
from pathlib import Path
from typing import TextIO

from wikitextprocessor import Page
from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
from wikitextprocessor.dumpparser import process_dump

from .import_utils import import_extractor_module
from .page import parse_page
from .thesaurus import (
    emit_words_in_thesaurus,
    extract_thesaurus_data,
    thesaurus_linkage_number,
)
from .wxr_context import WiktextractContext
from .wxr_logging import logger

def page_handler(
    page: Page,
) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
    # Make sure there are no newlines or other strange characters in the
    # title. They could cause security problems at several post-processing
    # steps.
    # The wxr attribute was attached to page_handler in init_worker_process.
    # Accessing it should never raise, and if it does, we want the exception.
    wxr: WiktextractContext = page_handler.wxr  # type:ignore[attr-defined]
    # Helps debug extraction hangs. This writes the title of each page being
    # processed into /tmp/wiktextract*/wiktextract-*. Once a hang has been
    # observed, these files contain the page(s) that hang. They should be
    # checked before aborting the process, as an interrupt might delete them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
        debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(page.title + "\n")

        wxr.wtp.start_page(page.title)
        try:
            title = re.sub(r"[\s\000-\037]+", " ", page.title)
            title = title.strip()
            if page.redirect_to is not None:
                page_data = [
                    {
                        "title": title,
                        "redirect": page.redirect_to,
                        "pos": "hard-redirect",
                    }
                ]
            else:
                # XXX Sign gloss pages?
                start_t = time.time()
                page_data = parse_page(wxr, title, page.body)  # type: ignore[arg-type]
                dur = time.time() - start_t
                if dur > 100:
                    logger.warning(
                        "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
                            dur, title
                        )
                    )

            return page_data, wxr.wtp.to_return()
        except Exception:
            wxr.wtp.error(
                f'=== EXCEPTION while parsing page "{page.title}" '
                f"in process {current_process().name}",
                traceback.format_exc(),
                "page_handler_exception",
            )
            return [], wxr.wtp.to_return()
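# Example of what page_handler returns (a sketch; actual fields depend on
# the page): a redirect page yields a single entry such as
#     [{"title": "humour", "redirect": "humor", "pos": "hard-redirect"}]
# together with the collated error/statistics data from wxr.wtp.to_return().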

def parse_wiktionary(
    wxr: WiktextractContext,
    dump_path: str,
    num_processes: int | None,
    phase1_only: bool,
    namespace_ids: set[int],
    out_f: TextIO,
    human_readable: bool = False,
    override_folders: list[str] | list[Path] | None = None,
    skip_extract_dump: bool = False,
    save_pages_path: str | Path | None = None,
) -> None:
    """Parses Wiktionary from the dump file ``dump_path`` (which should
    point to an "enwiktionary-<date>-pages-articles.xml.bz2" file).
    Extracted entries are written as JSON to ``out_f``."""
    capture_language_codes = wxr.config.capture_language_codes
    if capture_language_codes is not None:
        assert isinstance(capture_language_codes, (list, tuple, set))
        for x in capture_language_codes:
            assert isinstance(x, str)

    logger.info("First phase - extracting templates, macros, and pages")
    if override_folders is not None:
        override_folders = [Path(folder) for folder in override_folders]
    if save_pages_path is not None:
        save_pages_path = Path(save_pages_path)

    analyze_template_mod = import_extractor_module(
        wxr.wtp.lang_code, "analyze_template"
    )
    process_dump(
        wxr.wtp,
        dump_path,
        namespace_ids,
        override_folders,
        skip_extract_dump,
        save_pages_path,
        analyze_template_mod.analyze_template
        if analyze_template_mod is not None
        else None,
    )

    if not phase1_only:
        reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
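# A minimal usage sketch (hypothetical setup; the real command-line entry
# point constructs the WiktextractContext from parsed options):
#
#     wxr = WiktextractContext(wtp, config)
#     with open("wiktextract-data.jsonl", "w", encoding="utf-8") as out_f:
#         parse_wiktionary(
#             wxr,
#             "enwiktionary-20241001-pages-articles.xml.bz2",
#             num_processes=4,
#             phase1_only=False,
#             namespace_ids={0, 10},  # hypothetical namespace selection
#             out_f=out_f,
#         )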

def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
    if out_f is not None:
        if human_readable:
            out_f.write(
                json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
            )
        else:
            out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")
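# For example (a sketch), data={"word": "sana"} is written as the single line
#     {"word": "sana"}
# when human_readable is False, and pretty-printed with indent=2 and sorted
# keys when it is True; both modes end with a newline.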

def estimate_progress(
    processed_pages: int, all_pages: int, start_time: float, last_time: float
) -> float:
    current_time = time.time()
    processed_pages += 1
    if current_time - last_time > 1:
        remaining_pages = all_pages - processed_pages
        estimate_seconds = (
            (current_time - start_time) / processed_pages * remaining_pages
        )
        logger.info(
            " ... {}/{} pages ({:.1%}) processed, "
            "{:02d}:{:02d}:{:02d} remaining".format(
                processed_pages,
                all_pages,
                processed_pages / all_pages,
                int(estimate_seconds / 3600),
                int(estimate_seconds / 60 % 60),
                int(estimate_seconds % 60),
            )
        )
        last_time = current_time
    return last_time
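# Worked example: 2000 of 10000 pages done 100 s after start gives
# estimate_seconds = 100 / 2000 * 8000 = 400, logged as
# " ... 2000/10000 pages (20.0%) processed, 00:06:40 remaining".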

def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
    wxr.reconnect_databases()
    worker_func.wxr = wxr
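# This is the multiprocessing.Pool initializer pattern: it runs once in each
# worker process, reconnecting the per-process database handles and attaching
# wxr to the handler function, as used in reprocess_wiktionary below:
#     Pool(num_processes, init_worker_process, (page_handler, wxr))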

def check_error(
    wxr: WiktextractContext,
    dt: dict,
    word: str | None,
    lang: str | None,
    pos: str | None,
    msg: str,
) -> None:
    """Formats and outputs an error message about data format checks."""
    msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
    prefix = word or ""
    if lang:
        prefix += "/" + lang
    if pos:
        prefix += "/" + pos
    if prefix:
        msg = prefix + ": " + msg
    print(msg)
    config = wxr.config
    if len(config.debugs) > 100000:  # Avoid excessive size
        return
    error_data: ErrorMessageData = {
        "msg": msg,
        "trace": "",
        "title": word,
        "section": lang,
        "subsection": pos,
        "called_from": "wiktionary/179/20240425",
        "path": tuple(),
    }
    config.debugs.append(error_data)
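# Example of the printed format (a sketch): check_error(wxr, {"x": 1},
# "sana", "Finnish", "noun", "bad field") prints
#     sana/Finnish/noun: bad field: {"x": 1}
# and appends a corresponding ErrorMessageData entry to config.debugs.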

def check_tags(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
) -> None:
    assert isinstance(item, dict)
    tags = item.get("tags")
    if tags is None:
        return
    if not isinstance(tags, (list, tuple)):
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            '"tags" field value must be a list of strings: {}'.format(
                repr(tags)
            ),
        )
        return
    for tag in tags:
        if not isinstance(tag, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"tags" field should only contain strings: {}'.format(
                    repr(tag)
                ),
            )
            continue
        # XXX enable the following later (currently too many bogus tags in
        # non-English editions). Tag values should be standardized across
        # editions, except for uppercase tags (e.g., regional variants).
        if wxr.wtp.lang_code in ("en",):  # Check edition
            from .tags import valid_tags

            if tag not in valid_tags:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "invalid tag {} not in valid_tags (or "
                    "uppercase_tags)".format(repr(tag)),
                )
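# For example (a sketch), an item {"tags": ["rare", 3]} is reported because
# 3 is not a string, and on the English edition a string tag not present in
# valid_tags is reported as invalid.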

def check_str_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
    mandatory: bool = False,
    empty_ok: bool = False,
) -> None:
    """Checks that each of the listed fields contains a non-empty string.
    Non-existent fields are ok unless ``mandatory`` is True."""
    assert isinstance(item, dict)
    for field in fields:
        v = item.get(field)
        if v is None:
            if mandatory:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a{} string (it is a "
                    "mandatory field): {}".format(
                        field,
                        "" if empty_ok else " non-empty",
                        json.dumps(item, sort_keys=True),
                    ),
                )
            continue
        if not isinstance(v, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a{} string: {}".format(
                    field,
                    "" if empty_ok else " non-empty",
                    json.dumps(item, sort_keys=True),
                ),
            )
        if not v and not empty_ok:
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should contain a non-empty string: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
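# For example (a sketch), check_str_fields(..., {"form": ""}, ["form"],
# mandatory=True) reports that "form" should contain a non-empty string,
# while an absent non-mandatory field passes silently.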

def check_dict_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> bool:
    """Checks that each listed field, if present, is a list of dicts."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of dicts: {}".format(
                    field, json.dumps(lst, sort_keys=True)
                ),
            )
            return False
        for x in lst:
            if not isinstance(x, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of dicts: {}".format(
                        field, json.dumps(lst, sort_keys=True)
                    ),
                )
                return False
    return True

def check_str_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> None:
    """Checks that each of the listed fields contains a list of non-empty
    strings or is not present."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of strings: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
            continue
        for x in lst:
            if not isinstance(x, str) or not x:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of non-empty strings: {}".format(
                        field, json.dumps(item, sort_keys=True)
                    ),
                )
                break

def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
    """Performs some basic checks on the generated data."""
    word = dt.get("word", dt.get("title"))
    if word is None:
        check_error(
            wxr,
            dt,
            None,
            None,
            None,
            'missing "word" or "title" field in data',
        )
        return
    if "title" in dt:
        return  # redirect pages don't have following fields
    lang = dt.get("lang")
    if not lang:
        check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
        return
    pos = dt.get("pos")
    if not pos:
        check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
        return
    if not dt.get("lang_code"):
        check_error(
            wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
        )
    check_tags(wxr, dt, word, lang, pos, dt)
    check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
    num = dt.get("etymology_number")
    if num is not None and not isinstance(num, int):
        check_error(
            wxr, dt, word, lang, pos, '"etymology_number" must be an int'
        )
    # Check that certain fields, if present, contain lists of dicts
    if not check_dict_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        [
            "forms",
            "senses",
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
            "sounds",
            "translations",
            "descendants",
            "etymology_templates",
            "head_templates",
            "inflection_templates",
        ],
    ):
        return  # Avoid further processing because it would cause type errors
    # Check the "forms" field
    forms = dt.get("forms") or []
    for form in forms:
        check_tags(wxr, dt, word, lang, pos, form)
        tags = dt.get("tags")
        if not isinstance(tags, (list, tuple)) or "table-tags" not in tags:
            check_str_fields(
                wxr, dt, word, lang, pos, form, ["form"], mandatory=True
            )
    check_str_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        ["categories", "topics", "wikidata", "wikipedia"],
    )
    # Check the "senses" field
    senses = dt.get("senses") or []
    if not senses:
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            'missing "senses" in data (must have at least one '
            'sense, add empty sense with "no-gloss" tag if none '
            "otherwise available)",
        )
        return
    for sense in senses:
        check_str_list_fields(
            wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
        )
        # Extra check: should have no-gloss tag if no glosses
        for field in ("glosses", "raw_glosses"):
            glosses = sense.get(field) or []
            if (
                not glosses
                and isinstance(sense.get("tags"), str)
                and "no-gloss" not in sense.get("tags", "").split()
            ):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should have at least one gloss or "
                    '"no-gloss" in "tags"'.format(field),
                )
                continue
        check_tags(wxr, dt, word, lang, pos, sense)
        check_str_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            ["categories", "topics", "wikidata", "wikipedia"],
        )
        check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
        if not check_dict_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            [
                "alt_of",
                "form_of",
                "synonyms",
                "antonyms",
                "hypernyms",
                "holonyms",
                "meronyms",
                "coordinate_terms",
                "derived",
                "related",
            ],
        ):
            continue
        for field in ("alt_of", "form_of"):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
                )

        for field in (
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
        ):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_tags(wxr, dt, word, lang, pos, item)
                check_str_fields(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    item,
                    ["english", "roman", "sense", "taxonomic"],
                    mandatory=False,
                    empty_ok=True,
                )
                check_str_list_fields(
                    wxr, dt, word, lang, pos, item, ["topics"]
                )
    # Check the "sounds" field
    # We will permit having any number of different types (ipa, enpr, etc)
    # in the same sound entry or in different sound entries.
    sounds = dt.get("sounds") or []
    for item in sounds:
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_list_fields(
            wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
        )
    # Check the "translations" field
    translations = dt.get("translations") or []
    for item in translations:
        check_str_fields(
            wxr, dt, word, lang, pos, item, ["word"], mandatory=True
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            [
                "alt",
                "code",
                "english",
                "lang",
                "note",
                "roman",
                "sense",
                "taxonomic",
            ],
        )
        if not item.get("code") and not item.get("lang"):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"translations" items must contain at least one '
                'of "code" and "lang" (normally both): {}'.format(
                    json.dumps(item, sort_keys=True, ensure_ascii=False)
                ),
            )
    # Check the "etymology_templates", "head_templates", and
    # "inflection_templates" fields
    for field in [
        "etymology_templates",
        "head_templates",
        "inflection_templates",
    ]:
        lst = dt.get(field)
        if lst is None:
            continue
        for item in lst:
            check_str_fields(
                wxr, dt, word, lang, pos, item, ["name"], mandatory=True
            )
            check_str_fields(
                wxr,
                dt,
                word,
                lang,
                pos,
                item,
                ["expansion"],
                # empty_ok=True because there are some templates
                # that generate empty expansions.
                mandatory=False,
                empty_ok=True,
            )
            args = item.get("args")
            if args is None:
                continue
            if not isinstance(args, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    '{!r} item "args" value must be a dict: {}'.format(
                        field, json.dumps(args, sort_keys=True)
                    ),
                )
                continue
            for k, v in args.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        '{!r} item "args" must be a dict with '
                        "string keys and values: {}".format(
                            field, json.dumps(args, sort_keys=True)
                        ),
                    )
                    continue
    # Check the "descendants" field
    descendants = dt.get("descendants") or []
    for item in descendants:
        check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
        depth = item.get("depth")
        if depth is not None and not isinstance(depth, int):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"descendants" field "depth" must be an int',
            )
        check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
        # XXX should check that they are valid templates, perhaps turn
        # template checking code above into a function
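# A minimal entry that passes these checks (a sketch, not a full schema):
#     {"word": "sana", "lang": "Finnish", "lang_code": "fi", "pos": "noun",
#      "senses": [{"glosses": ["word"]}]}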

def reprocess_wiktionary(
    wxr: WiktextractContext,
    num_processes: int | None,
    out_f: TextIO,
    human_readable: bool = False,
    search_pattern: str | None = None,
) -> None:
    """Reprocesses the Wiktionary from the sqlite db."""
    logger.info("Second phase - processing pages")

    # Extract thesaurus data. This iterates over thesaurus pages,
    # but is very fast.
    if (
        wxr.config.extract_thesaurus_pages
        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0  # type: ignore[arg-type]
    ):
        extract_thesaurus_data(wxr, num_processes)

    emitted = set()
    process_ns_ids: list[int] = list(
        {
            wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)  # type: ignore[call-overload]
            for ns in wxr.config.extract_ns_names
        }
    )
    start_time = time.time()
    last_time = start_time
    all_page_nums = wxr.wtp.saved_page_nums(
        process_ns_ids, True, "wikitext", search_pattern
    )
    wxr.remove_unpicklable_objects()
    with Pool(num_processes, init_worker_process, (page_handler, wxr)) as pool:
        wxr.reconnect_databases(False)
        for processed_pages, (page_data, wtp_stats) in enumerate(
            pool.imap_unordered(
                page_handler,
                wxr.wtp.get_all_pages(
                    process_ns_ids, True, "wikitext", search_pattern
                ),
            )
        ):
            wxr.config.merge_return(wtp_stats)
            for dt in page_data:
                check_json_data(wxr, dt)
                write_json_data(dt, out_f, human_readable)
                word = dt.get("word")
                lang_code = dt.get("lang_code")
                pos = dt.get("pos")
                if word and lang_code and pos:
                    emitted.add((word, lang_code, pos))
            last_time = estimate_progress(
                processed_pages, all_page_nums, start_time, last_time
            )
    if wxr.config.dump_file_lang_code == "en":
        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
    logger.info("Reprocessing wiktionary complete")

def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
    text: str = page.body if page.body is not None else page.redirect_to  # type: ignore[assignment]
    title = page.title[page.title.find(":") + 1 :]
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    title = re.sub(r"/$", r"__slash__", title)
    title = ns_name + "/" + title
    return title, text
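# Worked example: a page titled "Module:foo/bar//baz" with ns_name "Module"
# becomes "Module/foo/bar__slashslash__baz" (extract_namespace then appends
# ".txt" for the tar member name).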

def extract_namespace(
    wxr: WiktextractContext, namespace: str, path: str
) -> None:
    """Extracts all pages in the given namespace and writes them to a .tar
    file with the given path."""
    logger.info(
        f"Extracting pages from namespace {namespace} to tar file {path}"
    )
    ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id")  # type: ignore[assignment, call-overload]
    t = time.time()
    with tarfile.open(path, "w") as tarf:
        for page in wxr.wtp.get_all_pages([ns_id]):
            title, text = process_ns_page_title(page, namespace)
            text = text.encode("utf-8")
            f = io.BytesIO(text)
            title += ".txt"
            ti = tarfile.TarInfo(name=title)
            ti.size = len(text)
            # According to documentation, TarInfo.mtime can be int, float,
            # or even None in newer versions, but mypy can't tell because
            # it's not annotated and assumes it can only be int
            ti.mtime = t  # type: ignore[assignment]
            ti.uid = 0
            ti.gid = 0
            ti.type = tarfile.REGTYPE
            tarf.addfile(ti, f)
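
# A usage sketch (hypothetical output path; valid namespace names come from
# wxr.wtp.NAMESPACE_DATA):
#     extract_namespace(wxr, "Module", "wiktionary-modules.tar")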