Coverage for src/wiktextract/wiktionary.py: 75%
302 statements

# Wiktionary parser for extracting a lexicon and various other information
# from Wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org

import atexit
import io
import json
import os
import re
import tarfile
import tempfile
import time
from concurrent.futures import ProcessPoolExecutor
from copy import deepcopy
from multiprocessing import current_process, get_context
from pathlib import Path
from traceback import format_exc
from typing import TextIO

from wikitextprocessor import Page
from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
from wikitextprocessor.dumpparser import process_dump

from .import_utils import import_extractor_module
from .page import parse_page
from .thesaurus import (
    emit_words_in_thesaurus,
    extract_thesaurus_data,
    thesaurus_linkage_number,
)
from .wxr_context import WiktextractContext
from .wxr_logging import logger


def page_handler(
    page: Page,
) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
    # Make sure there are no newlines or other strange characters in the
    # title. They could cause security problems at several post-processing
    # steps.
    # worker_wxr is a global WiktextractContext set up for this process by
    # init_worker(). Accessing it should never cause an exception, and if
    # it does, we want it to propagate.

    # Helps debug extraction hangs. This writes the title of each page being
    # processed into /tmp/wiktextract*/wiktextract-<pid>. Once a hang has
    # been observed, these files contain the page(s) that hang. They should
    # be checked before aborting the process, as an interrupt might delete
    # them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
        debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(page.title + "\n")

        worker_wxr.wtp.start_page(page.title)
        try:
            title = re.sub(r"[\s\000-\037]+", " ", page.title)
            title = title.strip()
            if page.redirect_to is not None:
                page_data = [
                    {
                        "title": title,
                        "redirect": page.redirect_to,
                        "pos": "hard-redirect",
                    }
                ]
            else:
                # XXX Sign gloss pages?
                start_t = time.time()
                page_data = parse_page(worker_wxr, title, page.body)  # type: ignore[arg-type]
                dur = time.time() - start_t
                if dur > 100:
                    logger.warning(
                        "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
                            dur, title
                        )
                    )

            return page_data, worker_wxr.wtp.to_return()
        except Exception:
            worker_wxr.wtp.error(
                f'=== EXCEPTION while parsing page "{page.title}" '
                f"in process {current_process().name}",
                format_exc(),
                "page_handler_exception",
            )
            return [], worker_wxr.wtp.to_return()


def parse_wiktionary(
    wxr: WiktextractContext,
    dump_path: str,
    num_processes: int | None,
    phase1_only: bool,
    namespace_ids: set[int],
    out_f: TextIO,
    human_readable: bool = False,
    override_folders: list[str] | list[Path] | None = None,
    skip_extract_dump: bool = False,
    save_pages_path: str | Path | None = None,
) -> None:
102 """Parses Wiktionary from the dump file ``path`` (which should point
103 to a "enwiktionary-<date>-pages-articles.xml.bz2" file. This
104 calls `word_cb(data)` for all words defined for languages in `languages`."""
    capture_language_codes = wxr.config.capture_language_codes
    if capture_language_codes is not None:
        assert isinstance(capture_language_codes, (list, tuple, set))
        for x in capture_language_codes:
            assert isinstance(x, str)

    logger.info("First phase - extracting templates, macros, and pages")
    if override_folders is not None:
        override_folders = [Path(folder) for folder in override_folders]
    if save_pages_path is not None:
        save_pages_path = Path(save_pages_path)

    analyze_template_mod = import_extractor_module(
        wxr.wtp.lang_code, "analyze_template"
    )
    process_dump(
        wxr.wtp,
        dump_path,
        namespace_ids,
        override_folders,
        skip_extract_dump,
        save_pages_path,
        analyze_template_mod.analyze_template
        if analyze_template_mod is not None
        else None,
    )

    if not phase1_only:
        reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
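
# A minimal invocation sketch (hypothetical paths and namespace ids; how the
# WiktextractContext is constructed depends on the caller, see wxr_context.py):
#
#   wxr = WiktextractContext(wtp, config)
#   with open("wiktextract.jsonl", "w", encoding="utf-8") as out_f:
#       parse_wiktionary(
#           wxr,
#           "enwiktionary-20240101-pages-articles.xml.bz2",
#           num_processes=None,  # use the default number of workers
#           phase1_only=False,
#           namespace_ids={0, 10, 828},  # e.g. Main, Template, Module
#           out_f=out_f,
#       )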


def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
    if out_f is not None:
        if human_readable:
            out_f.write(
                json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
            )
        else:
            out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")


def estimate_progress(
    processed_pages: int, all_pages: int, start_time: float, last_time: float
) -> float:
    current_time = time.time()
    processed_pages += 1
    if current_time - last_time > 1:
        remaining_pages = all_pages - processed_pages
        estimate_seconds = (
            (current_time - start_time) / processed_pages * remaining_pages
        )
        logger.info(
            "  ... {}/{} pages ({:.1%}) processed, "
            "{:02d}:{:02d}:{:02d} remaining".format(
                processed_pages,
                all_pages,
                processed_pages / all_pages,
                int(estimate_seconds / 3600),
                int(estimate_seconds / 60 % 60),
                int(estimate_seconds % 60),
            )
        )
        last_time = current_time
    return last_time
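
# Worked example of the estimate above (illustrative numbers): if 10,000 of
# 1,000,000 pages have been processed after 120 s, the remaining time is
# 120 / 10,000 * 990,000 = 11,880 s, logged as 03:18:00.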


def check_error(
    wxr: WiktextractContext,
    dt: dict,
    word: str | None,
    lang: str | None,
    pos: str | None,
    msg: str,
    called_from: str | None = None,
) -> None:
    """Formats and outputs an error message about data format checks."""
    if called_from is None:
        called_from = "wiktionary/179/20240425"
    else:
        called_from = "wiktionary/179/20240425" + called_from
    # msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
    prefix = word or ""
    if lang:
        prefix += "/" + lang
    if pos:
        prefix += "/" + pos
    if prefix:
        msg = prefix + ": " + msg
    print(msg)
    config = wxr.config
    if len(config.debugs) > 100000:  # Avoid excessive size
        return
    error_data: ErrorMessageData = {
        "msg": msg,
        "trace": "",
        "title": word,
        "section": lang,
        "subsection": pos,
        "called_from": called_from,
        "path": tuple(),
    }
    config.debugs.append(error_data)
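
# For example (illustrative call), check_error(wxr, dt, "dog", "English",
# "noun", "missing gloss") prints "dog/English/noun: missing gloss" and
# appends a matching ErrorMessageData record to wxr.config.debugs.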


def check_tags(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
) -> None:
    assert isinstance(item, dict)
    tags = item.get("tags")
    if tags is None:
        return
    if not isinstance(tags, (list, tuple)):
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            '"tags" field value must be a list of strings: {}'.format(
                repr(tags)
            ),
        )
        return
    for tag in tags:
        if not isinstance(tag, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"tags" field should only contain strings: {}'.format(
                    repr(tag)
                ),
            )
            continue
        # XXX enable the following later (currently too many bogus tags in
        # non-English editions). Tag values should be standardized across
        # editions, except for uppercase tags (e.g., regional variants).
        if wxr.wtp.lang_code in ("en",):  # Check edition
            from .tags import uppercase_tags, valid_tags

            if tag not in valid_tags and tag not in uppercase_tags:
                if len(tag) > 0 and tag[0].isupper():
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid uppercase tag {tag} not in "
                        "valid_tags or uppercase_tags",
                        called_from="uppercase_tags",
                    )
                else:
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid tag {tag} not in valid_tags "
                        "or uppercase_tags",
                    )


def check_str_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
    mandatory: bool = False,
    empty_ok: bool = False,
) -> None:
    """Checks that each of the listed fields contains a non-empty string.
    Non-existent fields are ok unless ``mandatory`` is True."""
    assert isinstance(item, dict)
    for field in fields:
        v = item.get(field)
        if field not in item:
            if mandatory:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} is missing and should be a{} string: {}".format(
                        field,
                        "" if empty_ok else " non-empty",
                        json.dumps(item, sort_keys=True, ensure_ascii=False),
                    ),
                )
            continue
        if not isinstance(v, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a{} string: {}".format(
                    field,
                    "" if empty_ok else " non-empty",
                    json.dumps(item, sort_keys=True, ensure_ascii=False),
                ),
            )
            continue  # Avoid also reporting a non-string as an empty string
        if not v and not empty_ok:
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should contain a non-empty string: {}".format(
                    field, json.dumps(item, sort_keys=True, ensure_ascii=False)
                ),
            )
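
# For example (illustrative): with item = {"word": ""} and fields=["word"],
# the empty value is reported unless empty_ok=True; a missing "word" key is
# reported only when mandatory=True.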


def check_dict_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> bool:
    """Checks that each listed field, if present, is a list of dicts."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of dicts: {}".format(
                    field, json.dumps(lst, sort_keys=True)
                ),
            )
            return False
        for x in lst:
            if not isinstance(x, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of dicts: {}".format(
                        field, json.dumps(lst, sort_keys=True)
                    ),
                )
                return False
    return True


def check_str_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> None:
    """Checks that each of the listed fields contains a list of non-empty
    strings or is not present."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of non-empty strings: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
            continue
        for x in lst:
            if not isinstance(x, str) or not x:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of non-empty strings: {}".format(
                        field, json.dumps(item, sort_keys=True)
                    ),
                )
                break


def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
    """Performs some basic checks on the generated data."""
    word = dt.get("word", dt.get("title"))
    if word is None:
        check_error(
            wxr,
            dt,
            None,
            None,
            None,
            'missing "word" or "title" field in data',
        )
        return
    if "title" in dt:
        return  # redirect pages don't have the following fields
    lang = dt.get("lang")
    if not lang:
        check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
        return
    pos = dt.get("pos")
    if not pos:
        check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
        return
    if not dt.get("lang_code"):
        check_error(
            wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
        )
    check_tags(wxr, dt, word, lang, pos, dt)
    check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
    num = dt.get("etymology_number")
    if num is not None and not isinstance(num, int):
        check_error(
            wxr, dt, word, lang, pos, '"etymology_number" must be an int'
        )
    # Check that certain fields, if present, contain lists of dicts
    if not check_dict_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        [
            "forms",
            "senses",
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
            "sounds",
            "translations",
            "descendants",
            "etymology_templates",
            "head_templates",
            "inflection_templates",
        ],
    ):
        return  # Avoid further processing because it would cause type errors
481 # Check the "forms" field
482 forms = dt.get("forms") or []
483 for form in forms:
484 check_tags(wxr, dt, word, lang, pos, form)
485 tags = dt.get("tags")
486 if not isinstance(tags, (list, tuple)) or "table-tags" not in tags: 486 ↛ 483line 486 didn't jump to line 483 because the condition on line 486 was always true
487 check_str_fields(
488 wxr, dt, word, lang, pos, form, ["form"], mandatory=True
489 )
490 check_str_list_fields(
491 wxr,
492 dt,
493 word,
494 lang,
495 pos,
496 dt,
497 ["categories", "topics", "wikidata", "wikipedia"],
498 )
499 # Check the "senses" field
500 senses = dt.get("senses") or []
501 if not senses: 501 ↛ 502line 501 didn't jump to line 502 because the condition on line 501 was never true
502 check_error(
503 wxr,
504 dt,
505 word,
506 lang,
507 pos,
508 'missing "senses" in data (must have at least one '
509 'sense, add empty sense with "no-gloss" tag if none '
510 "otherwise available)",
511 )
512 return
513 for sense in senses:
514 check_str_list_fields(
515 wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
516 )
517 # Extra check: should have no-gloss tag if no glosses
518 for field in ("glosses", "raw_glosses"):
519 glosses = sense.get(field) or []
520 if ( 520 ↛ 525line 520 didn't jump to line 525 because the condition on line 520 was never true
521 not glosses
522 and isinstance(sense.get("tags"), str)
523 and "no-gloss" not in sense.get("tags", "").split()
524 ):
525 check_error(
526 wxr,
527 dt,
528 word,
529 lang,
530 pos,
531 "{!r} should have at least one gloss or "
532 '"no-gloss" in "tags"'.format(field),
533 )
534 continue
        check_tags(wxr, dt, word, lang, pos, sense)
        check_str_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            ["categories", "topics", "wikidata", "wikipedia"],
        )
        check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
        if not check_dict_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            [
                "alt_of",
                "form_of",
                "synonyms",
                "antonyms",
                "hypernyms",
                "holonyms",
                "meronyms",
                "coordinate_terms",
                "derived",
                "related",
            ],
        ):
            continue
        for field in ("alt_of", "form_of"):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
                )

        for field in (
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
        ):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_tags(wxr, dt, word, lang, pos, item)
                check_str_fields(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    item,
                    ["english", "roman", "sense", "taxonomic"],
                    mandatory=False,
                    empty_ok=True,
                )
                check_str_list_fields(
                    wxr, dt, word, lang, pos, item, ["topics"]
                )
611 # Check the "sounds" field
612 # We will permit having any number of different types (ipa, enpr, etc)
613 # in the same sound entry or in different sound entries.
614 sounds = dt.get("sounds") or []
615 for item in sounds:
616 check_str_fields(
617 wxr,
618 dt,
619 word,
620 lang,
621 pos,
622 item,
623 ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
624 )
625 check_tags(wxr, dt, word, lang, pos, item)
626 check_str_list_fields(
627 wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
628 )
629 # Check the "translations" field
630 translations = dt.get("translations") or []
631 for item in translations:
632 check_str_fields(
633 wxr, dt, word, lang, pos, item, ["word"], mandatory=True
634 )
635 check_tags(wxr, dt, word, lang, pos, item)
636 check_str_fields(
637 wxr,
638 dt,
639 word,
640 lang,
641 pos,
642 item,
643 [
644 "alt",
645 "code",
646 "english",
647 "lang",
648 "note",
649 "roman",
650 "sense",
651 "taxonomic",
652 ],
653 )
654 if not item.get("code") and not item.get("lang"): 654 ↛ 655line 654 didn't jump to line 655 because the condition on line 654 was never true
655 check_error(
656 wxr,
657 dt,
658 word,
659 lang,
660 pos,
661 '"translations" items must contain at least one '
662 'of "code" and "lang" (normally both): {}'.format(
663 json.dumps(item, sort_keys=True, ensure_ascii=False)
664 ),
665 )
666 # Check the "etymology_templates", "head_templates", and
667 # "inflection_templates" fields
668 for field in [
669 "etymology_templates",
670 "head_templates",
671 "inflection_templates",
672 ]:
673 lst = dt.get(field)
674 if lst is None:
675 continue
676 for item in lst:
677 check_str_fields(
678 wxr, dt, word, lang, pos, item, ["name"], mandatory=True
679 )
680 check_str_fields(
681 wxr,
682 dt,
683 word,
684 lang,
685 pos,
686 item,
687 ["expansion"],
688 # empty_ok=True because there are some templates
689 # that generate empty expansions.
690 mandatory=False,
691 empty_ok=True,
692 )
693 args = item.get("args")
694 if args is None: 694 ↛ 695line 694 didn't jump to line 695 because the condition on line 694 was never true
695 continue
696 if not isinstance(args, dict): 696 ↛ 697line 696 didn't jump to line 697 because the condition on line 696 was never true
697 check_error(
698 wxr,
699 dt,
700 word,
701 lang,
702 pos,
703 '{!r} item "args" value must be a dict: {}'.format(
704 field, json.dumps(args, sort_keys=True)
705 ),
706 )
707 continue
708 for k, v in args.items():
709 if not isinstance(k, str) or not isinstance(v, str): 709 ↛ 710line 709 didn't jump to line 710 because the condition on line 709 was never true
710 check_error(
711 wxr,
712 dt,
713 word,
714 lang,
715 pos,
716 '{!r} item "args" must be a dict with '
717 "string keys and values: {}".format(
718 field, json.dumps(args, sort_keys=True)
719 ),
720 )
721 continue
722 # Check the "descendants" field
723 descendants = dt.get("descendants") or []
724 for item in descendants:
725 check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
726 depth = item.get("depth")
727 if depth is not None and not isinstance(depth, int): 727 ↛ 728line 727 didn't jump to line 728 because the condition on line 727 was never true
728 check_error(
729 wxr,
730 dt,
731 word,
732 lang,
733 pos,
734 '"descentants" field "depth" must be an int',
735 )
736 check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
737 # XXX should check that they are valid templates, perhaps turn
738 # template checking code above into a function
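
# For reference, a minimal entry that passes the checks above could look like
# this (illustrative only):
#   {"word": "example", "lang": "English", "lang_code": "en", "pos": "noun",
#    "senses": [{"glosses": ["an illustrative instance"]}]}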


def init_worker(wxr: WiktextractContext) -> None:
    global worker_wxr
    worker_wxr = wxr
    worker_wxr.reconnect_databases()
    atexit.register(worker_wxr.remove_unpicklable_objects)
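
# init_worker() runs once in each spawned worker process: the
# ProcessPoolExecutor in reprocess_wiktionary() passes a deep copy of the
# context via initargs, and page_handler() then reads it through the
# worker_wxr global instead of re-pickling the context for every page.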


def reprocess_wiktionary(
    wxr: WiktextractContext,
    num_processes: int | None,
    out_f: TextIO,
    human_readable: bool = False,
    search_pattern: str | None = None,
) -> None:
    """Reprocesses the Wiktionary from the sqlite db."""
    logger.info("Second phase - processing pages")

    # Extract thesaurus data. This iterates over thesaurus pages,
    # but is very fast.
    if (
        wxr.config.extract_thesaurus_pages
        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0  # type: ignore[arg-type]
    ):
        extract_thesaurus_data(wxr, num_processes)

    emitted = set()
    process_ns_ids: list[int] = list(
        {
            wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)  # type: ignore[call-overload]
            for ns in wxr.config.extract_ns_names
        }
    )
    start_time = time.time()
    last_time = start_time
    all_page_nums = wxr.wtp.saved_page_nums(
        process_ns_ids, True, "wikitext", search_pattern
    )
    wxr.remove_unpicklable_objects()
    with ProcessPoolExecutor(
        max_workers=num_processes,
        mp_context=get_context("spawn"),
        initializer=init_worker,
        initargs=(deepcopy(wxr),),
    ) as executor:
        wxr.reconnect_databases()
        for processed_pages, (page_data, wtp_stats) in enumerate(
            executor.map(
                page_handler,
                wxr.wtp.get_all_pages(
                    process_ns_ids, True, "wikitext", search_pattern
                ),
                chunksize=100,  # the default of 1 is too slow
            )
        ):
            wxr.config.merge_return(wtp_stats)
            for dt in page_data:
                check_json_data(wxr, dt)
                write_json_data(dt, out_f, human_readable)
                word = dt.get("word")
                lang_code = dt.get("lang_code")
                pos = dt.get("pos")
                if word and lang_code and pos:
                    emitted.add((word, lang_code, pos))
            last_time = estimate_progress(
                processed_pages, all_page_nums, start_time, last_time
            )

    if wxr.config.dump_file_lang_code == "en":
        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
    logger.info("Reprocessing wiktionary complete")


def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
    text: str = page.body if page.body is not None else page.redirect_to  # type: ignore[assignment]
    title = page.title[page.title.find(":") + 1 :]
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    title = re.sub(r"/$", r"__slash__", title)
    title = ns_name + "/" + title
    return title, text
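
# Illustrative mappings (assuming ns_name="Module"): "Module:foo/bar" ->
# "Module/foo/bar"; a lone "." path component becomes "__dot__", ".."
# becomes "__dotdot__", "//" becomes "__slashslash__", and a leading or
# trailing "/" becomes "__slash__", so the result is safe as a tar member
# path.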


def extract_namespace(
    wxr: WiktextractContext, namespace: str, path: str
) -> None:
    """Extracts all pages in the given namespace and writes them to a .tar
    file with the given path."""
    logger.info(
        f"Extracting pages from namespace {namespace} to tar file {path}"
    )
    ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id")  # type: ignore[assignment, call-overload]
    t = time.time()
    with tarfile.open(path, "w") as tarf:
        for page in wxr.wtp.get_all_pages([ns_id]):
            title, text = process_ns_page_title(page, namespace)
            text = text.encode("utf-8")
            f = io.BytesIO(text)
            title += ".txt"
            ti = tarfile.TarInfo(name=title)
            ti.size = len(text)
            # According to documentation, TarInfo.mtime can be int, float,
            # or even None in newer versions, but mypy can't tell because
            # it's not annotated and assumes it can only be int
            ti.mtime = t  # type: ignore[assignment]
            ti.uid = 0
            ti.gid = 0
            ti.type = tarfile.REGTYPE
            tarf.addfile(ti, f)
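
# Example invocation (hypothetical output path):
#   extract_namespace(wxr, "Module", "modules.tar")
# writes every page in the Module namespace as "Module/<title>.txt" inside
# modules.tar.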