Coverage for src/wiktextract/wiktionary.py: 75%
302 statements

# Wiktionary parser for extracting a lexicon and various other information
# from Wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org

import atexit
import io
import json
import os
import re
import tarfile
import tempfile
import time
from concurrent.futures import ProcessPoolExecutor
from copy import deepcopy
from multiprocessing import current_process, get_context
from pathlib import Path
from traceback import format_exc
from typing import TextIO

from wikitextprocessor import Page
from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
from wikitextprocessor.dumpparser import process_dump

from .import_utils import import_extractor_module
from .page import parse_page
from .thesaurus import (
    emit_words_in_thesaurus,
    extract_thesaurus_data,
    thesaurus_linkage_number,
)
from .wxr_context import WiktextractContext
from .wxr_logging import logger


def page_handler(
    page: Page,
) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
    # Make sure there are no newlines or other strange characters in the
    # title. They could cause security problems at several post-processing
    # steps.
    # worker_wxr is a global WiktextractContext set up for this process by
    # init_worker(). Accessing it should never cause an exception, and if
    # it does, we want it to propagate.

    # Helps debug extraction hangs. This writes the title of each page being
    # processed into /tmp/wiktextract*/wiktextract-<pid>. Once a hang has
    # been observed, these files contain the page(s) that hang. They should
    # be checked before aborting the process, as an interrupt might delete
    # them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
        debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(page.title + "\n")

        worker_wxr.wtp.start_page(page.title)
        try:
            title = re.sub(r"[\s\000-\037]+", " ", page.title)
            title = title.strip()
            if page.redirect_to is not None:
                page_data = [
                    {
                        "title": title,
                        "redirect": page.redirect_to,
                        "pos": "hard-redirect",
                    }
                ]
            else:
                # XXX Sign gloss pages?
                start_t = time.time()
                page_data = parse_page(worker_wxr, title, page.body)  # type: ignore[arg-type]
                dur = time.time() - start_t
                if dur > 100:
                    logger.warning(
                        "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
                            dur, title
                        )
                    )

            return page_data, worker_wxr.wtp.to_return()
        except Exception:
            worker_wxr.wtp.error(
                f'=== EXCEPTION while parsing page "{page.title}" '
                f"in process {current_process().name}",
                format_exc(),
                "page_handler_exception",
            )
            return [], worker_wxr.wtp.to_return()


def parse_wiktionary(
    wxr: WiktextractContext,
    dump_path: str,
    num_processes: int | None,
    phase1_only: bool,
    namespace_ids: set[int],
    out_f: TextIO,
    human_readable: bool = False,
    override_folders: list[str] | list[Path] | None = None,
    skip_extract_dump: bool = False,
    save_pages_path: str | Path | None = None,
) -> None:
102 """Parses Wiktionary from the dump file ``path`` (which should point
103 to a "enwiktionary-<date>-pages-articles.xml.bz2" file. This
104 calls `word_cb(data)` for all words defined for languages in `languages`."""
    capture_language_codes = wxr.config.capture_language_codes
    if capture_language_codes is not None:
        assert isinstance(capture_language_codes, (list, tuple, set))
        for x in capture_language_codes:
            assert isinstance(x, str)

    logger.info("First phase - extracting templates, macros, and pages")
    if override_folders is not None:
        override_folders = [Path(folder) for folder in override_folders]
    if save_pages_path is not None:
        save_pages_path = Path(save_pages_path)

    analyze_template_mod = import_extractor_module(
        wxr.wtp.lang_code, "analyze_template"
    )
    process_dump(
        wxr.wtp,
        dump_path,
        namespace_ids,
        override_folders,
        skip_extract_dump,
        save_pages_path,
        analyze_template_mod.analyze_template
        if analyze_template_mod is not None
        else None,
    )

    if not phase1_only:
        reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
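
# A minimal invocation sketch (hypothetical paths and namespace ids; how the
# WiktextractContext is constructed depends on the caller, see wxr_context.py):
#
#   wxr = WiktextractContext(wtp, config)
#   with open("wiktextract.jsonl", "w", encoding="utf-8") as out_f:
#       parse_wiktionary(
#           wxr,
#           "enwiktionary-20240101-pages-articles.xml.bz2",
#           num_processes=None,  # use the default number of workers
#           phase1_only=False,
#           namespace_ids={0, 10, 828},  # e.g. Main, Template, Module
#           out_f=out_f,
#       )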


def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
    if out_f is not None:
        if human_readable:
            out_f.write(
                json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
            )
        else:
            out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")


def estimate_progress(
    processed_pages: int, all_pages: int, start_time: float, last_time: float
) -> float:
    current_time = time.time()
    processed_pages += 1
    if current_time - last_time > 1:
        remaining_pages = all_pages - processed_pages
        estimate_seconds = (
            (current_time - start_time) / processed_pages * remaining_pages
        )
        logger.info(
            "  ... {}/{} pages ({:.1%}) processed, "
            "{:02d}:{:02d}:{:02d} remaining".format(
                processed_pages,
                all_pages,
                processed_pages / all_pages,
                int(estimate_seconds / 3600),
                int(estimate_seconds / 60 % 60),
                int(estimate_seconds % 60),
            )
        )
        last_time = current_time
    return last_time
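
# Worked example of the estimate above (illustrative numbers): if 10,000 of
# 1,000,000 pages have been processed after 120 s, the remaining time is
# 120 / 10,000 * 990,000 = 11,880 s, logged as 03:18:00.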


def check_error(
    wxr: WiktextractContext,
    dt: dict,
    word: str | None,
    lang: str | None,
    pos: str | None,
    msg: str,
    called_from: str | None = None,
) -> None:
    """Formats and outputs an error message about data format checks."""
    if called_from is None:
        called_from = "wiktionary/179/20240425"
    else:
        called_from = "wiktionary/179/20240425" + called_from
    # msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
    prefix = word or ""
    if lang:
        prefix += "/" + lang
    if pos:
        prefix += "/" + pos
    if prefix:
        msg = prefix + ": " + msg
    print(msg)
    config = wxr.config
    if len(config.debugs) > 100000:  # Avoid excessive size
        return
    error_data: ErrorMessageData = {
        "msg": msg,
        "trace": "",
        "title": word,
        "section": lang,
        "subsection": pos,
        "called_from": called_from,
        "path": tuple(),
    }
    config.debugs.append(error_data)
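
# For example (illustrative call), check_error(wxr, dt, "dog", "English",
# "noun", "missing gloss") prints "dog/English/noun: missing gloss" and
# appends a matching ErrorMessageData record to wxr.config.debugs.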


def check_tags(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
) -> None:
    assert isinstance(item, dict)
    tags = item.get("tags")
    if tags is None:
        return
    if not isinstance(tags, (list, tuple)):
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            '"tags" field value must be a list of strings: {}'.format(
                repr(tags)
            ),
        )
        return
    for tag in tags:
        if not isinstance(tag, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"tags" field should only contain strings: {}'.format(
                    repr(tag)
                ),
            )
            continue
        # XXX enable the following later (currently too many bogus tags in
        # non-English editions). Tag values should be standardized across
        # editions, except for uppercase tags (e.g., regional variants).
        if wxr.wtp.lang_code in ("en",):  # Check edition
            from .tags import uppercase_tags, valid_tags

            if tag not in valid_tags and tag not in uppercase_tags:
                if len(tag) > 0 and tag[0].isupper():
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid uppercase tag {tag} not in "
                        "valid_tags or uppercase_tags",
                        called_from="uppercase_tags",
                    )
                else:
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        f"invalid tag {tag} not in valid_tags "
                        "or uppercase_tags",
                    )


def check_str_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
    mandatory: bool = False,
    empty_ok: bool = False,
) -> None:
    """Checks that each of the listed fields contains a non-empty string.
    Non-existent fields are ok unless ``mandatory`` is True."""
    assert isinstance(item, dict)
    for field in fields:
        v = item.get(field)
        if field not in item:
            if mandatory:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} is missing and should be a{} string: {}".format(
                        field,
                        "" if empty_ok else " non-empty",
                        json.dumps(item, sort_keys=True, ensure_ascii=False),
                    ),
                )
            continue
        if not isinstance(v, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a{} string: {}".format(
                    field,
                    "" if empty_ok else " non-empty",
                    json.dumps(item, sort_keys=True, ensure_ascii=False),
                ),
            )
            continue  # Avoid also reporting a non-string as an empty string
        if not v and not empty_ok:
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should contain a non-empty string: {}".format(
                    field, json.dumps(item, sort_keys=True, ensure_ascii=False)
                ),
            )
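
# For example (illustrative): with item = {"word": ""} and fields=["word"],
# the empty value is reported unless empty_ok=True; a missing "word" key is
# reported only when mandatory=True.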


def check_dict_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> bool:
    """Checks that each listed field, if present, is a list of dicts."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of dicts: {}".format(
                    field, json.dumps(lst, sort_keys=True)
                ),
            )
            return False
        for x in lst:
            if not isinstance(x, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of dicts: {}".format(
                        field, json.dumps(lst, sort_keys=True)
                    ),
                )
                return False
    return True


def check_str_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> None:
    """Checks that each of the listed fields contains a list of non-empty
    strings or is not present."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of non-empty strings: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
            continue
        for x in lst:
            if not isinstance(x, str) or not x:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of non-empty strings: {}".format(
                        field, json.dumps(item, sort_keys=True)
                    ),
                )
                break


def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
    """Performs some basic checks on the generated data."""
    word = dt.get("word", dt.get("title"))
    if word is None:
        check_error(
            wxr,
            dt,
            None,
            None,
            None,
            'missing "word" or "title" field in data',
        )
        return
    if "title" in dt:
        return  # redirect pages don't have the following fields
    lang = dt.get("lang")
    if not lang:
        check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
        return
    pos = dt.get("pos")
    if not pos:
        check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
        return
    if not dt.get("lang_code"):
        check_error(
            wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
        )
    check_tags(wxr, dt, word, lang, pos, dt)
    check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
    num = dt.get("etymology_number")
    if num is not None and not isinstance(num, int):
        check_error(
            wxr, dt, word, lang, pos, '"etymology_number" must be an int'
        )
    # Check that certain fields, if present, contain lists of dicts
    if not check_dict_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        [
            "forms",
            "senses",
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
            "sounds",
            "translations",
            "descendants",
            "etymology_templates",
            "head_templates",
            "inflection_templates",
        ],
    ):
        return  # Avoid further processing because it would cause type errors
481 # Check the "forms" field
482 forms = dt.get("forms") or []
483 for form in forms:
484 check_tags(wxr, dt, word, lang, pos, form)
485 tags = dt.get("tags")
486 if not isinstance(tags, (list, tuple)) or "table-tags" not in tags: 486 ↛ 483line 486 didn't jump to line 483 because the condition on line 486 was always true
487 check_str_fields(
488 wxr, dt, word, lang, pos, form, ["form"], mandatory=True
489 )
490 check_str_list_fields(
491 wxr,
492 dt,
493 word,
494 lang,
495 pos,
496 dt,
497 ["categories", "topics", "wikidata", "wikipedia"],
498 )
499 # Check the "senses" field
500 senses = dt.get("senses") or []
501 if not senses: 501 ↛ 502line 501 didn't jump to line 502 because the condition on line 501 was never true
502 check_error(
503 wxr,
504 dt,
505 word,
506 lang,
507 pos,
508 'missing "senses" in data (must have at least one '
509 'sense, add empty sense with "no-gloss" tag if none '
510 "otherwise available)",
511 )
512 return
513 for sense in senses:
514 check_str_list_fields(
515 wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
516 )
517 # Extra check: should have no-gloss tag if no glosses
518 for field in ("glosses", "raw_glosses"):
519 glosses = sense.get(field) or []
520 if ( 520 ↛ 525line 520 didn't jump to line 525 because the condition on line 520 was never true
521 not glosses
522 and isinstance(sense.get("tags"), str)
523 and "no-gloss" not in sense.get("tags", "").split()
524 ):
525 check_error(
526 wxr,
527 dt,
528 word,
529 lang,
530 pos,
531 "{!r} should have at least one gloss or "
532 '"no-gloss" in "tags"'.format(field),
533 )
534 continue
        check_tags(wxr, dt, word, lang, pos, sense)
        check_str_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            ["categories", "topics", "wikidata", "wikipedia"],
        )
        check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
        if not check_dict_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            [
                "alt_of",
                "form_of",
                "synonyms",
                "antonyms",
                "hypernyms",
                "holonyms",
                "meronyms",
                "coordinate_terms",
                "derived",
                "related",
            ],
        ):
            continue
        for field in ("alt_of", "form_of"):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
                )

        for field in (
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
        ):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_tags(wxr, dt, word, lang, pos, item)
                check_str_fields(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    item,
                    ["english", "roman", "sense", "taxonomic"],
                    mandatory=False,
                    empty_ok=True,
                )
                check_str_list_fields(
                    wxr, dt, word, lang, pos, item, ["topics"]
                )
611 # Check the "sounds" field
612 # We will permit having any number of different types (ipa, enpr, etc)
613 # in the same sound entry or in different sound entries.
614 sounds = dt.get("sounds") or []
615 for item in sounds:
616 check_str_fields(
617 wxr,
618 dt,
619 word,
620 lang,
621 pos,
622 item,
623 ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
624 )
625 check_tags(wxr, dt, word, lang, pos, item)
626 check_str_list_fields(
627 wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
628 )
629 # Check the "translations" field
630 translations = dt.get("translations") or []
631 for item in translations:
632 check_str_fields(
633 wxr, dt, word, lang, pos, item, ["word"], mandatory=True
634 )
635 check_tags(wxr, dt, word, lang, pos, item)
636 check_str_fields(
637 wxr,
638 dt,
639 word,
640 lang,
641 pos,
642 item,
643 [
644 "alt",
645 "code",
646 "english",
647 "lang",
648 "note",
649 "roman",
650 "sense",
651 "taxonomic",
652 ],
653 )
654 if not item.get("code") and not item.get("lang"): 654 ↛ 655line 654 didn't jump to line 655 because the condition on line 654 was never true
655 check_error(
656 wxr,
657 dt,
658 word,
659 lang,
660 pos,
661 '"translations" items must contain at least one '
662 'of "code" and "lang" (normally both): {}'.format(
663 json.dumps(item, sort_keys=True, ensure_ascii=False)
664 ),
665 )
666 # Check the "etymology_templates", "head_templates", and
667 # "inflection_templates" fields
668 for field in [
669 "etymology_templates",
670 "head_templates",
671 "inflection_templates",
672 ]:
673 lst = dt.get(field)
674 if lst is None:
675 continue
676 for item in lst:
677 check_str_fields(
678 wxr, dt, word, lang, pos, item, ["name"], mandatory=True
679 )
680 check_str_fields(
681 wxr,
682 dt,
683 word,
684 lang,
685 pos,
686 item,
687 ["expansion"],
688 # empty_ok=True because there are some templates
689 # that generate empty expansions.
690 mandatory=False,
691 empty_ok=True,
692 )
693 args = item.get("args")
694 if args is None: 694 ↛ 695line 694 didn't jump to line 695 because the condition on line 694 was never true
695 continue
696 if not isinstance(args, dict): 696 ↛ 697line 696 didn't jump to line 697 because the condition on line 696 was never true
697 check_error(
698 wxr,
699 dt,
700 word,
701 lang,
702 pos,
703 '{!r} item "args" value must be a dict: {}'.format(
704 field, json.dumps(args, sort_keys=True)
705 ),
706 )
707 continue
708 for k, v in args.items():
709 if not isinstance(k, str) or not isinstance(v, str): 709 ↛ 710line 709 didn't jump to line 710 because the condition on line 709 was never true
710 check_error(
711 wxr,
712 dt,
713 word,
714 lang,
715 pos,
716 '{!r} item "args" must be a dict with '
717 "string keys and values: {}".format(
718 field, json.dumps(args, sort_keys=True)
719 ),
720 )
721 continue
722 # Check the "descendants" field
723 descendants = dt.get("descendants") or []
724 for item in descendants:
725 check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
726 depth = item.get("depth")
727 if depth is not None and not isinstance(depth, int): 727 ↛ 728line 727 didn't jump to line 728 because the condition on line 727 was never true
728 check_error(
729 wxr,
730 dt,
731 word,
732 lang,
733 pos,
734 '"descentants" field "depth" must be an int',
735 )
736 check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
737 # XXX should check that they are valid templates, perhaps turn
738 # template checking code above into a function
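
# For reference, a minimal entry that passes the checks above could look like
# this (illustrative only):
#   {"word": "example", "lang": "English", "lang_code": "en", "pos": "noun",
#    "senses": [{"glosses": ["an illustrative instance"]}]}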


def init_worker(wxr: WiktextractContext) -> None:
    global worker_wxr
    worker_wxr = wxr
    worker_wxr.reconnect_databases()
    atexit.register(worker_wxr.remove_unpicklable_objects)
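
# init_worker() runs once in each spawned worker process: the
# ProcessPoolExecutor in reprocess_wiktionary() passes a deep copy of the
# context via initargs, and page_handler() then reads it through the
# worker_wxr global instead of re-pickling the context for every page.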


def reprocess_wiktionary(
    wxr: WiktextractContext,
    num_processes: int | None,
    out_f: TextIO,
    human_readable: bool = False,
    search_pattern: str | None = None,
) -> None:
    """Reprocesses the Wiktionary from the sqlite db."""
    logger.info("Second phase - processing pages")

    # Extract thesaurus data. This iterates over thesaurus pages,
    # but is very fast.
    if (
        wxr.config.extract_thesaurus_pages
        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0  # type: ignore[arg-type]
    ):
        extract_thesaurus_data(wxr, num_processes)

    emitted = set()
    process_ns_ids: list[int] = list(
        {
            wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)  # type: ignore[call-overload]
            for ns in wxr.config.extract_ns_names
        }
    )
    start_time = time.time()
    last_time = start_time
    all_page_nums = wxr.wtp.saved_page_nums(
        process_ns_ids, True, "wikitext", search_pattern
    )
    wxr.remove_unpicklable_objects()
    with ProcessPoolExecutor(
        max_workers=num_processes,
        mp_context=get_context("spawn"),
        initializer=init_worker,
        initargs=(deepcopy(wxr),),
    ) as executor:
        wxr.reconnect_databases()
        for processed_pages, (page_data, wtp_stats) in enumerate(
            executor.map(
                page_handler,
                wxr.wtp.get_all_pages(
                    process_ns_ids, True, "wikitext", search_pattern
                ),
                chunksize=100,  # the default of 1 is too slow
            )
        ):
            wxr.config.merge_return(wtp_stats)
            for dt in page_data:
                check_json_data(wxr, dt)
                write_json_data(dt, out_f, human_readable)
                word = dt.get("word")
                lang_code = dt.get("lang_code")
                pos = dt.get("pos")
                if word and lang_code and pos:
                    emitted.add((word, lang_code, pos))
            last_time = estimate_progress(
                processed_pages, all_page_nums, start_time, last_time
            )

    if wxr.config.dump_file_lang_code == "en":
        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
    logger.info("Reprocessing wiktionary complete")


def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
    text: str = page.body if page.body is not None else page.redirect_to  # type: ignore[assignment]
    title = page.title[page.title.find(":") + 1 :]
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    title = re.sub(r"/$", r"__slash__", title)
    title = ns_name + "/" + title
    return title, text
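
# Illustrative mappings (assuming ns_name="Module"): "Module:foo/bar" ->
# "Module/foo/bar"; a lone "." path component becomes "__dot__", ".."
# becomes "__dotdot__", "//" becomes "__slashslash__", and a leading or
# trailing "/" becomes "__slash__", so the result is safe as a tar member
# path.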


def extract_namespace(
    wxr: WiktextractContext, namespace: str, path: str
) -> None:
    """Extracts all pages in the given namespace and writes them to a .tar
    file with the given path."""
    logger.info(
        f"Extracting pages from namespace {namespace} to tar file {path}"
    )
    ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id")  # type: ignore[assignment, call-overload]
    t = time.time()
    with tarfile.open(path, "w") as tarf:
        for page in wxr.wtp.get_all_pages([ns_id]):
            title, text = process_ns_page_title(page, namespace)
            text = text.encode("utf-8")
            f = io.BytesIO(text)
            title += ".txt"
            ti = tarfile.TarInfo(name=title)
            ti.size = len(text)
            # According to documentation, TarInfo.mtime can be int, float,
            # or even None in newer versions, but mypy can't tell because
            # it's not annotated and assumes it can only be int
            ti.mtime = t  # type: ignore[assignment]
            ti.uid = 0
            ti.gid = 0
            ti.type = tarfile.REGTYPE
            tarf.addfile(ti, f)
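
# Example invocation (hypothetical output path):
#   extract_namespace(wxr, "Module", "modules.tar")
# writes every page in the Module namespace as "Module/<title>.txt" inside
# modules.tar.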