Coverage for src/wiktextract/wiktionary.py: 70%
300 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Wiktionary parser for extracting a lexicon and various other information
2# from wiktionary. This file contains code to uncompress the Wiktionary
3# dump file and to separate it into individual pages.
4#
5# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org
7import io
8import json
9import os
10import re
11import tarfile
12import tempfile
13import time
14import traceback
15from multiprocessing import Pool, current_process
16from pathlib import Path
17from typing import TextIO
19from wikitextprocessor import Page
20from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
21from wikitextprocessor.dumpparser import process_dump
23from .import_utils import import_extractor_module
24from .page import parse_page
25from .thesaurus import (
26 emit_words_in_thesaurus,
27 extract_thesaurus_data,
28 thesaurus_linkage_number,
29)
30from .wxr_context import WiktextractContext
31from .wxr_logging import logger
34def page_handler(
35 page: Page,
36) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
37 # Make sure there are no newlines or other strange characters in the
38 # title. They could cause security problems at several post-processing
39 # steps.
40 # We've given the page_handler function an extra wxr attribute previously.
41 # This should never cause an exception, and if it does, we want it to.
42 wxr: WiktextractContext = page_handler.wxr # type:ignore[attr-defined]
43 # Helps debug extraction hangs. This writes the title of the page being
44 # processed into /tmp/wiktextract*/wiktextract-<pid>. Once a hang
45 # has been observed, these files contain the page(s) that hang. They should
46 # be checked before aborting the process, as an interrupt might delete them.
47 with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
48 debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
49 with open(debug_path, "w", encoding="utf-8") as f:
50 f.write(page.title + "\n")
52 wxr.wtp.start_page(page.title)
53 try:
54 title = re.sub(r"[\s\000-\037]+", " ", page.title)
55 title = title.strip()
56 if page.redirect_to is not None:
57 page_data = [
58 {
59 "title": title,
60 "redirect": page.redirect_to,
61 "pos": "hard-redirect",
62 }
63 ]
64 else:
65 # XXX Sign gloss pages?
66 start_t = time.time()
67 page_data = parse_page(wxr, title, page.body) # type: ignore[arg-type]
68 dur = time.time() - start_t
69 if dur > 100:
70 logger.warning(
71 "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
72 dur, title
73 )
74 )
76 return page_data, wxr.wtp.to_return()
77 except Exception:
78 wxr.wtp.error(
79 f'=== EXCEPTION while parsing page "{page.title}" '
80 f"in process {current_process().name}",
81 traceback.format_exc(),
82 "page_handler_exception",
83 )
84 return [], wxr.wtp.to_return()
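# For orientation (not part of the module's code): page_handler() is consumed by
# reprocess_wiktionary() below through Pool.imap_unordered(), and each call yields
# the extracted entries for one page plus the per-process error counters, roughly:
#
#     page_data, wtp_stats = page_handler(page)   # requires page_handler.wxr,
#                                                  # set by init_worker_process()
#     # page_data -> e.g. [{"word": ..., "lang": ..., "pos": ..., "senses": [...]}, ...]
#     # wtp_stats -> CollatedErrorReturnData, merged via wxr.config.merge_return()
#
# The exact entry fields come from parse_page(); the dict shown is only a sketch.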
87def parse_wiktionary(
88 wxr: WiktextractContext,
89 dump_path: str,
90 num_processes: int | None,
91 phase1_only: bool,
92 namespace_ids: set[int],
93 out_f: TextIO,
94 human_readable: bool = False,
95 override_folders: list[str] | list[Path] | None = None,
96 skip_extract_dump: bool = False,
97 save_pages_path: str | Path | None = None,
98) -> None:
99 """Parses Wiktionary from the dump file ``path`` (which should point
100 to a "enwiktionary-<date>-pages-articles.xml.bz2" file. This
101 calls `word_cb(data)` for all words defined for languages in `languages`."""
102 capture_language_codes = wxr.config.capture_language_codes
103 if capture_language_codes is not None:  # 103 ↛ 108: condition was always true
104 assert isinstance(capture_language_codes, (list, tuple, set))
105 for x in capture_language_codes:
106 assert isinstance(x, str)
108 logger.info("First phase - extracting templates, macros, and pages")
109 if override_folders is not None:  # 109 ↛ 110: condition was never true
110 override_folders = [Path(folder) for folder in override_folders]
111 if save_pages_path is not None:  # 111 ↛ 112: condition was never true
112 save_pages_path = Path(save_pages_path)
114 analyze_template_mod = import_extractor_module(
115 wxr.wtp.lang_code, "analyze_template"
116 )
117 process_dump(
118 wxr.wtp,
119 dump_path,
120 namespace_ids,
121 override_folders,
122 skip_extract_dump,
123 save_pages_path,
124 analyze_template_mod.analyze_template
125 if analyze_template_mod is not None
126 else None,
127 )
129 if not phase1_only:  # 129 ↛ exit: condition was always true
130 reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
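# Illustrative call (a sketch only: constructing the WiktextractContext and
# resolving namespace ids happens in the command-line driver, not in this file,
# and the file name and namespace ids below are assumptions):
#
#     with open("wiktionary.jsonl", "w", encoding="utf-8") as out_f:
#         parse_wiktionary(
#             wxr,                     # previously initialized WiktextractContext
#             "enwiktionary-latest-pages-articles.xml.bz2",
#             num_processes=4,
#             phase1_only=False,
#             namespace_ids={0, 110},  # assumed ids: main and Thesaurus namespaces
#             out_f=out_f,
#         )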
133def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
134 if out_f is not None:  # 134 ↛ exit: condition was always true
135 if human_readable:  # 135 ↛ 136: condition was never true
136 out_f.write(
137 json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
138 )
139 else:
140 out_f.write(json.dumps(data, ensure_ascii=False))
141 out_f.write("\n")
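# With human_readable=False (the default) this produces JSON Lines output, one
# entry per line, so the result can be read back with something like:
#
#     with open("wiktionary.jsonl", encoding="utf-8") as f:  # file name is illustrative
#         entries = [json.loads(line) for line in f]
#
# With human_readable=True the entries are pretty-printed and span multiple lines.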
144def estimate_progress(
145 processed_pages: int, all_pages: int, start_time: float, last_time: float
146) -> float:
147 current_time = time.time()
148 processed_pages += 1
149 if current_time - last_time > 1:
150 remaining_pages = all_pages - processed_pages
151 estimate_seconds = (
152 (current_time - start_time) / processed_pages * remaining_pages
153 )
154 logger.info(
155 " ... {}/{} pages ({:.1%}) processed, "
156 "{:02d}:{:02d}:{:02d} remaining".format(
157 processed_pages,
158 all_pages,
159 processed_pages / all_pages,
160 int(estimate_seconds / 3600),
161 int(estimate_seconds / 60 % 60),
162 int(estimate_seconds % 60),
163 )
164 )
165 last_time = current_time
166 return last_time
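# Worked example of the estimate above: if 1000 of 5000 pages have been
# processed 120 s after the start, the projection is 120 / 1000 * 4000 = 480 s,
# logged as " ... 1000/5000 pages (20.0%) processed, 00:08:00 remaining".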
169def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
170 wxr.reconnect_databases()
171 worker_func.wxr = wxr
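# This initializer runs once in each worker spawned by the Pool in
# reprocess_wiktionary(): it reopens the database connections dropped by
# remove_unpicklable_objects() before the fork and stashes the context on the
# handler function itself, which is where page_handler() reads it back from
# (the page_handler.wxr attribute above).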
174def check_error(
175 wxr: WiktextractContext,
176 dt: dict,
177 word: str | None,
178 lang: str | None,
179 pos: str | None,
180 msg: str,
181 called_from: str | None = None,
182) -> None:
183 """Formats and outputs an error message about data format checks."""
184 if called_from is None:  # 184 ↛ 187: condition was always true
185 called_from = "wiktionary/179/20240425"
186 else:
187 called_from = "wiktionary/179/20240425" + called_from
188 msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
189 prefix = word or ""
190 if lang:  # 190 ↛ 192: condition was always true
191 prefix += "/" + lang
192 if pos:  # 192 ↛ 194: condition was always true
193 prefix += "/" + pos
194 if prefix:  # 194 ↛ 196: condition was always true
195 msg = prefix + ": " + msg
196 print(msg)
197 config = wxr.config
198 if len(config.debugs) > 100000:  # Avoid excessive size; 198 ↛ 199: condition was never true
199 return
200 error_data: ErrorMessageData = {
201 "msg": msg,
202 "trace": "",
203 "title": word,
204 "section": lang,
205 "subsection": pos,
206 "called_from": called_from,
207 "path": tuple(),
208 }
209 config.debugs.append(error_data)
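# Illustration with a hypothetical call: check_error(wxr, {"word": "foo"},
# "foo", "English", "noun", "missing gloss") would print
#
#     foo/English/noun: missing gloss: {"word": "foo"}
#
# and append an ErrorMessageData record with title="foo", section="English",
# subsection="noun" to wxr.config.debugs.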
212def check_tags(
213 wxr: WiktextractContext,
214 dt: dict,
215 word: str,
216 lang: str,
217 pos: str,
218 item: dict,
219) -> None:
220 assert isinstance(item, dict)
221 tags = item.get("tags")
222 if tags is None:
223 return
224 if not isinstance(tags, (list, tuple)):  # 224 ↛ 225: condition was never true
225 check_error(
226 wxr,
227 dt,
228 word,
229 lang,
230 pos,
231 '"tags" field value must be a list of strings: {}'.format(
232 repr(tags)
233 ),
234 )
235 return
236 for tag in tags:
237 if not isinstance(tag, str):  # 237 ↛ 238: condition was never true
238 check_error(
239 wxr,
240 dt,
241 word,
242 lang,
243 pos,
244 '"tags" field should only contain strings: {}'.format(
245 repr(tag)
246 ),
247 )
248 continue
249 # XXX enable the following later (currently too many bogus tags in
250 # non-English editions). Tag values should be standardized across
251 # editions, except for uppercase tags (e.g., regional variants).
252 if wxr.wtp.lang_code in ("en",):  # Check edition; 252 ↛ 236: condition was always true
253 from .tags import uppercase_tags, valid_tags
255 if tag not in valid_tags and tag not in uppercase_tags:  # 255 ↛ 256: condition was never true
256 if len(tag) > 0 and tag[0].isupper():
257 check_error(
258 wxr,
259 dt,
260 word,
261 lang,
262 pos,
263 f"invalid uppercase tag {tag} not in or uppercase_tags",
264 called_from="uppercase_tags",
265 )
266 else:
267 check_error(
268 wxr,
269 dt,
270 word,
271 lang,
272 pos,
273 f"invalid tag {tag} not in valid_tags "
274 "or uppercase_tags",
275 )
278def check_str_fields(
279 wxr: WiktextractContext,
280 dt: dict,
281 word: str,
282 lang: str,
283 pos: str,
284 item: dict,
285 fields: list[str],
286 mandatory: bool = False,
287 empty_ok: bool = False,
288) -> None:
289 """Checks that each of the listed fields contains a non-empty string.
290 Non-existent fields are ok unless ``mandatory`` is True."""
291 assert isinstance(item, dict)
292 for field in fields:
293 v = item.get(field)
294 if v is None:
295 if mandatory:
296 check_error(
297 wxr,
298 dt,
299 word,
300 lang,
301 pos,
302 "{!r} should be a{} string (it is a "
303 "mandatory field): {}".format(
304 field,
305 "" if empty_ok else " non-empty",
306 json.dumps(item, sort_keys=True),
307 ),
308 )
309 continue
310 if not isinstance(v, str):  # 310 ↛ 311: condition was never true
311 check_error(
312 wxr,
313 dt,
314 word,
315 lang,
316 pos,
317 "{!r} should be a{} string: {}".format(
318 field,
319 "" if empty_ok else " non-empty",
320 json.dumps(item, sort_keys=True),
321 ),
322 )
323 if not v and not empty_ok:
324 check_error(
325 wxr,
326 dt,
327 word,
328 lang,
329 pos,
330 "{!r} should contain a non-empty string: {}".format(
331 field, json.dumps(item, sort_keys=True)
332 ),
333 )
336def check_dict_list_fields(
337 wxr: WiktextractContext,
338 dt: dict,
339 word: str,
340 lang: str,
341 pos: str,
342 item: dict,
343 fields: list[str],
344) -> bool:
345 """Checks that each listed field, if present, is a list of dicts."""
346 assert isinstance(item, dict)
347 for field in fields:
348 lst = item.get(field)
349 if lst is None:
350 continue
351 if not isinstance(lst, (list, tuple)):  # 351 ↛ 352: condition was never true
352 check_error(
353 wxr,
354 dt,
355 word,
356 lang,
357 pos,
358 "{!r} should be a list of dicts: {}".format(
359 field, json.dumps(lst, sort_keys=True)
360 ),
361 )
362 return False
363 for x in lst:
364 if not isinstance(x, dict):  # 364 ↛ 365: condition was never true
365 check_error(
366 wxr,
367 dt,
368 word,
369 lang,
370 pos,
371 "{!r} should be a list of dicts: {}".format(
372 field, json.dumps(lst, sort_keys=True)
373 ),
374 )
375 return False
376 return True
379def check_str_list_fields(
380 wxr: WiktextractContext,
381 dt: dict,
382 word: str,
383 lang: str,
384 pos: str,
385 item: dict,
386 fields: list[str],
387) -> None:
388 """Checks that each of the listed fields contains a list of non-empty
389 strings or is not present."""
390 assert isinstance(item, dict)
391 for field in fields:
392 lst = item.get(field)
393 if lst is None:
394 continue
395 if not isinstance(lst, (list, tuple)):  # 395 ↛ 396: condition was never true
396 check_error(
397 wxr,
398 dt,
399 word,
400 lang,
401 pos,
402 "{!r} should be a list of dicts: {}".format(
403 field, json.dumps(item, sort_keys=True)
404 ),
405 )
406 continue
407 for x in lst:
408 if not isinstance(x, str) or not x:  # 408 ↛ 409: condition was never true
409 check_error(
410 wxr,
411 dt,
412 word,
413 lang,
414 pos,
415 "{!r} should be a list of non-empty strings: {}".format(
416 field, json.dumps(item, sort_keys=True)
417 ),
418 )
419 break
422def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
423 """Performs some basic checks on the generated data."""
424 word = dt.get("word", dt.get("title"))
425 if word is None:  # 425 ↛ 426: condition was never true
426 check_error(
427 wxr,
428 dt,
429 None,
430 None,
431 None,
432 'missing "word" or "title" field in data',
433 )
434 return
435 if "title" in dt:
436 return # redirect pages don't have following fields
437 lang = dt.get("lang")
438 if not lang:  # 438 ↛ 439: condition was never true
439 check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
440 return
441 pos = dt.get("pos")
442 if not pos:  # 442 ↛ 443: condition was never true
443 check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
444 return
445 if not dt.get("lang_code"):  # 445 ↛ 446: condition was never true
446 check_error(
447 wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
448 )
449 check_tags(wxr, dt, word, lang, pos, dt)
450 check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
451 num = dt.get("etymology_number")
452 if num is not None and not isinstance(num, int):  # 452 ↛ 453: condition was never true
453 check_error(
454 wxr, dt, word, lang, pos, '"etymology_number" must be an int'
455 )
456 # Check that certain fields, if present, contain lists of dicts
457 if not check_dict_list_fields(  # 457 ↛ 483: condition was never true
458 wxr,
459 dt,
460 word,
461 lang,
462 pos,
463 dt,
464 [
465 "forms",
466 "senses",
467 "synonyms",
468 "antonyms",
469 "hypernyms",
470 "holonyms",
471 "meronyms",
472 "coordinate_terms",
473 "derived",
474 "related",
475 "sounds",
476 "translations",
477 "descendants",
478 "etymology_templates",
479 "head_templates",
480 "inflection_templates",
481 ],
482 ):
483 return # Avoid further processing because it would cause type errors
484 # Check the "forms" field
485 forms = dt.get("forms") or []
486 for form in forms:
487 check_tags(wxr, dt, word, lang, pos, form)
488 tags = dt.get("tags")
489 if not isinstance(tags, (list, tuple)) or "table-tags" not in tags:  # 489 ↛ 486: condition was always true
490 check_str_fields(
491 wxr, dt, word, lang, pos, form, ["form"], mandatory=True
492 )
493 check_str_list_fields(
494 wxr,
495 dt,
496 word,
497 lang,
498 pos,
499 dt,
500 ["categories", "topics", "wikidata", "wikipedia"],
501 )
502 # Check the "senses" field
503 senses = dt.get("senses") or []
504 if not senses:  # 504 ↛ 505: condition was never true
505 check_error(
506 wxr,
507 dt,
508 word,
509 lang,
510 pos,
511 'missing "senses" in data (must have at least one '
512 'sense, add empty sense with "no-gloss" tag if none '
513 "otherwise available)",
514 )
515 return
516 for sense in senses:
517 check_str_list_fields(
518 wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
519 )
520 # Extra check: should have no-gloss tag if no glosses
521 for field in ("glosses", "raw_glosses"):
522 glosses = sense.get(field) or []
523 if (  # 523 ↛ 528: condition was never true
524 not glosses
525 and isinstance(sense.get("tags"), str)
526 and "no-gloss" not in sense.get("tags", "").split()
527 ):
528 check_error(
529 wxr,
530 dt,
531 word,
532 lang,
533 pos,
534 "{!r} should have at least one gloss or "
535 '"no-gloss" in "tags"'.format(field),
536 )
537 continue
538 check_tags(wxr, dt, word, lang, pos, sense)
539 check_str_list_fields(
540 wxr,
541 dt,
542 word,
543 lang,
544 pos,
545 sense,
546 ["categories", "topics", "wikidata", "wikipedia"],
547 )
548 check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
549 if not check_dict_list_fields(  # 549 ↛ 569: condition was never true
550 wxr,
551 dt,
552 word,
553 lang,
554 pos,
555 sense,
556 [
557 "alt_of",
558 "form_of",
559 "synonyms",
560 "antonyms",
561 "hypernyms",
562 "holonyms",
563 "meronyms",
564 "coordinate_terms",
565 "derived",
566 "related",
567 ],
568 ):
569 continue
570 for field in ("alt_of", "form_of"):
571 lst = sense.get(field)
572 if lst is None:
573 continue
574 for item in lst:
575 check_str_fields(
576 wxr, dt, word, lang, pos, item, ["word"], mandatory=True
577 )
578 check_str_fields(
579 wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
580 )
582 for field in (
583 "synonyms",
584 "antonyms",
585 "hypernyms",
586 "holonyms",
587 "meronyms",
588 "coordinate_terms",
589 "derived",
590 "related",
591 ):
592 lst = sense.get(field)
593 if lst is None:
594 continue
595 for item in lst:
596 check_str_fields(
597 wxr, dt, word, lang, pos, item, ["word"], mandatory=True
598 )
599 check_tags(wxr, dt, word, lang, pos, item)
600 check_str_fields(
601 wxr,
602 dt,
603 word,
604 lang,
605 pos,
606 item,
607 ["english", "roman", "sense", "taxonomic"],
608 mandatory=False,
609 empty_ok=True,
610 )
611 check_str_list_fields(
612 wxr, dt, word, lang, pos, item, ["topics"]
613 )
614 # Check the "sounds" field
615 # We will permit having any number of different types (ipa, enpr, etc)
616 # in the same sound entry or in different sound entries.
617 sounds = dt.get("sounds") or []
618 for item in sounds:
619 check_str_fields(
620 wxr,
621 dt,
622 word,
623 lang,
624 pos,
625 item,
626 ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
627 )
628 check_tags(wxr, dt, word, lang, pos, item)
629 check_str_list_fields(
630 wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
631 )
632 # Check the "translations" field
633 translations = dt.get("translations") or []
634 for item in translations:
635 check_str_fields(
636 wxr, dt, word, lang, pos, item, ["word"], mandatory=True
637 )
638 check_tags(wxr, dt, word, lang, pos, item)
639 check_str_fields(
640 wxr,
641 dt,
642 word,
643 lang,
644 pos,
645 item,
646 [
647 "alt",
648 "code",
649 "english",
650 "lang",
651 "note",
652 "roman",
653 "sense",
654 "taxonomic",
655 ],
656 )
657 if not item.get("code") and not item.get("lang"):  # 657 ↛ 658: condition was never true
658 check_error(
659 wxr,
660 dt,
661 word,
662 lang,
663 pos,
664 '"translations" items must contain at least one '
665 'of "code" and "lang" (normally both): {}'.format(
666 json.dumps(item, sort_keys=True, ensure_ascii=False)
667 ),
668 )
669 # Check the "etymology_templates", "head_templates", and
670 # "inflection_templates" fields
671 for field in [
672 "etymology_templates",
673 "head_templates",
674 "inflection_templates",
675 ]:
676 lst = dt.get(field)
677 if lst is None:
678 continue
679 for item in lst:
680 check_str_fields(
681 wxr, dt, word, lang, pos, item, ["name"], mandatory=True
682 )
683 check_str_fields(
684 wxr,
685 dt,
686 word,
687 lang,
688 pos,
689 item,
690 ["expansion"],
691 # empty_ok=True because there are some templates
692 # that generate empty expansions.
693 mandatory=False,
694 empty_ok=True,
695 )
696 args = item.get("args")
697 if args is None:  # 697 ↛ 698: condition was never true
698 continue
699 if not isinstance(args, dict):  # 699 ↛ 700: condition was never true
700 check_error(
701 wxr,
702 dt,
703 word,
704 lang,
705 pos,
706 '{!r} item "args" value must be a dict: {}'.format(
707 field, json.dumps(args, sort_keys=True)
708 ),
709 )
710 continue
711 for k, v in args.items():
712 if not isinstance(k, str) or not isinstance(v, str):  # 712 ↛ 713: condition was never true
713 check_error(
714 wxr,
715 dt,
716 word,
717 lang,
718 pos,
719 '{!r} item "args" must be a dict with '
720 "string keys and values: {}".format(
721 field, json.dumps(args, sort_keys=True)
722 ),
723 )
724 continue
725 # Check the "descendants" field
726 descendants = dt.get("descendants") or []
727 for item in descendants:
728 check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
729 depth = item.get("depth")
730 if depth is not None and not isinstance(depth, int):  # 730 ↛ 731: condition was never true
731 check_error(
732 wxr,
733 dt,
734 word,
735 lang,
736 pos,
737 '"descentants" field "depth" must be an int',
738 )
739 check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
740 # XXX should check that they are valid templates, perhaps turn
741 # template checking code above into a function
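# A minimal entry that passes the checks above (illustrative only; real entries
# produced by parse_page() carry many more fields):
#
#     {
#         "word": "example",
#         "lang": "English",
#         "lang_code": "en",
#         "pos": "noun",
#         "senses": [{"glosses": ["something that serves as a pattern"]}],
#     }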
744def reprocess_wiktionary(
745 wxr: WiktextractContext,
746 num_processes: int | None,
747 out_f: TextIO,
748 human_readable: bool = False,
749 search_pattern: str | None = None,
750) -> None:
751 """Reprocesses the Wiktionary from the sqlite db."""
752 logger.info("Second phase - processing pages")
754 # Extract thesaurus data. This iterates over thesaurus pages,
755 # but is very fast.
756 if (  # 756 ↛ 762: condition was always true
757 wxr.config.extract_thesaurus_pages
758 and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0 # type: ignore[arg-type]
759 ):
760 extract_thesaurus_data(wxr, num_processes)
762 emitted = set()
763 process_ns_ids: list[int] = list(
764 {
765 wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0) # type: ignore[call-overload]
766 for ns in wxr.config.extract_ns_names
767 }
768 )
769 start_time = time.time()
770 last_time = start_time
771 all_page_nums = wxr.wtp.saved_page_nums(
772 process_ns_ids, True, "wikitext", search_pattern
773 )
774 wxr.remove_unpicklable_objects()
775 with Pool(num_processes, init_worker_process, (page_handler, wxr)) as pool:
776 wxr.reconnect_databases(False)
777 for processed_pages, (page_data, wtp_stats) in enumerate(
778 pool.imap_unordered(
779 page_handler,
780 wxr.wtp.get_all_pages(
781 process_ns_ids, True, "wikitext", search_pattern
782 ),
783 )
784 ):
785 wxr.config.merge_return(wtp_stats)
786 for dt in page_data:
787 check_json_data(wxr, dt)
788 write_json_data(dt, out_f, human_readable)
789 word = dt.get("word")
790 lang_code = dt.get("lang_code")
791 pos = dt.get("pos")
792 if word and lang_code and pos:
793 emitted.add((word, lang_code, pos))
794 last_time = estimate_progress(
795 processed_pages, all_page_nums, start_time, last_time
796 )
797 if wxr.config.dump_file_lang_code == "en":  # 797 ↛ 799: condition was always true
798 emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
799 logger.info("Reprocessing wiktionary complete")
802def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
803 text: str = page.body if page.body is not None else page.redirect_to # type: ignore[assignment]
804 title = page.title[page.title.find(":") + 1 :]
805 title = re.sub(r"(^|/)\.($|/)", r"\1__dotdot__\2", title)
806 title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
807 title = title.replace("//", "__slashslash__")
808 title = re.sub(r"^/", r"__slash__", title)
809 title = re.sub(r"/$", r"__slash__", title)
810 title = ns_name + "/" + title
811 return title, text
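# Example of the escaping above: a page titled "Module:foo//bar" processed with
# ns_name "Module" yields the member name "Module/foo__slashslash__bar"
# (extract_namespace() below appends ".txt" before adding it to the tar).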
814def extract_namespace(
815 wxr: WiktextractContext, namespace: str, path: str
816) -> None:
817 """Extracts all pages in the given namespace and writes them to a .tar
818 file with the given path."""
819 logger.info(
820 f"Extracting pages from namespace {namespace} to tar file {path}"
821 )
822 ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id") # type: ignore[assignment, call-overload]
823 t = time.time()
824 with tarfile.open(path, "w") as tarf:
825 for page in wxr.wtp.get_all_pages([ns_id]):
826 title, text = process_ns_page_title(page, namespace)
827 text = text.encode("utf-8")
828 f = io.BytesIO(text)
829 title += ".txt"
830 ti = tarfile.TarInfo(name=title)
831 ti.size = len(text)
832 # According to documentation, TarInfo.mtime can be int, float,
833 # or even None in newer versions, but mypy can't tell because
834 # it's not annotated and assumes it can only be int
835 ti.mtime = t # type: ignore[assignment]
836 ti.uid = 0
837 ti.gid = 0
838 ti.type = tarfile.REGTYPE
839 tarf.addfile(ti, f)
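# Illustrative use (a sketch; the namespace name must exist in
# wxr.wtp.NAMESPACE_DATA for the edition being processed, and the tar path is
# an assumption):
#
#     extract_namespace(wxr, "Module", "wiktionary-modules.tar")
#
# which writes each page of that namespace as a separate .txt member of the tar.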