Coverage for src/wiktextract/wiktionary.py: 72%
295 statements

# Wiktionary parser for extracting a lexicon and various other information
# from Wiktionary. This file contains code to uncompress the Wiktionary
# dump file and to separate it into individual pages.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE and https://ylonen.org

import io
import json
import os
import re
import tarfile
import tempfile
import time
import traceback
from multiprocessing import Pool, current_process
from pathlib import Path
from typing import TextIO

from wikitextprocessor import Page
from wikitextprocessor.core import CollatedErrorReturnData, ErrorMessageData
from wikitextprocessor.dumpparser import process_dump

from .import_utils import import_extractor_module
from .page import parse_page
from .thesaurus import (
    emit_words_in_thesaurus,
    extract_thesaurus_data,
    thesaurus_linkage_number,
)
from .wxr_context import WiktextractContext
from .wxr_logging import logger


def page_handler(
    page: Page,
) -> tuple[list[dict[str, str]], CollatedErrorReturnData]:
    # Make sure there are no newlines or other strange characters in the
    # title. They could cause security problems at several post-processing
    # steps.
    # We've given the page_handler function an extra wxr attribute previously.
    # This should never cause an exception, and if it does, we want it to.
    wxr: WiktextractContext = page_handler.wxr  # type:ignore[attr-defined]
    # Helps debug extraction hangs. This writes the title of each page being
    # processed into /tmp/wiktextract*/wiktextract-*. Once a hang has been
    # observed, these files contain the page(s) that hang. They should be
    # checked before aborting the process, as an interrupt might delete them.
    with tempfile.TemporaryDirectory(prefix="wiktextract") as tmpdirname:
        debug_path = "{}/wiktextract-{}".format(tmpdirname, os.getpid())
        with open(debug_path, "w", encoding="utf-8") as f:
            f.write(page.title + "\n")

        wxr.wtp.start_page(page.title)
        try:
            title = re.sub(r"[\s\000-\037]+", " ", page.title)
            title = title.strip()
            if page.redirect_to is not None:
                page_data = [
                    {
                        "title": title,
                        "redirect": page.redirect_to,
                        "pos": "hard-redirect",
                    }
                ]
            else:
                # XXX Sign gloss pages?
                start_t = time.time()
                page_data = parse_page(wxr, title, page.body)  # type: ignore[arg-type]
                dur = time.time() - start_t
                if dur > 100:
                    logger.warning(
                        "====== WARNING: PARSING PAGE TOOK {:.1f}s: {}".format(
                            dur, title
                        )
                    )

            return page_data, wxr.wtp.to_return()
        except Exception:
            wxr.wtp.error(
                f'=== EXCEPTION while parsing page "{page.title}" '
                f"in process {current_process().name}",
                traceback.format_exc(),
                "page_handler_exception",
            )
            return [], wxr.wtp.to_return()
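
# Illustrative sketch (inputs assumed, not part of the module): for a
# redirect page, page_handler returns a single "hard-redirect" record
# together with the collated error data from the per-process context:
#
#   page_data, errors = page_handler(redirect_page)
#   # page_data == [{"title": "colour", "redirect": "color",
#   #                "pos": "hard-redirect"}]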


def parse_wiktionary(
    wxr: WiktextractContext,
    dump_path: str,
    num_processes: int | None,
    phase1_only: bool,
    namespace_ids: set[int],
    out_f: TextIO,
    human_readable: bool = False,
    override_folders: list[str] | list[Path] | None = None,
    skip_extract_dump: bool = False,
    save_pages_path: str | Path | None = None,
) -> None:
    """Parses Wiktionary from the dump file ``dump_path`` (which should
    point to an "enwiktionary-<date>-pages-articles.xml.bz2" file) and
    writes the extracted data for each page to ``out_f`` as JSON."""
    capture_language_codes = wxr.config.capture_language_codes
    if capture_language_codes is not None:
        assert isinstance(capture_language_codes, (list, tuple, set))
        for x in capture_language_codes:
            assert isinstance(x, str)

    logger.info("First phase - extracting templates, macros, and pages")
    if override_folders is not None:
        override_folders = [Path(folder) for folder in override_folders]
    if save_pages_path is not None:
        save_pages_path = Path(save_pages_path)

    analyze_template_mod = import_extractor_module(
        wxr.wtp.lang_code, "analyze_template"
    )
    process_dump(
        wxr.wtp,
        dump_path,
        namespace_ids,
        override_folders,
        skip_extract_dump,
        save_pages_path,
        analyze_template_mod.analyze_template
        if analyze_template_mod is not None
        else None,
    )

    if not phase1_only:
        reprocess_wiktionary(wxr, num_processes, out_f, human_readable)
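
# Hedged usage sketch (paths and namespace ids assumed): a typical
# end-to-end run over the main (0) and Template (10) namespaces of an
# English dump might look like:
#
#   with open("wiktextract.jsonl", "w", encoding="utf-8") as out_f:
#       parse_wiktionary(
#           wxr,
#           "enwiktionary-latest-pages-articles.xml.bz2",
#           num_processes=4,
#           phase1_only=False,
#           namespace_ids={0, 10},
#           out_f=out_f,
#       )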


def write_json_data(data: dict, out_f: TextIO, human_readable: bool) -> None:
    if out_f is not None:
        if human_readable:
            out_f.write(
                json.dumps(data, indent=2, sort_keys=True, ensure_ascii=False)
            )
        else:
            out_f.write(json.dumps(data, ensure_ascii=False))
        out_f.write("\n")
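
# Example (data assumed): with human_readable=False each record becomes a
# single JSON line, so the output file is JSON Lines and can be streamed:
#
#   write_json_data({"word": "sample", "lang": "English"}, out_f, False)
#   # writes: {"word": "sample", "lang": "English"}\n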


def estimate_progress(
    processed_pages: int, all_pages: int, start_time: float, last_time: float
) -> float:
    current_time = time.time()
    processed_pages += 1
    if current_time - last_time > 1:
        remaining_pages = all_pages - processed_pages
        estimate_seconds = (
            (current_time - start_time) / processed_pages * remaining_pages
        )
        logger.info(
            " ... {}/{} pages ({:.1%}) processed, "
            "{:02d}:{:02d}:{:02d} remaining".format(
                processed_pages,
                all_pages,
                processed_pages / all_pages,
                int(estimate_seconds / 3600),
                int(estimate_seconds / 60 % 60),
                int(estimate_seconds % 60),
            )
        )
        last_time = current_time
    return last_time
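
# Worked example of the ETA arithmetic above (numbers assumed): after 2500
# of 10000 pages in 500 seconds, the average rate is 0.2 s/page, so the
# remaining 7500 pages take an estimated 1500 s, formatted as 00:25:00:
#
#   estimate_seconds = 500 / 2500 * (10000 - 2500)  # 1500.0
#   int(estimate_seconds / 3600), int(estimate_seconds / 60 % 60)  # (0, 25)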


def init_worker_process(worker_func, wxr: WiktextractContext) -> None:
    wxr.reconnect_databases()
    worker_func.wxr = wxr


def check_error(
    wxr: WiktextractContext,
    dt: dict,
    word: str | None,
    lang: str | None,
    pos: str | None,
    msg: str,
) -> None:
    """Formats and outputs an error message about data format checks."""
    msg += ": " + json.dumps(dt, sort_keys=True, ensure_ascii=False)
    prefix = word or ""
    if lang:
        prefix += "/" + lang
    if pos:
        prefix += "/" + pos
    if prefix:
        msg = prefix + ": " + msg
    print(msg)
    config = wxr.config
    if len(config.debugs) > 100000:  # Avoid excessive size
        return
    error_data: ErrorMessageData = {
        "msg": msg,
        "trace": "",
        "title": word,
        "section": lang,
        "subsection": pos,
        "called_from": "wiktionary/179/20240425",
        "path": tuple(),
    }
    config.debugs.append(error_data)
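
# Example (arguments assumed): messages are prefixed with word/lang/pos and
# suffixed with the offending data, so
#
#   check_error(wxr, {}, "color", "English", "noun", "bad field")
#
# prints: color/English/noun: bad field: {}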


def check_tags(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
) -> None:
    assert isinstance(item, dict)
    tags = item.get("tags")
    if tags is None:
        return
    if not isinstance(tags, (list, tuple)):
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            '"tags" field value must be a list of strings: {}'.format(
                repr(tags)
            ),
        )
        return
    for tag in tags:
        if not isinstance(tag, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"tags" field should only contain strings: {}'.format(
                    repr(tag)
                ),
            )
            continue
        # XXX enable the following later (currently too many bogus tags in
        # non-English editions). Tag values should be standardized across
        # editions, except for uppercase tags (e.g., regional variants).
        if wxr.wtp.lang_code in ("en",):  # Check edition
            from .tags import valid_tags

            if tag not in valid_tags:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "invalid tag {} not in valid_tags (or "
                    "uppercase_tags)".format(repr(tag)),
                )
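
# Example (items assumed): {"tags": ["plural", "informal"]} passes; a bare
# string such as {"tags": "plural"} or a non-string member is reported via
# check_error, and on the English edition each tag must additionally appear
# in valid_tags.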


def check_str_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
    mandatory: bool = False,
    empty_ok: bool = False,
) -> None:
    """Checks that each of the listed fields contains a non-empty string.
    Non-existent fields are ok unless ``mandatory`` is True."""
    assert isinstance(item, dict)
    for field in fields:
        v = item.get(field)
        if v is None:
            if mandatory:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a{} string (it is a "
                    "mandatory field): {}".format(
                        field,
                        "" if empty_ok else " non-empty",
                        json.dumps(item, sort_keys=True),
                    ),
                )
            continue
        if not isinstance(v, str):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a{} string: {}".format(
                    field,
                    "" if empty_ok else " non-empty",
                    json.dumps(item, sort_keys=True),
                ),
            )
        if not v and not empty_ok:
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should contain a non-empty string: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )


def check_dict_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> bool:
    """Checks that each listed field, if present, is a list of dicts."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of dicts: {}".format(
                    field, json.dumps(lst, sort_keys=True)
                ),
            )
            return False
        for x in lst:
            if not isinstance(x, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of dicts: {}".format(
                        field, json.dumps(lst, sort_keys=True)
                    ),
                )
                return False
    return True


def check_str_list_fields(
    wxr: WiktextractContext,
    dt: dict,
    word: str,
    lang: str,
    pos: str,
    item: dict,
    fields: list[str],
) -> None:
    """Checks that each of the listed fields, if present, contains a list
    of non-empty strings."""
    assert isinstance(item, dict)
    for field in fields:
        lst = item.get(field)
        if lst is None:
            continue
        if not isinstance(lst, (list, tuple)):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                "{!r} should be a list of strings: {}".format(
                    field, json.dumps(item, sort_keys=True)
                ),
            )
            continue
        for x in lst:
            if not isinstance(x, str) or not x:
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should be a list of non-empty strings: {}".format(
                        field, json.dumps(item, sort_keys=True)
                    ),
                )
                break


def check_json_data(wxr: WiktextractContext, dt: dict) -> None:
    """Performs some basic checks on the generated data."""
    word = dt.get("word", dt.get("title"))
    if word is None:
        check_error(
            wxr,
            dt,
            None,
            None,
            None,
            'missing "word" or "title" field in data',
        )
        return
    if "title" in dt:
        return  # redirect pages don't have the following fields
    lang = dt.get("lang")
    if not lang:
        check_error(wxr, dt, word, None, None, 'missing "lang" field in data')
        return
    pos = dt.get("pos")
    if not pos:
        check_error(wxr, dt, word, lang, pos, 'missing "pos" field in data')
        return
    if not dt.get("lang_code"):
        check_error(
            wxr, dt, word, lang, pos, 'missing "lang_code" field in data'
        )
    check_tags(wxr, dt, word, lang, pos, dt)
    check_str_fields(wxr, dt, word, lang, pos, dt, ["etymology_text"])
    num = dt.get("etymology_number")
    if num is not None and not isinstance(num, int):
        check_error(
            wxr, dt, word, lang, pos, '"etymology_number" must be an int'
        )
    # Check that certain fields, if present, contain lists of dicts
    if not check_dict_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        [
            "forms",
            "senses",
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
            "sounds",
            "translations",
            "descendants",
            "etymology_templates",
            "head_templates",
            "inflection_templates",
        ],
    ):
        return  # Avoid further processing because it would cause type errors
    # Check the "forms" field
    forms = dt.get("forms") or []
    for form in forms:
        check_tags(wxr, dt, word, lang, pos, form)
        tags = dt.get("tags")
        if not isinstance(tags, (list, tuple)) or "table-tags" not in tags:
            check_str_fields(
                wxr, dt, word, lang, pos, form, ["form"], mandatory=True
            )
    check_str_list_fields(
        wxr,
        dt,
        word,
        lang,
        pos,
        dt,
        ["categories", "topics", "wikidata", "wikipedia"],
    )
    # Check the "senses" field
    senses = dt.get("senses") or []
    if not senses:
        check_error(
            wxr,
            dt,
            word,
            lang,
            pos,
            'missing "senses" in data (must have at least one '
            'sense; add an empty sense with a "no-gloss" tag if none is '
            "otherwise available)",
        )
        return
    for sense in senses:
        check_str_list_fields(
            wxr, dt, word, lang, pos, sense, ["glosses", "raw_glosses"]
        )
        # Extra check: should have a "no-gloss" tag if there are no glosses
        for field in ("glosses", "raw_glosses"):
            glosses = sense.get(field) or []
            if (
                not glosses
                and "no-gloss" not in (sense.get("tags") or [])
            ):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    "{!r} should have at least one gloss or "
                    '"no-gloss" in "tags"'.format(field),
                )
                continue
        check_tags(wxr, dt, word, lang, pos, sense)
        check_str_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            ["categories", "topics", "wikidata", "wikipedia"],
        )
        check_str_fields(wxr, dt, word, lang, pos, sense, ["english"])
        if not check_dict_list_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            sense,
            [
                "alt_of",
                "form_of",
                "synonyms",
                "antonyms",
                "hypernyms",
                "holonyms",
                "meronyms",
                "coordinate_terms",
                "derived",
                "related",
            ],
        ):
            continue
        for field in ("alt_of", "form_of"):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["extra"], mandatory=False
                )

        for field in (
            "synonyms",
            "antonyms",
            "hypernyms",
            "holonyms",
            "meronyms",
            "coordinate_terms",
            "derived",
            "related",
        ):
            lst = sense.get(field)
            if lst is None:
                continue
            for item in lst:
                check_str_fields(
                    wxr, dt, word, lang, pos, item, ["word"], mandatory=True
                )
                check_tags(wxr, dt, word, lang, pos, item)
                check_str_fields(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    item,
                    ["english", "roman", "sense", "taxonomic"],
                    mandatory=False,
                    empty_ok=True,
                )
                check_str_list_fields(
                    wxr, dt, word, lang, pos, item, ["topics"]
                )
    # Check the "sounds" field
    # We permit any number of different types (ipa, enpr, etc.) in the same
    # sound entry or in different sound entries.
    sounds = dt.get("sounds") or []
    for item in sounds:
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            ["ipa", "enpr", "audio", "ogg_url", "mp3_url", "audio-ipa", "text"],
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_list_fields(
            wxr, dt, word, lang, pos, item, ["homophones", "hyphenation"]
        )
    # Check the "translations" field
    translations = dt.get("translations") or []
    for item in translations:
        check_str_fields(
            wxr, dt, word, lang, pos, item, ["word"], mandatory=True
        )
        check_tags(wxr, dt, word, lang, pos, item)
        check_str_fields(
            wxr,
            dt,
            word,
            lang,
            pos,
            item,
            [
                "alt",
                "code",
                "english",
                "lang",
                "note",
                "roman",
                "sense",
                "taxonomic",
            ],
        )
        if not item.get("code") and not item.get("lang"):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"translations" items must contain at least one '
                'of "code" and "lang" (normally both): {}'.format(
                    json.dumps(item, sort_keys=True, ensure_ascii=False)
                ),
            )
    # Check the "etymology_templates", "head_templates", and
    # "inflection_templates" fields
    for field in [
        "etymology_templates",
        "head_templates",
        "inflection_templates",
    ]:
        lst = dt.get(field)
        if lst is None:
            continue
        for item in lst:
            check_str_fields(
                wxr, dt, word, lang, pos, item, ["name"], mandatory=True
            )
            check_str_fields(
                wxr,
                dt,
                word,
                lang,
                pos,
                item,
                ["expansion"],
                # empty_ok=True because there are some templates
                # that generate empty expansions.
                mandatory=False,
                empty_ok=True,
            )
            args = item.get("args")
            if args is None:
                continue
            if not isinstance(args, dict):
                check_error(
                    wxr,
                    dt,
                    word,
                    lang,
                    pos,
                    '{!r} item "args" value must be a dict: {}'.format(
                        field, json.dumps(args, sort_keys=True)
                    ),
                )
                continue
            for k, v in args.items():
                if not isinstance(k, str) or not isinstance(v, str):
                    check_error(
                        wxr,
                        dt,
                        word,
                        lang,
                        pos,
                        '{!r} item "args" must be a dict with '
                        "string keys and values: {}".format(
                            field, json.dumps(args, sort_keys=True)
                        ),
                    )
                    continue
    # Check the "descendants" field
    descendants = dt.get("descendants") or []
    for item in descendants:
        check_str_fields(wxr, dt, word, lang, pos, item, ["text"])
        depth = item.get("depth")
        if depth is not None and not isinstance(depth, int):
            check_error(
                wxr,
                dt,
                word,
                lang,
                pos,
                '"descendants" field "depth" must be an int',
            )
        check_dict_list_fields(wxr, dt, word, lang, pos, item, ["templates"])
        # XXX should check that they are valid templates; perhaps turn the
        # template checking code above into a function
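
# A minimal record that passes the checks above (field values assumed for
# illustration): a word entry needs "word", "lang", "lang_code", "pos", and
# at least one sense:
#
#   check_json_data(wxr, {
#       "word": "color",
#       "lang": "English",
#       "lang_code": "en",
#       "pos": "noun",
#       "senses": [{"glosses": ["a visual attribute of things"]}],
#   })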


def reprocess_wiktionary(
    wxr: WiktextractContext,
    num_processes: int | None,
    out_f: TextIO,
    human_readable: bool = False,
    search_pattern: str | None = None,
) -> None:
    """Reprocesses the Wiktionary from the sqlite db."""
    logger.info("Second phase - processing pages")

    # Extract thesaurus data. This iterates over thesaurus pages,
    # but is very fast.
    if (
        wxr.config.extract_thesaurus_pages
        and thesaurus_linkage_number(wxr.thesaurus_db_conn) == 0  # type: ignore[arg-type]
    ):
        extract_thesaurus_data(wxr, num_processes)

    emitted = set()
    process_ns_ids: list[int] = list(
        {
            wxr.wtp.NAMESPACE_DATA.get(ns, {}).get("id", 0)  # type: ignore[call-overload]
            for ns in wxr.config.extract_ns_names
        }
    )
    start_time = time.time()
    last_time = start_time
    all_page_nums = wxr.wtp.saved_page_nums(
        process_ns_ids, True, "wikitext", search_pattern
    )
    wxr.remove_unpicklable_objects()
    with Pool(num_processes, init_worker_process, (page_handler, wxr)) as pool:
        wxr.reconnect_databases(False)
        for processed_pages, (page_data, wtp_stats) in enumerate(
            pool.imap_unordered(
                page_handler,
                wxr.wtp.get_all_pages(
                    process_ns_ids, True, "wikitext", search_pattern
                ),
            )
        ):
            wxr.config.merge_return(wtp_stats)
            for dt in page_data:
                check_json_data(wxr, dt)
                write_json_data(dt, out_f, human_readable)
                word = dt.get("word")
                lang_code = dt.get("lang_code")
                pos = dt.get("pos")
                if word and lang_code and pos:
                    emitted.add((word, lang_code, pos))
            last_time = estimate_progress(
                processed_pages, all_page_nums, start_time, last_time
            )
    if wxr.config.dump_file_lang_code == "en":
        emit_words_in_thesaurus(wxr, emitted, out_f, human_readable)
    logger.info("Reprocessing wiktionary complete")


def process_ns_page_title(page: Page, ns_name: str) -> tuple[str, str]:
    text: str = page.body if page.body is not None else page.redirect_to  # type: ignore[assignment]
    title = page.title[page.title.find(":") + 1 :]
    title = re.sub(r"(^|/)\.($|/)", r"\1__dot__\2", title)
    title = re.sub(r"(^|/)\.\.($|/)", r"\1__dotdot__\2", title)
    title = title.replace("//", "__slashslash__")
    title = re.sub(r"^/", r"__slash__", title)
    title = re.sub(r"/$", r"__slash__", title)
    title = ns_name + "/" + title
    return title, text
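
# Illustrative example (title assumed): the substitutions above escape path
# components that would be unsafe or ambiguous as tar member names. A page
# "Module:a//b/." in namespace "Module" becomes
# "Module/a__slashslash__b/__dot__".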


def extract_namespace(
    wxr: WiktextractContext, namespace: str, path: str
) -> None:
    """Extracts all pages in the given namespace and writes them to a .tar
    file with the given path."""
    logger.info(
        f"Extracting pages from namespace {namespace} to tar file {path}"
    )
    ns_id: int = wxr.wtp.NAMESPACE_DATA.get(namespace, {}).get("id")  # type: ignore[assignment, call-overload]
    t = time.time()
    with tarfile.open(path, "w") as tarf:
        for page in wxr.wtp.get_all_pages([ns_id]):
            title, text = process_ns_page_title(page, namespace)
            text = text.encode("utf-8")
            f = io.BytesIO(text)
            title += ".txt"
            ti = tarfile.TarInfo(name=title)
            ti.size = len(text)
            # According to the documentation, TarInfo.mtime can be int, float,
            # or even None in newer versions, but mypy can't tell because
            # it's not annotated and assumes it can only be int
            ti.mtime = t  # type: ignore[assignment]
            ti.uid = 0
            ti.gid = 0
            ti.type = tarfile.REGTYPE
            tarf.addfile(ti, f)
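
# Reading the archive back (sketch, file name assumed): each page is stored
# as a UTF-8 .txt member, so the extraction can be verified with the stdlib
# tarfile module alone:
#
#   with tarfile.open("templates.tar") as tarf:
#       for member in tarf.getmembers():
#           body = tarf.extractfile(member).read().decode("utf-8")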