Coverage for src/wiktextract/config.py: 96%
62 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1# Definition of the configuration object for Wiktionary data extraction.
2# The same object is also used for collecting statistics.
3#
4# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org
6import collections
7import json
8from importlib.resources import files
9from typing import (
10 Iterable,
11 Optional,
12 TypedDict,
13)
15from wikitextprocessor.core import (
16 CollatedErrorReturnData,
17 ErrorMessageData,
18 HTMLTagData,
19)
# Alias for a plain str -> str mapping; WiktionaryConfig.redirects is
# annotated with it.
SoundFileRedirects = dict[str, str]


# Partial (total=False) dictionary: every key below is optional.
class POSSubtitleData(TypedDict, total=False):
    pos: str
    debug: str
    tags: list[str]
    error: str  # not used
    warning: str  # not used
class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing.

    Besides the ``capture_*`` switches passed to the constructor, the object
    also carries counters intended for statistics and the error, warning and
    debug messages collected through ``merge_return()``.  Attributes are
    restricted to the names in ``__slots__``.
    """

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations: bool = True,
        capture_pronunciation: bool = True,
        capture_linkages: bool = True,
        capture_compounds: bool = True,
        capture_redirects: bool = True,
        capture_examples: bool = True,
        capture_etymologies: bool = True,
        capture_inflections: bool = True,
        capture_descendants: bool = True,
        verbose: bool = False,
        expand_tables: bool = False,
    ) -> None:
        """Store the extraction switches and initialise the bookkeeping fields.

        ``dump_file_lang_code`` selects the edition data folder
        (``wiktextract/data/<code>``).  ``capture_language_codes`` must be
        ``None`` or a list, tuple or set of strings (asserted below, even
        though the annotation is the wider ``Iterable``); ``None`` presumably
        means "do not filter by language" - confirm against the callers.

        The constructor finishes by calling ``load_edition_settings()``, so
        values from the edition's ``config.json`` may override anything set
        here.
        """
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
            if isinstance(capture_language_codes, set):
                # The default argument is a single set object created when
                # the function is defined.  Keep a private copy so that
                # mutating one config's language codes cannot leak into
                # other instances (or into the default itself).
                capture_language_codes = set(capture_language_codes)
        # NOTE: capture_examples, capture_inflections, capture_descendants,
        # verbose and expand_tables have never been validated here; that is
        # left unchanged so existing callers keep working.
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        # Per-edition data directory inside the installed wiktextract package.
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        # Must run last: it may overwrite any of the attributes set above.
        self.load_edition_settings()

    def merge_return(self, ret: CollatedErrorReturnData):
        """Append the messages carried by *ret* to this object's lists.

        Only the ``"errors"``, ``"warnings"`` and ``"debugs"`` keys are
        consulted.  Each list stops accepting new batches once it already
        holds 100,000 (errors, warnings) or 3,000,000 (debugs) entries; the
        limit is checked before extending, so the final batch may push the
        list past it.
        """
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret["errors"])
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret["warnings"])
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret["debugs"])

    def load_edition_settings(self) -> None:
        """Apply overrides from ``config.json`` in ``self.data_folder``.

        Every top-level key of the JSON object is assigned as an attribute
        of the same name.  Because the class defines ``__slots__``, a key
        that is not a slot name makes ``setattr`` raise ``AttributeError``.
        A missing file is not an error; the defaults are simply kept.
        """
        file_path = self.data_folder / "config.json"
        # is_file() is part of the importlib.resources Traversable
        # interface, whereas exists() is only available on some concrete
        # implementations such as pathlib.Path.
        if file_path.is_file():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)