Coverage for src/wiktextract/config.py: 94%
69 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1# Definition of the configuration object for Wiktionary data extraction.
2# The same object is also used for collecting statistics.
3#
4# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org
6import collections
7import json
8from importlib.resources import files
9from typing import (
10 Iterable,
11 Optional,
12 TypedDict,
13)
15from wikitextprocessor.core import (
16 CollatedErrorReturnData,
17 ErrorMessageData,
18 HTMLTagData,
19)
# Alias for a plain str -> str mapping; used as the type of
# WiktionaryConfig.redirects.
SoundFileRedirects = dict[str, str]


class POSSubtitleData(TypedDict, total=False):
    """Typed dictionary for part-of-speech subtitle data.

    Declared with ``total=False``, so every key is optional.
    """

    pos: str
    debug: str
    tags: list[str]
    error: str  # not used
    warning: str  # not used
    note: str  # not used
    wiki_notices: str  # not used
class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing.

    Besides the capture/extraction switches, the same object carries
    statistics counters and the error, warning, note, wiki-notice and
    debug message lists collected while processing pages.
    """

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
        "notes",
        "wiki_notices",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        # NOTE(review): this default set object is created once and shared
        # by every instance that relies on the default; treat it as
        # read-only.
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations: bool = True,
        capture_pronunciation: bool = True,
        capture_linkages: bool = True,
        capture_compounds: bool = True,
        capture_redirects: bool = True,
        capture_examples: bool = True,
        capture_etymologies: bool = True,
        capture_inflections: bool = True,
        capture_descendants: bool = True,
        verbose: bool = False,
        expand_tables: bool = False,
    ):
        """Store the given options, reset statistics and message lists,
        then apply edition-specific settings from the data folder.

        ``capture_language_codes`` must be None or a list, tuple or set
        of strings.  ``dump_file_lang_code`` selects the sub-folder of the
        packaged ``data`` directory that is used as ``data_folder``.
        """
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.notes: list[ErrorMessageData] = []
        self.wiki_notices: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        # Must run last: values from config.json override the defaults
        # assigned above.
        self.load_edition_settings()

    def merge_return(self, ret: "CollatedErrorReturnData") -> None:
        """Append the message lists found in ``ret`` to this object's lists.

        Each of "errors", "warnings", "notes", "wiki_notices" and "debugs"
        is merged from the key of the same name.  A list stops accepting
        new batches once it has reached its cap; the cap is checked before
        extending, so the final batch may push a list past it.
        """
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        caps = (
            ("errors", 100_000),
            ("warnings", 100_000),
            ("notes", 100_000),
            ("wiki_notices", 100_000),
            ("debugs", 3_000_000),
        )
        for key, cap in caps:
            collected = getattr(self, key)
            # Read from the key matching the destination list; notes and
            # wiki notices were previously filled from "warnings".
            if key in ret and len(collected) < cap:
                collected.extend(ret[key])

    def load_edition_settings(self) -> None:
        """Apply overrides from ``config.json`` in ``data_folder``, if present.

        Every top-level key of the JSON object is assigned as an attribute
        of the same name.  Because the class defines ``__slots__``, a key
        that is not a declared slot raises AttributeError.
        """
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)