Coverage for src/wiktextract/config.py: 96% (63 statements), coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
# Definition of the configuration object for Wiktionary data extraction.
# The same object is also used for collecting statistics.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org

import collections
import json
from importlib.resources import files
from typing import (
    Iterable,
    Optional,
    TypedDict,
)

from wikitextprocessor.core import (
    CollatedErrorReturnData,
    ErrorMessageData,
    HTMLTagData,
)

SoundFileRedirects = dict[str, str]

POSSubtitleData = TypedDict(
    "POSSubtitleData",
    {
        "pos": str,
        "debug": str,
        "tags": list[str],
        "error": str,  # not used
        "warning": str,  # not used
    },
    total=False,
)
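# A minimal sketch (hypothetical values, not taken from any real page) of a
# POSSubtitleData entry as used for part-of-speech section headings; every
# key is optional because the TypedDict is declared with total=False:
#
#     {"pos": "noun", "tags": ["plural-only"], "debug": "nonstandard heading"}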

class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing."""

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_language_codes is None or isinstance(
            capture_language_codes, (list, tuple, set)
        )
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        self.load_edition_settings()
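    # Usage sketch (illustrative only; nothing in this module runs it): a
    # typical configuration for extracting English and translingual entries
    # from the English edition dump could be built as
    #
    #     config = WiktionaryConfig(
    #         dump_file_lang_code="en",
    #         capture_language_codes=["en", "mul"],
    #     )
    #
    # capture_language_codes may also be passed as None, which this
    # constructor accepts and downstream code treats as "no language filter".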
    def merge_return(self, ret: CollatedErrorReturnData):
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret.get("errors", []))
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret.get("warnings", []))
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret.get("debugs", []))
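    # Shape sketch for merge_return() (values are hypothetical): the collated
    # data gathered from a batch of pages carries lists of ErrorMessageData
    # dicts under the keys used above, e.g.
    #
    #     config.merge_return({"errors": [], "warnings": [], "debugs": []})
    #
    # New messages are simply appended; once a list already holds 100_000
    # entries (3_000_000 for debugs), further batches for that key are dropped.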
    def load_edition_settings(self) -> None:
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)
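
# Hypothetical example of a per-edition wiktextract/data/<lang_code>/config.json
# (the keys actually shipped with wiktextract may differ): because
# load_edition_settings() setattr()s each top-level key onto a __slots__
# instance, every key must match an attribute name listed in __slots__, e.g.
#
#     {
#         "extract_thesaurus_pages": true,
#         "save_ns_names": ["Main", "Template", "Module"],
#         "extract_ns_names": ["Main"]
#     }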