Coverage for src/wiktextract/config.py: 96%

63 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

# Definition of the configuration object for Wiktionary data extraction.
# The same object is also used for collecting statistics.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org

import collections
import json
from importlib.resources import files
from typing import (
    Iterable,
    Optional,
    TypedDict,
)

from wikitextprocessor.core import (
    CollatedErrorReturnData,
    ErrorMessageData,
    HTMLTagData,
)

SoundFileRedirects = dict[str, str]

POSSubtitleData = TypedDict(
    "POSSubtitleData",
    {
        "pos": str,
        "debug": str,
        "tags": list[str],
        "error": str,  # not used
        "warning": str,  # not used
    },
    total=False,
)
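
For illustration, a POSSubtitleData value is an ordinary dict in which every key is optional (total=False). A minimal sketch with invented values:

# Hypothetical example value; the strings below are invented for illustration.
example_subtitle: POSSubtitleData = {
    "pos": "noun",
    "tags": ["plural"],
    "debug": "unrecognized subtitle treated as a noun section",
}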

class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing."""

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_language_codes is None or isinstance(
            capture_language_codes, (list, tuple, set)
        )
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        self.load_edition_settings()
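
As a usage sketch (not part of config.py itself), the constructor is normally called with keyword arguments; capture_language_codes is a list, tuple, or set of language codes (or None), and each capture_* flag is a plain boolean:

# Hypothetical usage; requires the wiktextract package to be installed so that
# files("wiktextract") resolves to its bundled data directory.
config = WiktionaryConfig(
    dump_file_lang_code="en",
    capture_language_codes={"en", "mul"},
    capture_etymologies=False,
)
print(config.data_folder)          # .../wiktextract/data/en
print(config.capture_etymologies)  # False, as passed above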

    def merge_return(self, ret: CollatedErrorReturnData):
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages.  Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret.get("errors", []))
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret.get("warnings", []))
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret.get("debugs", []))
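
Continuing the sketch above: merge_return() only reads the "errors", "warnings", and "debugs" keys of the per-batch result and appends them to the corresponding lists until roughly 100_000 / 100_000 / 3_000_000 entries have accumulated. The worker_result value here is hypothetical and shows only the keys the method actually reads:

# Hypothetical per-batch result from a worker process; in real use the lists
# hold ErrorMessageData dicts collected by wikitextprocessor.
worker_result = {"errors": [], "warnings": [], "debugs": []}
config.merge_return(worker_result)
print(len(config.errors))  # still 0 for this empty batch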

    def load_edition_settings(self) -> None:
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)
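
load_edition_settings() copies every top-level key of data/<dump_file_lang_code>/config.json onto the instance with setattr(), so a per-edition file can override any attribute named in __slots__. A sketch of the effect, continuing the example above; the file contents are invented, not the shipped config.json:

# Hypothetical edition settings (invented for illustration); the real files
# live under wiktextract/data/<lang>/config.json.
edition_json = (
    '{"extract_thesaurus_pages": true,'
    ' "save_ns_names": ["Main", "Template", "Module", "Appendix"]}'
)

# Apply them the same way load_edition_settings() does: one setattr per key.
for key, value in json.loads(edition_json).items():
    setattr(config, key, value)

print(config.extract_thesaurus_pages)  # True, overriding the False default
print(config.save_ns_names)            # ["Main", "Template", "Module", "Appendix"]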