Coverage for src / wiktextract / config.py: 94%

70 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-03-26 08:59 +0000

1# Definition of the configuration object for Wiktionary data extraction. 

2# The same object is also used for collecting statistics. 

3# 

4# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org 

5 

6import collections 

7import json 

8from importlib.resources import files 

9from typing import ( 

10 Iterable, 

11 Optional, 

12 TypedDict, 

13) 

14 

15from wikitextprocessor.core import ( 

16 CollatedErrorReturnData, 

17 ErrorMessageData, 

18 HTMLTagData, 

19) 

20 

# Maps a sound file name to the name of the file it redirects to.
SoundFileRedirects = dict[str, str]


class POSSubtitleData(TypedDict, total=False):
    """Data describing a part-of-speech section subtitle.

    Every key is optional (``total=False``); entries typically carry at
    least ``pos`` and may add ``tags`` and a ``debug`` note.
    """

    pos: str
    debug: str
    tags: list[str]
    error: str  # not used
    warning: str  # not used
    note: str  # not used
    wiki_notices: str  # not used

36 

37 

class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing.

    The same object is also used for collecting statistics and for
    accumulating error/warning/note/debug messages while pages are
    being processed.
    """

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
        "notes",
        "wiki_notices",
        "linktrailing_regex_pattern",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        """Initialize extraction configuration.

        dump_file_lang_code: language code of the Wiktionary edition
            being processed (e.g. "en"); also selects the data folder.
        capture_language_codes: language codes whose entries should be
            captured, or None to capture all languages.
        capture_*: boolean flags enabling extraction of the
            corresponding kind of data.
        verbose: enable verbose output.
        expand_tables: expand wikitext tables during parsing.
        """
        # NOTE: the {"en", "mul"} default is a shared mutable default
        # argument; it is only safe because it is never mutated.
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.notes: list[ErrorMessageData] = []
        self.wiki_notices: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        self.linktrailing_regex_pattern: str | None = None
        self.load_edition_settings()

    def merge_return(self, ret: CollatedErrorReturnData) -> None:
        """Merge error/warning/note/debug messages collected by a worker
        back into this configuration object.

        Each destination list is capped so unbounded runs do not grow
        memory without limit; once a cap is reached, further messages of
        that kind are dropped.
        """
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        #     for k, v in ret["language_counts"].items():
        #         self.language_counts[k] += v
        #     for k, v in ret["pos_counts"].items():
        #         self.pos_counts[k] += v
        #     for k, v in ret["section_counts"].items():
        #         self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret.get("errors", []))
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret.get("warnings", []))
        # BUG FIX: the two branches below previously extended from
        # ret["warnings"], duplicating warnings into notes/wiki_notices
        # and silently discarding the actual notes and wiki_notices.
        if "notes" in ret and len(self.notes) < 100_000:
            self.notes.extend(ret.get("notes", []))
        if "wiki_notices" in ret and len(self.wiki_notices) < 100_000:
            self.wiki_notices.extend(ret.get("wiki_notices", []))
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret.get("debugs", []))

    def load_edition_settings(self) -> None:
        """Load per-edition overrides from data/<lang>/config.json.

        Each top-level key in the JSON object is set as an attribute on
        this object (keys must match names in __slots__). A missing
        file simply means there are no overrides for this edition.
        """
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)