Coverage for src/wiktextract/config.py: 94%

69 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

# Definition of the configuration object for Wiktionary data extraction.
# The same object is also used for collecting statistics.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org

import collections
import json
from importlib.resources import files
from typing import (
    Iterable,
    Optional,
    TypedDict,
)

from wikitextprocessor.core import (
    CollatedErrorReturnData,
    ErrorMessageData,
    HTMLTagData,
)

SoundFileRedirects = dict[str, str]

POSSubtitleData = TypedDict(
    "POSSubtitleData",
    {
        "pos": str,
        "debug": str,
        "tags": list[str],
        "error": str,  # not used
        "warning": str,  # not used
        "note": str,  # not used
        "wiki_notices": str,  # not used
    },
    total=False,
)
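# Illustrative sketch (an assumption, not data defined in this module): a value
# of the POSSubtitleData TypedDict above might look like
#     {"pos": "noun", "tags": ["plural"], "debug": "nonstandard POS heading"}
# Every key is optional because the TypedDict is declared with total=False.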



class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing."""

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
        "notes",
        "wiki_notices",
    )


    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_language_codes is None or isinstance(
            capture_language_codes, (list, tuple, set)
        )
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.notes: list[ErrorMessageData] = []
        self.wiki_notices: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        self.load_edition_settings()
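    # Construction sketch (illustrative; the argument values below are
    # assumptions, not requirements of the class):
    #     config = WiktionaryConfig(
    #         dump_file_lang_code="en",
    #         capture_language_codes=["en", "mul"],
    #         expand_tables=True,
    #     )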


    def merge_return(self, ret: CollatedErrorReturnData):
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret.get("errors", []))
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret.get("warnings", []))
        if "notes" in ret and len(self.notes) < 100_000:
            self.notes.extend(ret.get("notes", []))
        if "wiki_notices" in ret and len(self.wiki_notices) < 100_000:
            self.wiki_notices.extend(ret.get("wiki_notices", []))
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret.get("debugs", []))
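    # Usage sketch (assumed caller behavior, not defined in this module): after
    # a worker finishes a batch of pages, the collated error data returned by
    # wikitextprocessor would be folded back into the main process's config:
    #     config.merge_return(collated)  # collated: CollatedErrorReturnData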


    def load_edition_settings(self) -> None:
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)
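# load_edition_settings() copies top-level JSON keys onto the config object, so
# a hypothetical data/<edition>/config.json (contents assumed, not shown in this
# module) could override attributes declared in __slots__, for example:
#     {"save_ns_names": ["Main", "Template", "Module"],
#      "extract_ns_names": ["Main"]}
# Keys that are not valid attribute names would raise AttributeError, since
# __slots__ prevents setting arbitrary attributes.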