Coverage for src/wiktextract/config.py: 96%

63 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

# Definition of the configuration object for Wiktionary data extraction.
# The same object is also used for collecting statistics.
#
# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org

import collections
import json
from importlib.resources import files
from typing import (
    Iterable,
    Optional,
    TypedDict,
)

from wikitextprocessor.core import (
    CollatedErrorReturnData,
    ErrorMessageData,
    HTMLTagData,
)

SoundFileRedirects = dict[str, str]

POSSubtitleData = TypedDict(
    "POSSubtitleData",
    {
        "pos": str,
        "debug": str,
        "tags": list[str],
        "error": str,  # not used
        "warning": str,  # not used
    },
    total=False,
)
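
For illustration, a POSSubtitleData value is an ordinary dict in which every key is optional (total=False). A minimal sketch with invented values:

# Hypothetical example value; the strings below are invented for illustration.
example_subtitle: POSSubtitleData = {
    "pos": "noun",
    "tags": ["plural"],
    "debug": "unrecognized subtitle treated as a noun section",
}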

class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing."""

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
        "parser_function_aliases",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
        assert capture_language_codes is None or isinstance(
            capture_language_codes, (list, tuple, set)
        )
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        self.parser_function_aliases: dict[str, str] = {}
        self.load_edition_settings()
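
As a usage sketch (not part of config.py itself), the constructor is normally called with keyword arguments; capture_language_codes is a list, tuple, or set of language codes (or None), and each capture_* flag is a plain boolean:

# Hypothetical usage; requires the wiktextract package to be installed so that
# files("wiktextract") resolves to its bundled data directory.
config = WiktionaryConfig(
    dump_file_lang_code="en",
    capture_language_codes={"en", "mul"},
    capture_etymologies=False,
)
print(config.data_folder)          # .../wiktextract/data/en
print(config.capture_etymologies)  # False, as passed above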

    def merge_return(self, ret: CollatedErrorReturnData):
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages.  Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        if "errors" in ret and len(self.errors) < 100_000:
            self.errors.extend(ret.get("errors", []))
        if "warnings" in ret and len(self.warnings) < 100_000:
            self.warnings.extend(ret.get("warnings", []))
        if "debugs" in ret and len(self.debugs) < 3_000_000:
            self.debugs.extend(ret.get("debugs", []))
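
Continuing the sketch above: merge_return() only reads the "errors", "warnings", and "debugs" keys of the per-batch result and appends them to the corresponding lists until roughly 100_000 / 100_000 / 3_000_000 entries have accumulated. The worker_result value here is hypothetical and shows only the keys the method actually reads:

# Hypothetical per-batch result from a worker process; in real use the lists
# hold ErrorMessageData dicts collected by wikitextprocessor.
worker_result = {"errors": [], "warnings": [], "debugs": []}
config.merge_return(worker_result)
print(len(config.errors))  # still 0 for this empty batch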

    def load_edition_settings(self) -> None:
        file_path = self.data_folder / "config.json"
        if file_path.exists():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)
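
load_edition_settings() copies every top-level key of data/<dump_file_lang_code>/config.json onto the instance with setattr(), so a per-edition file can override any attribute named in __slots__. A sketch of the effect, continuing the example above; the file contents are invented, not the shipped config.json:

# Hypothetical edition settings (invented for illustration); the real files
# live under wiktextract/data/<lang>/config.json.
edition_json = (
    '{"extract_thesaurus_pages": true,'
    ' "save_ns_names": ["Main", "Template", "Module", "Appendix"]}'
)

# Apply them the same way load_edition_settings() does: one setattr per key.
for key, value in json.loads(edition_json).items():
    setattr(config, key, value)

print(config.extract_thesaurus_pages)  # True, overriding the False default
print(config.save_ns_names)            # ["Main", "Template", "Module", "Appendix"]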