Coverage for src/wiktextract/config.py: 96%

62 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Definition of the configuration object for Wiktionary data extraction. 

2# The same object is also used for collecting statistics. 

3# 

4# Copyright (c) 2018-2022, 2024 Tatu Ylonen. See file LICENSE or https://ylonen.org 

5 

6import collections 

7import json 

8from importlib.resources import files 

9from typing import ( 

10 Iterable, 

11 Optional, 

12 TypedDict, 

13) 

14 

15from wikitextprocessor.core import ( 

16 CollatedErrorReturnData, 

17 ErrorMessageData, 

18 HTMLTagData, 

19) 

20 

# Alias for a plain string-to-string mapping; used as the type of
# WiktionaryConfig.redirects.
# NOTE(review): presumably maps a sound-file name to its redirect target --
# confirm against the code that fills it.
SoundFileRedirects = dict[str, str]


class POSSubtitleData(TypedDict, total=False):
    """Typed dictionary with optional part-of-speech subtitle fields.

    ``total=False`` makes every key optional, so a value may carry any
    subset of the keys below.
    """

    pos: str
    debug: str
    tags: list[str]
    error: str  # not used
    warning: str  # not used

34 

35 

class WiktionaryConfig:
    """This class holds configuration data for Wiktionary parsing.

    The same object also carries run statistics (page/language/POS/section
    counters) and the error, warning and debug messages collected so far.
    Instances use ``__slots__``, so only the attributes listed there can be
    set -- including by ``load_edition_settings``.
    """

    __slots__ = (
        "dump_file_lang_code",
        "capture_language_codes",
        "capture_translations",
        "capture_pronunciation",
        "capture_linkages",
        "capture_compounds",
        "capture_redirects",
        "capture_examples",
        "capture_etymologies",
        "capture_inflections",
        "capture_descendants",
        "expand_tables",
        "verbose",
        "num_pages",
        "language_counts",
        "pos_counts",
        "section_counts",
        "word",
        "errors",
        "warnings",
        "debugs",
        "redirects",
        "data_folder",
        "extract_thesaurus_pages",
        "save_ns_names",
        "extract_ns_names",
        "allowed_html_tags",
    )

    def __init__(
        self,
        dump_file_lang_code: str = "en",
        capture_language_codes: Optional[Iterable[str]] = {"en", "mul"},
        capture_translations=True,
        capture_pronunciation=True,
        capture_linkages=True,
        capture_compounds=True,
        capture_redirects=True,
        capture_examples=True,
        capture_etymologies=True,
        capture_inflections=True,
        capture_descendants=True,
        verbose=False,
        expand_tables=False,
    ):
        """Store the options, reset statistics and load edition settings.

        Args:
            dump_file_lang_code: Language code of the dump; also selects
                the ``data/<code>`` folder inside the ``wiktextract``
                package that ``load_edition_settings`` reads from.
            capture_language_codes: ``None``, or a list, tuple or set of
                strings.  A shallow copy is stored, so neither the shared
                default set nor the caller's container is aliased.
                NOTE(review): ``None`` presumably means "do not filter by
                language" -- confirm with the consumers of this attribute.
            capture_translations, capture_pronunciation, capture_linkages,
            capture_compounds, capture_redirects, capture_etymologies:
                Boolean switches; asserted to be ``True`` or ``False``.
            capture_examples, capture_inflections, capture_descendants,
            verbose, expand_tables: Stored as given, without validation.

        Raises:
            AssertionError: If a validated argument has the wrong type
                (only when assertions are enabled).
        """
        if capture_language_codes is not None:
            assert isinstance(capture_language_codes, (list, tuple, set))
            for x in capture_language_codes:
                assert isinstance(x, str)
            if not isinstance(capture_language_codes, tuple):
                # The default value is a single set object shared by every
                # call; copy it so that mutating one config's codes cannot
                # leak into later instances.  Tuples are immutable and can
                # be stored as they are.
                capture_language_codes = capture_language_codes.copy()
        assert capture_translations in (True, False)
        assert capture_pronunciation in (True, False)
        assert capture_linkages in (True, False)
        assert capture_compounds in (True, False)
        assert capture_redirects in (True, False)
        assert capture_etymologies in (True, False)
        self.dump_file_lang_code = dump_file_lang_code
        self.capture_language_codes = capture_language_codes
        self.capture_translations = capture_translations
        self.capture_pronunciation = capture_pronunciation
        self.capture_linkages = capture_linkages
        self.capture_compounds = capture_compounds
        self.capture_redirects = capture_redirects
        self.capture_examples = capture_examples
        self.capture_etymologies = capture_etymologies
        self.capture_inflections = capture_inflections
        self.capture_descendants = capture_descendants
        self.verbose = verbose
        self.expand_tables = expand_tables
        # Some fields for statistics
        self.num_pages = 0
        self.language_counts: dict[str, int] = collections.defaultdict(int)
        self.pos_counts: dict[str, int] = collections.defaultdict(int)
        self.section_counts: dict[str, int] = collections.defaultdict(int)
        # Some fields related to errors
        # The word currently being processed.
        self.word: Optional[str] = None
        self.errors: list[ErrorMessageData] = []
        self.warnings: list[ErrorMessageData] = []
        self.debugs: list[ErrorMessageData] = []
        self.redirects: SoundFileRedirects = {}
        self.data_folder = files("wiktextract") / "data" / dump_file_lang_code
        self.extract_thesaurus_pages = False
        # these namespace pages will be copied from the XML dump file and
        # saved to a SQLite db file
        self.save_ns_names = ["Main", "Template", "Module"]
        # these are extracted namespaces
        self.extract_ns_names = ["Main"]
        self.allowed_html_tags: dict[str, HTMLTagData] = {}
        # Must run last: the edition's config.json may override any of the
        # defaults assigned above.
        self.load_edition_settings()

    def merge_return(self, ret: CollatedErrorReturnData):
        """Append the messages in ``ret`` to this config's message lists.

        For each of ``"errors"``, ``"warnings"`` and ``"debugs"`` present
        in ``ret``, the whole batch is appended unless the matching list
        has already reached its limit (100,000 for errors and warnings,
        3,000,000 for debugs).  The limit is checked before extending, so
        a list may exceed it by at most one batch.
        """
        # XXX This was never properly implemented; even the only
        # count (self.section_counts) that is updated during running
        # gets discarded when doing batches instead of individual
        # pages. Search: STATISTICS_IMPLEMENTATION
        # if "num_pages" in ret:
        #     self.num_pages += ret["num_pages"]
        # for k, v in ret["language_counts"].items():
        #     self.language_counts[k] += v
        # for k, v in ret["pos_counts"].items():
        #     self.pos_counts[k] += v
        # for k, v in ret["section_counts"].items():
        #     self.section_counts[k] += v
        for key, collected, limit in (
            ("errors", self.errors, 100_000),
            ("warnings", self.warnings, 100_000),
            ("debugs", self.debugs, 3_000_000),
        ):
            if key in ret and len(collected) < limit:
                collected.extend(ret[key])

    def load_edition_settings(self) -> None:
        """Override attributes from ``config.json`` in ``self.data_folder``.

        Does nothing when the file is absent.  Otherwise every top-level
        key of the JSON object is assigned as an attribute of the same
        name.

        Raises:
            AttributeError: If a key is not listed in ``__slots__``.
        """
        file_path = self.data_folder / "config.json"
        # is_file() is part of the importlib.resources Traversable
        # protocol (exists() is not) and also skips a stray directory
        # with that name.
        if file_path.is_file():
            with file_path.open(encoding="utf-8") as f:
                for key, value in json.load(f).items():
                    setattr(self, key, value)