Coverage for src/wiktextract/extractor/en/thesaurus.py: 75%

144 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1# Extracting information from thesaurus pages in Wiktionary. The data will be 

2# merged into word linkages in later stages. 

3# 

4# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

5 

6import re 

7from typing import Union 

8 

9from mediawiki_langcodes import code_to_name, name_to_code 

10from wikitextprocessor import NodeKind, Page, WikiNode 

11from wikitextprocessor.core import NamespaceDataEntry 

12 

13from ...datautils import ns_title_prefix_tuple 

14from ...page import LEVEL_KINDS, clean_node 

15from ...thesaurus import ThesaurusTerm 

16from ...wxr_context import WiktextractContext 

17from ...wxr_logging import logger 

18from .form_descriptions import parse_sense_qualifier 

19from .section_titles import LINKAGE_TITLES, POS_TITLES 

20from .type_utils import SenseData 

21 

# Prefix used by section headings that name a specific sense,
# e.g. "Sense: to move quickly".
SENSE_TITLE_PREFIX = "Sense: "

# Section subtitles that do not change the current language/POS/linkage
# state.  The subtitle itself is ignored, but the terms listed under it
# are still extracted and may receive the extra tags given as the value
# (an empty list means "no extra tags").
IGNORED_SUBTITLE_TAGS_MAP: dict[str, list[str]] = {
    "by reason": [],
    "by period of time": [],
    "by degree": [],
    "by type": [],
    "other": [],
    "opaque slang terms": ["slang"],
    "slang": ["slang"],
    "colloquial, archaic, slang": ["colloquial", "archaic", "slang"],
    "euphemisms": ["euphemism"],
    "colloquialisms": ["colloquial"],
    "colloquialisms or slang": ["colloquial"],
    "technical terms misused": ["colloquial"],
    "people": [],
    "proper names": ["proper-noun"],
    "race-based (warning- offensive)": ["offensive"],
    "substance addicts": [],
    "non-substance addicts": [],
    "echoing sounds": [],
    "movement sounds": [],
    "impacting sounds": [],
    "destructive sounds": [],
    "noisy sounds": [],
    "vocal sounds": [],
    "miscellaneous sounds": [],
    "age and gender": [],
    "breeds and types": [],
    "by function": [],
    "wild horses": [],
    "body parts": [],
    "colors, patterns and markings": [],
    "diseases": [],
    "equipment and gear": [],
    "groups": [],
    "horse-drawn vehicles": [],
    "places": [],
    "sports": [],
    "sounds and behavior": [],
    "obscure derivations": [],
    "plants": [],
    "animals": [],
    "common": [],
    "rare": ["rare"],
}

68 

69 

def extract_thesaurus_page(
    wxr: WiktextractContext, page: Page
) -> list[ThesaurusTerm]:
    """Extract linkage terms from a single Wiktionary Thesaurus page.

    Walks the parsed page tree, tracking the current language,
    part-of-speech, sense, and linkage type from section headings, and
    collects one ThesaurusTerm per word found in the leaf-level lists.
    Returns an empty list for "Requested entries" pages and for titles
    containing "/" (subpages).
    """
    thesaurus_ns_data: NamespaceDataEntry
    thesaurus_ns_data = wxr.wtp.NAMESPACE_DATA["Thesaurus"]

    thesaurus_ns_local_name = thesaurus_ns_data["name"]

    title = page.title
    text = page.body
    assert text is not None
    if title.startswith("Thesaurus:Requested entries "):
        return []
    if "/" in title:
        # print("STRANGE TITLE:", title)
        return []
    # Strip the (localized) "Thesaurus:" namespace prefix to get the word.
    word = title[len(thesaurus_ns_local_name) + 1 :]
    idx = word.find(":")
    if idx > 0 and idx < 5:
        word = word[idx + 1 :]  # Remove language prefix
    expanded = wxr.wtp.expand(text, templates_to_expand=None)  # Expand all
    # Mark transliteration spans with XLITS...XLITE sentinels so they
    # survive text cleanup and can be recovered per-term below.
    expanded = re.sub(
        r'(?s)<span class="tr Latn"[^>]*>(<b>)?(.*?)(</b>)?' r"</span>",
        r"XLITS\2XLITE",
        expanded,
    )
    tree = wxr.wtp.parse(expanded, pre_expand=False)
    assert tree.kind == NodeKind.ROOT
    # Mutable section state, updated by recurse() as headings are seen.
    lang = ""
    pos = ""
    sense = ""
    linkage = ""
    subtitle_tags: list[str] = []  # NOTE(review): assigned below but never read
    entry_id = -1  # NOTE(review): never updated or read beyond this init
    # Some pages don't have a language subtitle, but use
    # {{ws header|lang=xx}}
    m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text)
    if m:
        lang = code_to_name(m.group(1), "en")

    def recurse(
        contents: Union[
            list[Union[WikiNode, str]],
            WikiNode,
            str,
            list[list[Union[WikiNode, str]]],
        ],
    ) -> list[ThesaurusTerm]:
        # Depth-first walk over the parse tree; returns the terms found
        # in this subtree.  Section headings mutate the nonlocal state.
        nonlocal lang
        nonlocal pos
        nonlocal sense
        nonlocal linkage
        nonlocal subtitle_tags
        nonlocal entry_id
        item_sense = ""
        tags: list[str] = []
        topics: list[str] = []

        if isinstance(contents, (list, tuple)):
            thesaurus = []
            for x in contents:
                thesaurus.extend(recurse(x))
            return thesaurus
        if not isinstance(contents, WikiNode):
            return []
        kind = contents.kind
        # Leaf list (no nested lists): each list item is a term entry.
        if kind == NodeKind.LIST and not contents.contain_node(NodeKind.LIST):
            if lang == "":
                logger.debug(
                    f"{title=} {lang=} UNEXPECTED LIST WITHOUT LANG: "
                    + str(contents)
                )
                return []
            thesaurus = []
            for node in contents.children:
                if isinstance(node, str) or node.kind != NodeKind.LIST_ITEM:
                    continue
                w = clean_node(wxr, None, node.children)
                if "*" in w:
                    logger.debug(f"{title=} {lang=} {pos=} STAR IN WORD: {w}")
                # Check for parenthesized sense at the beginning
                m = re.match(r"(?s)^\(([^)]*)\):\s*(.*)$", w)
                if m:
                    item_sense, w = m.groups()
                    # XXX check for item_sense being part-of-speech
                else:
                    item_sense = sense

                # Remove thesaurus links, if any
                w = re.sub(r"\s*\[W[Ss]\]", "", w)

                # Check for English translation in quotes. This can be
                # literal translation, not necessarily the real meaning.
                english = None

                def engl_fn(m: re.Match) -> str:
                    nonlocal english
                    # NOTE(review): group(1) is the optional "literally"
                    # prefix; the quoted text is group(3).  Harmless today
                    # because `english` is never read afterwards — confirm
                    # intent before relying on it.
                    english = m.group(1)
                    return ""

                w = re.sub(r'(\bliterally\s*)?(, )?“([^"]*)"\s*', engl_fn, w)

                # Check for qualifiers in parentheses
                tags = []
                topics = []

                def qual_fn(m: re.Match) -> str:
                    q = m.group(1)
                    if q == item_sense:
                        return ""
                    if "XLITS" in q:
                        # Keep transliteration sentinels for later parsing.
                        return q
                    dt: SenseData = {}
                    parse_sense_qualifier(wxr, q, dt)
                    tags.extend(dt.get("tags", []))
                    topics.extend(dt.get("topics", []))
                    return ""

                w = re.sub(r"\(([^)]*)\)$", qual_fn, w).strip()

                # XXX there could be a transliteration, e.g.
                # Thesaurus:老百姓

                # XXX Apparently there can also be alternative spellings,
                # such as 眾人/众人 on Thesaurus:老百姓

                # If the word is now empty or separator, skip
                # NOTE(review): this returns [] and discards terms already
                # collected from earlier items in the same list; a
                # `continue` may have been intended — confirm.
                if not w or w.startswith("---") or w == "\u2014":
                    return []
                rel = linkage or "synonyms"
                for w1 in w.split(","):
                    # Recover a transliteration marked earlier, if any.
                    m = re.match(r"(?s)(.*?)\s*XLITS(.*?)XLITE\s*", w1)
                    if m:
                        w1, xlit = m.groups()
                    else:
                        xlit = ""
                    w1 = w1.strip()
                    if w1.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")):
                        w1 = w1[10:]
                    w1 = w1.removesuffix(" [⇒ thesaurus]")

                    if len(w1) > 0:
                        lang_code = name_to_code(lang, "en")
                        if lang_code == "":
                            logger.debug(
                                f"Linkage language {lang} not recognized"
                            )
                        thesaurus.append(
                            ThesaurusTerm(
                                entry=word,
                                language_code=lang_code,
                                pos=pos,
                                linkage=rel,
                                term=w1,
                                tags=tags,
                                topics=topics,
                                roman=xlit,
                                sense=item_sense,
                            )
                        )
            return thesaurus
        # Non-heading node: recurse into its arguments and children.
        if kind not in LEVEL_KINDS:
            thesaurus = []
            args_thesaurus = recurse(
                contents.sarg if contents.sarg else contents.largs
            )
            if args_thesaurus is not None:
                thesaurus.extend(args_thesaurus)
            children_thesaurus = recurse(contents.children)
            if children_thesaurus is not None:
                thesaurus.extend(children_thesaurus)
            return thesaurus
        # Heading node: interpret its title and update section state.
        subtitle = wxr.wtp.node_to_text(
            contents.sarg if contents.sarg else contents.largs
        )
        if name_to_code(subtitle, "en") != "":
            # Language heading: reset POS/sense/linkage for the new language.
            lang = subtitle
            pos = ""
            sense = ""
            linkage = ""
            return recurse(contents.children)
        # FIX: compare case-insensitively.  The original tested
        # subtitle.lower().startswith("Sense: "), which can never match a
        # lowercased string, so "Sense:" headings were silently dropped
        # into the UNHANDLED SUBTITLE fallback without setting `sense`.
        if subtitle.lower().startswith(SENSE_TITLE_PREFIX.lower()):
            sense = subtitle[len(SENSE_TITLE_PREFIX) :]
            linkage = ""
            return recurse(contents.children)

        subtitle = subtitle.lower()
        # Sections with no linkage content are skipped entirely.
        if subtitle in (
            "further reading",
            "external links",
            "references",
            "translations",
            "notes",
            "usage",
            "work to be done",
            "quantification",
            "abbreviation",
            "symbol",
        ):
            return []
        if subtitle in LINKAGE_TITLES:
            linkage = LINKAGE_TITLES[subtitle]
            return recurse(contents.children)
        if subtitle in POS_TITLES:
            pos = POS_TITLES[subtitle]["pos"]
            sense = ""
            linkage = ""
            return recurse(contents.children)
        if subtitle in IGNORED_SUBTITLE_TAGS_MAP:
            # These subtitles are ignored but children are processed and
            # possibly given additional tags
            subtitle_tags = IGNORED_SUBTITLE_TAGS_MAP[subtitle]
            return recurse(contents.children)
        logger.debug(
            f"{title=} {lang=} {pos=} {sense=} UNHANDLED SUBTITLE: "
            + "subtitle "
            + str(contents.sarg if contents.sarg else contents.largs)
        )
        return recurse(contents.children)

    return recurse(tree)