Coverage for src/wiktextract/extractor/en/thesaurus.py: 75%

144 statements  

« prev     ^ index     » next       coverage.py v7.9.0, created at 2025-06-13 07:43 +0000

1# Extracting information from thesaurus pages in Wiktionary. The data will be 

2# merged into word linkages in later stages. 

3# 

4# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

5 

6import re 

7from typing import Union 

8 

9from mediawiki_langcodes import code_to_name, name_to_code 

10from wikitextprocessor import NodeKind, Page, WikiNode 

11from wikitextprocessor.core import NamespaceDataEntry 

12 

13from ...datautils import ns_title_prefix_tuple 

14from ...page import LEVEL_KINDS, clean_node 

15from ...thesaurus import ThesaurusTerm 

16from ...wxr_context import WiktextractContext 

17from ...wxr_logging import logger 

18from .form_descriptions import parse_sense_qualifier 

19from .section_titles import LINKAGE_TITLES, POS_TITLES 

20from .type_utils import SenseData 

21 

# Prefix used by section headings that name a specific sense,
# e.g. "Sense: to move quickly".
SENSE_TITLE_PREFIX = "Sense: "

# Section subtitles that do not change the current language/POS/linkage
# state.  The subtitle itself is ignored, but the terms listed under it
# are still extracted and may receive the extra tags given as the value
# (an empty list means "no extra tags").
IGNORED_SUBTITLE_TAGS_MAP: dict[str, list[str]] = {
    "by reason": [],
    "by period of time": [],
    "by degree": [],
    "by type": [],
    "other": [],
    "opaque slang terms": ["slang"],
    "slang": ["slang"],
    "colloquial, archaic, slang": ["colloquial", "archaic", "slang"],
    "euphemisms": ["euphemism"],
    "colloquialisms": ["colloquial"],
    "colloquialisms or slang": ["colloquial"],
    "technical terms misused": ["colloquial"],
    "people": [],
    "proper names": ["proper-noun"],
    "race-based (warning- offensive)": ["offensive"],
    "substance addicts": [],
    "non-substance addicts": [],
    "echoing sounds": [],
    "movement sounds": [],
    "impacting sounds": [],
    "destructive sounds": [],
    "noisy sounds": [],
    "vocal sounds": [],
    "miscellaneous sounds": [],
    "age and gender": [],
    "breeds and types": [],
    "by function": [],
    "wild horses": [],
    "body parts": [],
    "colors, patterns and markings": [],
    "diseases": [],
    "equipment and gear": [],
    "groups": [],
    "horse-drawn vehicles": [],
    "places": [],
    "sports": [],
    "sounds and behavior": [],
    "obscure derivations": [],
    "plants": [],
    "animals": [],
    "common": [],
    "rare": ["rare"],
}

68 

69 

def extract_thesaurus_page(
    wxr: WiktextractContext, page: Page
) -> list[ThesaurusTerm]:
    """Extract linkage terms from a single Wiktionary Thesaurus page.

    Walks the parsed page tree, tracking the current language,
    part-of-speech, sense, and linkage type from section headings, and
    collects one ThesaurusTerm per word found in the leaf-level lists.
    Returns an empty list for "Requested entries" pages and for titles
    containing "/" (subpages).
    """
    thesaurus_ns_data: NamespaceDataEntry
    thesaurus_ns_data = wxr.wtp.NAMESPACE_DATA["Thesaurus"]

    thesaurus_ns_local_name = thesaurus_ns_data["name"]

    title = page.title
    text = page.body
    assert text is not None
    if title.startswith("Thesaurus:Requested entries "):
        return []
    if "/" in title:
        # print("STRANGE TITLE:", title)
        return []
    # Strip the (localized) "Thesaurus:" namespace prefix to get the word.
    word = title[len(thesaurus_ns_local_name) + 1 :]
    idx = word.find(":")
    if idx > 0 and idx < 5:
        word = word[idx + 1 :]  # Remove language prefix
    expanded = wxr.wtp.expand(text, templates_to_expand=None)  # Expand all
    # Mark transliteration spans with XLITS...XLITE sentinels so they
    # survive text cleanup and can be recovered per-term below.
    expanded = re.sub(
        r'(?s)<span class="tr Latn"[^>]*>(<b>)?(.*?)(</b>)?' r"</span>",
        r"XLITS\2XLITE",
        expanded,
    )
    tree = wxr.wtp.parse(expanded, pre_expand=False)
    assert tree.kind == NodeKind.ROOT
    # Mutable section state, updated by recurse() as headings are seen.
    lang = ""
    pos = ""
    sense = ""
    linkage = ""
    subtitle_tags: list[str] = []  # NOTE(review): assigned below but never read
    entry_id = -1  # NOTE(review): never updated or read beyond this init
    # Some pages don't have a language subtitle, but use
    # {{ws header|lang=xx}}
    m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text)
    if m:
        lang = code_to_name(m.group(1), "en")

    def recurse(
        contents: Union[
            list[Union[WikiNode, str]],
            WikiNode,
            str,
            list[list[Union[WikiNode, str]]],
        ],
    ) -> list[ThesaurusTerm]:
        # Depth-first walk over the parse tree; returns the terms found
        # in this subtree.  Section headings mutate the nonlocal state.
        nonlocal lang
        nonlocal pos
        nonlocal sense
        nonlocal linkage
        nonlocal subtitle_tags
        nonlocal entry_id
        item_sense = ""
        tags: list[str] = []
        topics: list[str] = []

        if isinstance(contents, (list, tuple)):
            thesaurus = []
            for x in contents:
                thesaurus.extend(recurse(x))
            return thesaurus
        if not isinstance(contents, WikiNode):
            return []
        kind = contents.kind
        # Leaf list (no nested lists): each list item is a term entry.
        if kind == NodeKind.LIST and not contents.contain_node(NodeKind.LIST):
            if lang == "":
                logger.debug(
                    f"{title=} {lang=} UNEXPECTED LIST WITHOUT LANG: "
                    + str(contents)
                )
                return []
            thesaurus = []
            for node in contents.children:
                if isinstance(node, str) or node.kind != NodeKind.LIST_ITEM:
                    continue
                w = clean_node(wxr, None, node.children)
                if "*" in w:
                    logger.debug(f"{title=} {lang=} {pos=} STAR IN WORD: {w}")
                # Check for parenthesized sense at the beginning
                m = re.match(r"(?s)^\(([^)]*)\):\s*(.*)$", w)
                if m:
                    item_sense, w = m.groups()
                    # XXX check for item_sense being part-of-speech
                else:
                    item_sense = sense

                # Remove thesaurus links, if any
                w = re.sub(r"\s*\[W[Ss]\]", "", w)

                # Check for English translation in quotes. This can be
                # literal translation, not necessarily the real meaning.
                english = None

                def engl_fn(m: re.Match) -> str:
                    nonlocal english
                    # NOTE(review): group(1) is the optional "literally"
                    # prefix; the quoted text is group(3).  Harmless today
                    # because `english` is never read afterwards — confirm
                    # intent before relying on it.
                    english = m.group(1)
                    return ""

                w = re.sub(r'(\bliterally\s*)?(, )?“([^"]*)"\s*', engl_fn, w)

                # Check for qualifiers in parentheses
                tags = []
                topics = []

                def qual_fn(m: re.Match) -> str:
                    q = m.group(1)
                    if q == item_sense:
                        return ""
                    if "XLITS" in q:
                        # Keep transliteration sentinels for later parsing.
                        return q
                    dt: SenseData = {}
                    parse_sense_qualifier(wxr, q, dt)
                    tags.extend(dt.get("tags", []))
                    topics.extend(dt.get("topics", []))
                    return ""

                w = re.sub(r"\(([^)]*)\)$", qual_fn, w).strip()

                # XXX there could be a transliteration, e.g.
                # Thesaurus:老百姓

                # XXX Apparently there can also be alternative spellings,
                # such as 眾人/众人 on Thesaurus:老百姓

                # If the word is now empty or separator, skip
                # NOTE(review): this returns [] and discards terms already
                # collected from earlier items in the same list; a
                # `continue` may have been intended — confirm.
                if not w or w.startswith("---") or w == "\u2014":
                    return []
                rel = linkage or "synonyms"
                for w1 in w.split(","):
                    # Recover a transliteration marked earlier, if any.
                    m = re.match(r"(?s)(.*?)\s*XLITS(.*?)XLITE\s*", w1)
                    if m:
                        w1, xlit = m.groups()
                    else:
                        xlit = ""
                    w1 = w1.strip()
                    if w1.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")):
                        w1 = w1[10:]
                    w1 = w1.removesuffix(" [⇒ thesaurus]")

                    if len(w1) > 0:
                        lang_code = name_to_code(lang, "en")
                        if lang_code == "":
                            logger.debug(
                                f"Linkage language {lang} not recognized"
                            )
                        thesaurus.append(
                            ThesaurusTerm(
                                entry=word,
                                language_code=lang_code,
                                pos=pos,
                                linkage=rel,
                                term=w1,
                                tags=tags,
                                topics=topics,
                                roman=xlit,
                                sense=item_sense,
                            )
                        )
            return thesaurus
        # Non-heading node: recurse into its arguments and children.
        if kind not in LEVEL_KINDS:
            thesaurus = []
            args_thesaurus = recurse(
                contents.sarg if contents.sarg else contents.largs
            )
            if args_thesaurus is not None:
                thesaurus.extend(args_thesaurus)
            children_thesaurus = recurse(contents.children)
            if children_thesaurus is not None:
                thesaurus.extend(children_thesaurus)
            return thesaurus
        # Heading node: interpret its title and update section state.
        subtitle = wxr.wtp.node_to_text(
            contents.sarg if contents.sarg else contents.largs
        )
        if name_to_code(subtitle, "en") != "":
            # Language heading: reset POS/sense/linkage for the new language.
            lang = subtitle
            pos = ""
            sense = ""
            linkage = ""
            return recurse(contents.children)
        # FIX: compare case-insensitively.  The original tested
        # subtitle.lower().startswith("Sense: "), which can never match a
        # lowercased string, so "Sense:" headings were silently dropped
        # into the UNHANDLED SUBTITLE fallback without setting `sense`.
        if subtitle.lower().startswith(SENSE_TITLE_PREFIX.lower()):
            sense = subtitle[len(SENSE_TITLE_PREFIX) :]
            linkage = ""
            return recurse(contents.children)

        subtitle = subtitle.lower()
        # Sections with no linkage content are skipped entirely.
        if subtitle in (
            "further reading",
            "external links",
            "references",
            "translations",
            "notes",
            "usage",
            "work to be done",
            "quantification",
            "abbreviation",
            "symbol",
        ):
            return []
        if subtitle in LINKAGE_TITLES:
            linkage = LINKAGE_TITLES[subtitle]
            return recurse(contents.children)
        if subtitle in POS_TITLES:
            pos = POS_TITLES[subtitle]["pos"]
            sense = ""
            linkage = ""
            return recurse(contents.children)
        if subtitle in IGNORED_SUBTITLE_TAGS_MAP:
            # These subtitles are ignored but children are processed and
            # possibly given additional tags
            subtitle_tags = IGNORED_SUBTITLE_TAGS_MAP[subtitle]
            return recurse(contents.children)
        logger.debug(
            f"{title=} {lang=} {pos=} {sense=} UNHANDLED SUBTITLE: "
            + "subtitle "
            + str(contents.sarg if contents.sarg else contents.largs)
        )
        return recurse(contents.children)

    return recurse(tree)