Coverage for src/wiktextract/extractor/en/thesaurus.py: 75%
144 statements
coverage.py v7.9.0, created at 2025-06-13 07:43 +0000
1# Extracting information from thesaurus pages in Wiktionary. The data will be
2# merged into word linkages in later stages.
3#
4# Copyright (c) 2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
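# A Thesaurus:<word> page is organized into per-language and per-part-of-speech
# sections whose "Synonyms", "Antonyms", etc. lists are converted below into
# ThesaurusTerm records (entry, language_code, pos, linkage, term, tags,
# topics, roman, sense).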
6import re
7from typing import Union
9from mediawiki_langcodes import code_to_name, name_to_code
10from wikitextprocessor import NodeKind, Page, WikiNode
11from wikitextprocessor.core import NamespaceDataEntry
13from ...datautils import ns_title_prefix_tuple
14from ...page import LEVEL_KINDS, clean_node
15from ...thesaurus import ThesaurusTerm
16from ...wxr_context import WiktextractContext
17from ...wxr_logging import logger
18from .form_descriptions import parse_sense_qualifier
19from .section_titles import LINKAGE_TITLES, POS_TITLES
20from .type_utils import SenseData
22SENSE_TITLE_PREFIX = "Sense: "
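# Organizational subsection headings that are not treated as linkage or
# part-of-speech sections. When one of these headings is seen, its children are
# still processed, and the listed tags (often empty) are recorded as
# subtitle_tags for the entries beneath it.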
24IGNORED_SUBTITLE_TAGS_MAP: dict[str, list[str]] = {
25 "by reason": [],
26 "by period of time": [],
27 "by degree": [],
28 "by type": [],
29 "other": [],
30 "opaque slang terms": ["slang"],
31 "slang": ["slang"],
32 "colloquial, archaic, slang": ["colloquial", "archaic", "slang"],
33 "euphemisms": ["euphemism"],
34 "colloquialisms": ["colloquial"],
35 "colloquialisms or slang": ["colloquial"],
36 "technical terms misused": ["colloquial"],
37 "people": [],
38 "proper names": ["proper-noun"],
39 "race-based (warning- offensive)": ["offensive"],
40 "substance addicts": [],
41 "non-substance addicts": [],
42 "echoing sounds": [],
43 "movement sounds": [],
44 "impacting sounds": [],
45 "destructive sounds": [],
46 "noisy sounds": [],
47 "vocal sounds": [],
48 "miscellaneous sounds": [],
49 "age and gender": [],
50 "breeds and types": [],
51 "by function": [],
52 "wild horses": [],
53 "body parts": [],
54 "colors, patterns and markings": [],
55 "diseases": [],
56 "equipment and gear": [],
57 "groups": [],
58 "horse-drawn vehicles": [],
59 "places": [],
60 "sports": [],
61 "sounds and behavior": [],
62 "obscure derivations": [],
63 "plants": [],
64 "animals": [],
65 "common": [],
66 "rare": ["rare"],
67}
70def extract_thesaurus_page(
71 wxr: WiktextractContext, page: Page
72) -> list[ThesaurusTerm]:
73 """Extracts linkages from the thesaurus pages in Wiktionary."""
75 thesaurus_ns_data: NamespaceDataEntry
76 thesaurus_ns_data = wxr.wtp.NAMESPACE_DATA["Thesaurus"]
78 thesaurus_ns_local_name = thesaurus_ns_data["name"]
80 title = page.title
81 text = page.body
82 assert text is not None
83 if title.startswith("Thesaurus:Requested entries "):  83 ↛ 84 (line 83 didn't jump to line 84 because the condition on line 83 was never true)
84 return []
85 if "/" in title: 85 ↛ 87line 85 didn't jump to line 87 because the condition on line 85 was never true
86 # print("STRANGE TITLE:", title)
87 return []
88 word = title[len(thesaurus_ns_local_name) + 1 :]
89 idx = word.find(":")
90 if idx > 0 and idx < 5:  90 ↛ 91 (line 90 didn't jump to line 91 because the condition on line 90 was never true)
91 word = word[idx + 1 :] # Remove language prefix
92 expanded = wxr.wtp.expand(text, templates_to_expand=None) # Expand all
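# The substitution below wraps Latin-script transliteration spans
# (<span class="tr Latn">...</span>) in XLITS...XLITE markers so that they
# survive text cleanup and can later be split off as the romanization
# ("roman") of the term they follow.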
93 expanded = re.sub(
94 r'(?s)<span class="tr Latn"[^>]*>(<b>)?(.*?)(</b>)?' r"</span>",
95 r"XLITS\2XLITE",
96 expanded,
97 )
98 tree = wxr.wtp.parse(expanded, pre_expand=False)
99 assert tree.kind == NodeKind.ROOT
100 lang = ""
101 pos = ""
102 sense = ""
103 linkage = ""
104 subtitle_tags: list[str] = []
105 entry_id = -1
106 # Some pages don't have a language subtitle, but use
107 # {{ws header|lang=xx}}
108 m = re.search(r"(?s)\{\{ws header\|[^}]*lang=([^}|]*)", text)
109 if m:  109 ↛ 112 (line 109 didn't jump to line 112 because the condition on line 109 was always true)
110 lang = code_to_name(m.group(1), "en")
112 def recurse(
113 contents: Union[
114 list[Union[WikiNode, str]],
115 WikiNode,
116 str,
117 list[list[Union[WikiNode, str]]],
118 ],
119 ) -> list[ThesaurusTerm]:
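# Walk the parsed tree: heading (level) nodes update the shared
# lang/pos/sense/linkage state, leaf LIST nodes are converted into
# ThesaurusTerm records, and all other nodes are recursed into.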
120 nonlocal lang
121 nonlocal pos
122 nonlocal sense
123 nonlocal linkage
124 nonlocal subtitle_tags
125 nonlocal entry_id
126 item_sense = ""
127 tags: list[str] = []
128 topics: list[str] = []
130 if isinstance(contents, (list, tuple)):
131 thesaurus = []
132 for x in contents:
133 thesaurus.extend(recurse(x))
134 return thesaurus
135 if not isinstance(contents, WikiNode):
136 return []
137 kind = contents.kind
138 if kind == NodeKind.LIST and not contents.contain_node(NodeKind.LIST):
139 if lang == "":  139 ↛ 140 (line 139 didn't jump to line 140 because the condition on line 139 was never true)
140 logger.debug(
141 f"{title=} {lang=} UNEXPECTED LIST WITHOUT LANG: "
142 + str(contents)
143 )
144 return []
145 thesaurus = []
146 for node in contents.children:
147 if isinstance(node, str) or node.kind != NodeKind.LIST_ITEM:  147 ↛ 148 (line 147 didn't jump to line 148 because the condition on line 147 was never true)
148 continue
149 w = clean_node(wxr, None, node.children)
150 if "*" in w: 150 ↛ 151line 150 didn't jump to line 151 because the condition on line 150 was never true
151 logger.debug(f"{title=} {lang=} {pos=} STAR IN WORD: {w}")
152 # Check for parenthesized sense at the beginning
153 m = re.match(r"(?s)^\(([^)]*)\):\s*(.*)$", w)
154 if m:  154 ↛ 155 (line 154 didn't jump to line 155 because the condition on line 154 was never true)
155 item_sense, w = m.groups()
156 # XXX check for item_sense being part-of-speech
157 else:
158 item_sense = sense
160 # Remove thesaurus links, if any
161 w = re.sub(r"\s*\[W[Ss]\]", "", w)
163 # Check for an English translation in quotes. This can be a
164 # literal translation, not necessarily the real meaning.
165 english = None
167 def engl_fn(m: re.Match) -> str:
168 nonlocal english
169 english = m.group(1)
170 return ""
172 w = re.sub(r'(\bliterally\s*)?(, )?“([^"]*)"\s*', engl_fn, w)
174 # Check for qualifiers in parentheses
175 tags = []
176 topics = []
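# qual_fn strips a trailing parenthesized qualifier from the term and folds it
# into tags/topics via parse_sense_qualifier(); a qualifier that merely repeats
# the sense is dropped, and XLITS-marked transliterations are left untouched.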
178 def qual_fn(m: re.Match) -> str:
179 q = m.group(1)
180 if q == item_sense:
181 return ""
182 if "XLITS" in q:
183 return q
184 dt: SenseData = {}
185 parse_sense_qualifier(wxr, q, dt)
186 tags.extend(dt.get("tags", []))
187 topics.extend(dt.get("topics", []))
188 return ""
190 w = re.sub(r"\(([^)]*)\)$", qual_fn, w).strip()
192 # XXX there could be a transliteration, e.g.
193 # Thesaurus:老百姓
195 # XXX Apparently there can also be alternative spellings,
196 # such as 眾人/众人 on Thesaurus:老百姓
198 # If the word is now empty or separator, skip
199 if not w or w.startswith("---") or w == "\u2014":  199 ↛ 200 (line 199 didn't jump to line 200 because the condition on line 199 was never true)
200 return []
201 rel = linkage or "synonyms"
202 for w1 in w.split(","):
203 m = re.match(r"(?s)(.*?)\s*XLITS(.*?)XLITE\s*", w1)
204 if m:  204 ↛ 205 (line 204 didn't jump to line 205 because the condition on line 204 was never true)
205 w1, xlit = m.groups()
206 else:
207 xlit = ""
208 w1 = w1.strip()
209 if w1.startswith(ns_title_prefix_tuple(wxr, "Thesaurus")):  209 ↛ 210 (line 209 didn't jump to line 210 because the condition on line 209 was never true)
210 w1 = w1[10:]
211 w1 = w1.removesuffix(" [⇒ thesaurus]")
213 if len(w1) > 0:  213 ↛ 202 (line 213 didn't jump to line 202 because the condition on line 213 was always true)
214 lang_code = name_to_code(lang, "en")
215 if lang_code == "":  215 ↛ 216 (line 215 didn't jump to line 216 because the condition on line 215 was never true)
216 logger.debug(
217 f"Linkage language {lang} not recognized"
218 )
219 thesaurus.append(
220 ThesaurusTerm(
221 entry=word,
222 language_code=lang_code,
223 pos=pos,
224 linkage=rel,
225 term=w1,
226 tags=tags,
227 topics=topics,
228 roman=xlit,
229 sense=item_sense,
230 )
231 )
232 return thesaurus
233 if kind not in LEVEL_KINDS:
234 thesaurus = []
235 args_thesaurus = recurse(
236 contents.sarg if contents.sarg else contents.largs
237 )
238 if args_thesaurus is not None:  238 ↛ 240 (line 238 didn't jump to line 240 because the condition on line 238 was always true)
239 thesaurus.extend(args_thesaurus)
240 children_thesaurus = recurse(contents.children)
241 if children_thesaurus is not None:  241 ↛ 243 (line 241 didn't jump to line 243 because the condition on line 241 was always true)
242 thesaurus.extend(children_thesaurus)
243 return thesaurus
244 subtitle = wxr.wtp.node_to_text(
245 contents.sarg if contents.sarg else contents.largs
246 )
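# Dispatch on the heading text: a language name resets the per-language state,
# a "Sense: ..." heading sets the current sense, a few non-content headings are
# skipped entirely, known linkage and part-of-speech headings set linkage/pos,
# and headings in IGNORED_SUBTITLE_TAGS_MAP only contribute subtitle_tags.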
247 if name_to_code(subtitle, "en") != "":
248 lang = subtitle
249 pos = ""
250 sense = ""
251 linkage = ""
252 return recurse(contents.children)
253 if subtitle.lower().startswith(SENSE_TITLE_PREFIX):  253 ↛ 254 (line 253 didn't jump to line 254 because the condition on line 253 was never true)
254 sense = subtitle[len(SENSE_TITLE_PREFIX) :]
255 linkage = ""
256 return recurse(contents.children)
258 subtitle = subtitle.lower()
259 if subtitle in (  259 ↛ 271 (line 259 didn't jump to line 271 because the condition on line 259 was never true)
260 "further reading",
261 "external links",
262 "references",
263 "translations",
264 "notes",
265 "usage",
266 "work to be done",
267 "quantification",
268 "abbreviation",
269 "symbol",
270 ):
271 return []
272 if subtitle in LINKAGE_TITLES:
273 linkage = LINKAGE_TITLES[subtitle]
274 return recurse(contents.children)
275 if subtitle in POS_TITLES:
276 pos = POS_TITLES[subtitle]["pos"]
277 sense = ""
278 linkage = ""
279 return recurse(contents.children)
280 if subtitle in IGNORED_SUBTITLE_TAGS_MAP:  280 ↛ 283 (line 280 didn't jump to line 283 because the condition on line 280 was never true)
281 # These subtitles are ignored but children are processed and
282 # possibly given additional tags
283 subtitle_tags = IGNORED_SUBTITLE_TAGS_MAP[subtitle]
284 return recurse(contents.children)
285 logger.debug(
286 f"{title=} {lang=} {pos=} {sense=} UNHANDLED SUBTITLE: "
287 + "subtitle "
288 + str(contents.sarg if contents.sarg else contents.largs)
289 )
290 return recurse(contents.children)
292 return recurse(tree)
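A minimal usage sketch, assuming an already configured WiktextractContext
(wxr here is hypothetical) and a page fetched from the Thesaurus namespace;
this is illustrative and not part of the measured module:

    # Illustrative only: wxr is assumed to be a configured WiktextractContext.
    page = wxr.wtp.get_page("Thesaurus:beverage")
    if page is not None and page.body is not None:
        for term in extract_thesaurus_page(wxr, page):
            print(term.language_code, term.pos, term.linkage, term.term, term.sense)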