Coverage for src / wiktextract / page.py: 88%
303 statements
« prev ^ index » next coverage.py v7.14.0, created at 2026-05-11 04:48 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import re
6from collections import defaultdict
7from copy import copy
8from typing import Any, Callable, Optional, Sequence, Union
10from mediawiki_langcodes import name_to_code
11from wikitextprocessor.core import (
12 NamespaceDataEntry,
13 PostTemplateFnCallable,
14 TemplateArgs,
15 TemplateFnCallable,
16)
17from wikitextprocessor.node_expand import NodeHandlerFnCallable
18from wikitextprocessor.parser import GeneralNode, NodeKind, WikiNode
20from .clean import clean_value
21from .datautils import data_append, data_extend
22from .import_utils import import_extractor_module
23from .wxr_context import WiktextractContext
# NodeKind values that denote section headings (subtitle levels 2 through 6).
LEVEL_KINDS = {getattr(NodeKind, f"LEVEL{level}") for level in range(2, 7)}
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one Wiktionary page and return its extracted word entries.

    ``page_title`` is the page title and ``page_text`` the page body in
    Wikimedia markup.  The work is delegated to the language-specific page
    extractor selected by the wikitext language code; afterwards thesaurus
    linkages are merged in (when configured), categories are post-processed
    for English dumps, and duplicate metadata is stripped.  Returns one
    dictionary per word/part-of-speech found on the page for the languages
    selected by ``capture_language_codes`` (None means all).
    """
    extractor = import_extractor_module(wxr.wtp.lang_code, "page")
    results = extractor.parse_page(wxr, page_title, page_text)
    if wxr.config.extract_thesaurus_pages:
        inject_linkages(wxr, results)
    if wxr.config.dump_file_lang_code == "en":
        process_categories(wxr, results)
    remove_duplicate_data(results)
    return results
def is_panel_template(wxr: WiktextractContext, template_name: str) -> bool:
    """Return True if ``template_name`` is a known panel template.

    Panel templates are those that produce an infobox in Wiktionary, plus
    certain other templates we do not wish to expand.  The name sets are
    supplied by the language-specific page extractor module via its optional
    ``PANEL_TEMPLATES`` (exact names) and ``PANEL_PREFIXES`` (name prefixes)
    attributes.
    """
    mod = import_extractor_module(wxr.wtp.lang_code, "page")
    if hasattr(mod, "PANEL_TEMPLATES") and template_name in mod.PANEL_TEMPLATES:
        return True
    # str.startswith accepts a tuple of prefixes, testing all in one call.
    return hasattr(mod, "PANEL_PREFIXES") and template_name.startswith(
        tuple(mod.PANEL_PREFIXES)
    )
def recursively_extract(
    contents: Union[WikiNode, str, list[Union[str, WikiNode]]],
    fn: Callable[[Union[WikiNode, list[WikiNode]]], bool],
) -> tuple[list[Union[str, WikiNode]], list[Union[str, WikiNode]]]:
    """Recursively extracts elements from contents for which ``fn`` returns
    True. This returns two lists, the extracted elements and the remaining
    content (with the extracted elements removed at each level). Only
    WikiNode objects can be extracted.

    Raises RuntimeError when a WikiNode kind is encountered that this
    function does not know how to traverse.
    """
    # If contents is a list, process each element separately
    extracted = []
    new_contents = []
    if isinstance(contents, (list, tuple)):
        for x in contents:
            e1, c1 = recursively_extract(x, fn)
            extracted.extend(e1)
            new_contents.extend(c1)
        return extracted, new_contents
    # If content is not WikiNode, just return it as new contents.
    if not isinstance(contents, WikiNode):
        return [], [contents]
    # Check if this content should be extracted
    if fn(contents):
        return [contents], []
    # Otherwise content is WikiNode, and we must recurse into it.
    kind = contents.kind
    # Shallow-copy the node, then blank all structural fields; each per-kind
    # branch below re-populates only the fields that kind actually carries.
    new_node = copy(contents)
    new_node.children = []
    new_node.sarg = ""
    new_node.largs = []
    new_node.attrs = {}
    new_contents.append(new_node)
    if kind in LEVEL_KINDS or kind == NodeKind.LINK:
        # Process args and children
        new_args = []
        for arg in contents.largs:
            e1, c1 = recursively_extract(arg, fn)
            new_args.append(c1)
            extracted.extend(e1)
        new_node.largs = new_args
        e1, c1 = recursively_extract(contents.children, fn)
        extracted.extend(e1)
        new_node.children = c1
    elif kind in {
        NodeKind.ITALIC,
        NodeKind.BOLD,
        NodeKind.TABLE,
        NodeKind.TABLE_CAPTION,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_HEADER_CELL,
        NodeKind.TABLE_CELL,
        NodeKind.PRE,
        NodeKind.PREFORMATTED,
    }:
        # Process only children
        e1, c1 = recursively_extract(contents.children, fn)
        extracted.extend(e1)
        new_node.children = c1
    elif kind in (NodeKind.HLINE,):
        # No arguments or children
        pass
    elif kind in (NodeKind.LIST, NodeKind.LIST_ITEM):
        # Keep args as-is, process children
        new_node.sarg = contents.sarg
        e1, c1 = recursively_extract(contents.children, fn)
        extracted.extend(e1)
        new_node.children = c1
    elif kind in {
        NodeKind.TEMPLATE,
        NodeKind.TEMPLATE_ARG,
        NodeKind.PARSER_FN,
        NodeKind.URL,
    }:
        # Process only args
        new_args = []
        for arg in contents.largs:
            e1, c1 = recursively_extract(arg, fn)
            new_args.append(c1)
            extracted.extend(e1)
        new_node.largs = new_args
    elif kind == NodeKind.HTML:
        # Keep attrs and args as-is, process children
        new_node.attrs = contents.attrs
        new_node.sarg = contents.sarg
        e1, c1 = recursively_extract(contents.children, fn)
        extracted.extend(e1)
        new_node.children = c1
    else:
        raise RuntimeError(f"recursively_extract: unhandled kind {kind}")
    return extracted, new_contents
def inject_linkages(wxr: WiktextractContext, page_data: list[dict]) -> None:
    """Merge matching thesaurus-page linkage terms into ``page_data``.

    For every entry that has a part of speech, the thesaurus database is
    queried by word/language/pos; each term not already present under its
    linkage key (same word and, when the term carries one, same sense) is
    appended with a ``source`` pointing at the local Thesaurus page.
    """
    from .thesaurus import search_thesaurus

    thesaurus_ns = wxr.wtp.NAMESPACE_DATA.get("Thesaurus", {}).get("name")  # type: ignore[call-overload]
    for entry in page_data:
        if "pos" not in entry:
            continue
        word = entry["word"]
        for term in search_thesaurus(
            wxr.thesaurus_db_conn,  # type:ignore[arg-type]
            word,
            entry["lang_code"],
            entry["pos"],  # type: ignore[arg-type]
        ):
            # Skip terms already recorded under this linkage type.
            duplicate = any(
                item.get("word") == term.term
                and (not term.sense or item.get("sense") == term.sense)
                for item in entry.get(term.linkage, ())
            )
            if duplicate:
                continue
            new_item = {
                "word": term.term,
                "source": f"{thesaurus_ns}:{word}",
            }
            # Copy over only the optional fields the term actually has.
            for key in ("sense", "tags", "raw_tags", "topics", "roman"):
                value = getattr(term, key)
                if len(value) > 0:
                    new_item[key] = value
            data_append(entry, term.linkage, new_item)
def process_categories(
    wxr: WiktextractContext, page_data: list[dict[str, Any]]
) -> None:
    """Disambiguate and clean up category assignments across the entries of
    one page, mutating ``page_data`` in place.

    Three passes: (1) within each language, propagate trailing categories to
    the language's other entries, or push them down to the sense when there
    is exactly one entry with one sense; (2) propagate the last entry's
    categories to all other entries on the page; (3) drop category links
    whose leading word names a different language than the entry's own.
    """
    # Categories are not otherwise disambiguated, but if there is only
    # one sense and only one data in ret for the same language, move
    # categories to the only sense. Note that categories are commonly
    # specified for the page, and thus if we have multiple data in
    # ret, we don't know which one they belong to (not even which
    # language necessarily?).
    # XXX can Category links be specified globally (i.e., in a different
    # language?)
    by_lang = defaultdict(list)
    for data in page_data:
        by_lang[data["lang"]].append(data)
    for la, lst in by_lang.items():
        if len(lst) > 1:
            # Propagate categories from the last entry for the language to
            # its other entries. It is common for them to only be specified
            # in the last part-of-speech.
            last = lst[-1]
            for field in ("categories",):
                if field not in last:
                    continue
                vals = last[field]
                for data in lst[:-1]:
                    assert data is not last
                    assert data.get(field) is not vals
                    if data.get("alt_of") or data.get("form_of"):
                        continue  # Don't add to alt-of/form-of entries
                    data_extend(data, field, vals)
            continue
        if len(lst) != 1:
            continue
        data = lst[0]
        senses = data.get("senses") or []
        if len(senses) != 1:
            continue
        # Only one sense for this language. Move categories and certain other
        # data to sense.
        for field in ("categories", "topics", "wikidata", "wikipedia"):
            if field in data:
                v = data[field]
                del data[field]
                data_extend(senses[0], field, v)

    # If the last part-of-speech of the last language (i.e., last item in "ret")
    # has categories or topics not bound to a sense, propagate those
    # categories and topics to all datas on "ret". It is common for categories
    # to be specified at the end of an article. Apparently these can also
    # apply to different languages.
    if len(page_data) > 1:
        last = page_data[-1]
        for field in ("categories",):
            if field not in last:
                continue
            lst = last[field]
            for data in page_data[:-1]:
                if data.get("form_of") or data.get("alt_of"):
                    continue  # Don't add to form_of or alt_of entries
                data_extend(data, field, lst)

    # Remove category links that start with a language name from entries for
    # different languages
    rhymes_ns_prefix = (
        wxr.wtp.NAMESPACE_DATA.get("Rhymes", {}).get("name", "") + ":"  # type: ignore[call-overload]
    )
    for data in page_data:
        lang_code = data.get("lang_code")
        cats = data.get("categories", [])
        new_cats = []
        for cat in cats:
            # First word of the category (before any space or "/"), with a
            # "Rhymes:" namespace prefix stripped, is tried as a language name.
            no_prefix_cat = cat.removeprefix(rhymes_ns_prefix)
            cat_lang = no_prefix_cat.split(maxsplit=1)[0].split(
                "/", maxsplit=1
            )[0]
            cat_lang_code = name_to_code(cat_lang, "en")
            # Keep the category unless it clearly names a *different*
            # language; "mul" (translingual) entries also keep English ones.
            if (
                cat_lang_code != ""
                and cat_lang_code != lang_code
                and not (lang_code == "mul" and cat_lang_code == "en")
            ):
                continue
            new_cats.append(cat)
        if len(new_cats) == 0:
            if "categories" in data:
                del data["categories"]
        else:
            data["categories"] = new_cats
def remove_duplicate_data(page_data: list[dict[str, Any]]) -> None:
    """Deduplicate list-valued metadata in-place and drop redundant gloss
    markers.

    ``page_data`` is the list of word-entry dictionaries produced by
    ``parse_page`` (the parameter was previously mis-annotated as ``dict``).
    For each entry and each of its senses, the "categories", "topics",
    "tags", "wikidata" and "wikipedia" lists are replaced with sorted,
    duplicate-free versions.  Then, for every sense that has both glosses
    and raw glosses:

    * any "empty-gloss" tag is removed (the sense evidently has glosses);
    * "raw_glosses" is deleted when it is element-wise equal to "glosses".
    """
    # Remove duplicates from tags, categories, etc.
    for data in page_data:
        for field in ("categories", "topics", "tags", "wikidata", "wikipedia"):
            if field in data:
                data[field] = sorted(set(data[field]))
            for sense in data.get("senses", ()):
                if field in sense:
                    sense[field] = sorted(set(sense[field]))

    # If raw_glosses is identical to glosses, remove it
    # If "empty-gloss" in tags and there are glosses, remove the tag
    for data in page_data:
        for sense in data.get("senses", []):
            raw_glosses = sense.get("raw_glosses", ())
            if not raw_glosses:
                continue
            glosses = sense.get("glosses", ())
            if glosses:
                # The sense has real glosses, so the marker tag is stale.
                tags = sense.get("tags", ())
                while "empty-gloss" in tags:
                    tags.remove("empty-gloss")
            if len(raw_glosses) == len(glosses) and all(
                rg == sg for rg, sg in zip(raw_glosses, glosses)
            ):
                del sense["raw_glosses"]
def clean_node(
    wxr: WiktextractContext,
    sense_data: Optional[Any],
    wikinode: GeneralNode,
    template_fn: Optional[TemplateFnCallable] = None,
    post_template_fn: Optional[PostTemplateFnCallable] = None,
    node_handler_fn: Optional[NodeHandlerFnCallable] = None,
    collect_links: bool = False,
    remove_anchors_from_links: bool = False,
    no_strip=False,
    no_html_strip=False,
) -> str:
    """
    Expands node or nodes to text, cleaning up HTML tags and duplicate spaces.

    If `sense_data` is a dictionary, expanded category links will be added to
    it under the `categories` key. And if `collect_link` is `True`, expanded
    links will be added to the `links` key.  Lua execution/timeout errors in
    the expanded HTML are also recorded in `sense_data` as error tags.
    """

    # print("CLEAN_NODE:", repr(value))
    def clean_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        # Delegates to the caller-supplied template_fn; otherwise strips
        # panel templates by expanding them to the empty string.
        if template_fn is not None:
            return template_fn(name, ht)
        if is_panel_template(wxr, name):
            return ""
        return None

    def clean_node_handler_fn_default(
        node: WikiNode,
    ) -> Optional[list[Union[str, WikiNode]]]:
        # Default node handler: flatten table cells to their children so the
        # cell wrappers do not appear in the cleaned text.
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind in {
            NodeKind.TABLE_CELL,
            NodeKind.TABLE_HEADER_CELL,
        }:
            return node.children
        return None

    if node_handler_fn is not None:
        # override clean_node_handler_fn, the def above can't be accessed
        clean_node_handler_fn = node_handler_fn
    else:
        clean_node_handler_fn = clean_node_handler_fn_default

    # print("clean_node: value={!r}".format(value))
    # NOTE(review): clean_template_fn above is defined but never passed here;
    # node_to_html receives the raw template_fn, so the panel-template
    # fallback in clean_template_fn never runs — confirm whether intentional.
    v = wxr.wtp.node_to_html(
        wikinode,
        node_handler_fn=clean_node_handler_fn,
        template_fn=template_fn,
        post_template_fn=post_template_fn,
    )
    # print("##########")
    # print(f"{wikinode=}")
    # print("clean_node: v={!r}".format(v))

    # Capture categories if sense_data has been given. We also track
    # Lua execution errors here.
    # If collect_links=True (for glosses), capture links
    category_ns_data: NamespaceDataEntry = wxr.wtp.NAMESPACE_DATA.get(
        "Category",
        {},  # type: ignore[typeddict-item]
    )
    # All names the Category namespace can appear under: the localized name,
    # its aliases, plus the canonical English forms.
    category_ns_names: set[str] = {category_ns_data.get("name")} | set(
        category_ns_data.get("aliases")  # type:ignore[assignment,arg-type]
    )
    category_ns_names |= {"Category", "category"}
    category_names_pattern = rf"(?:{'|'.join(category_ns_names)})"
    if sense_data is not None:
        # Check for Lua execution error
        if '<strong class="error">Lua execution error' in v:
            data_append(sense_data, "tags", "error-lua-exec")
        if '<strong class="error">Lua timeout error' in v:
            data_append(sense_data, "tags", "error-lua-timeout")
        # Capture Category tags
        if not collect_links:
            for m in re.finditer(
                rf"(?is)\[\[:?\s*{category_names_pattern}\s*:([^]|]+)",
                v,
            ):
                cat = clean_value(wxr, m.group(1))
                cat = re.sub(r"\s+", " ", cat)
                cat = cat.strip()
                if not cat:
                    continue
                if not sense_data_has_value(sense_data, "categories", cat):
                    data_append(sense_data, "categories", cat)
        else:
            links, categories = extract_links_from_node(
                wxr,
                v,
                category_ns_names=category_ns_names,
                remove_anchor_tags=remove_anchors_from_links,
            )
            for cat in categories:
                # do not keep duplicated category links
                if not sense_data_has_value(sense_data, "categories", cat):
                    data_append(sense_data, "categories", cat)
            # print(f"{links=}")
            for ltuple in links:
                # We want to keep link data as is, even duplicated
                data_append(sense_data, "links", ltuple)

    v = clean_value(wxr, v, no_strip=no_strip, no_html_strip=no_html_strip)
    # print("After clean_value:", repr(v))

    # Strip any unhandled templates and other stuff. This is mostly intended
    # to clean up erroneous codings in the original text.
    # v = re.sub(r"(?s)\{\{.*", "", v)
    # Some templates create <sup>(Category: ...)</sup>; remove
    v = re.sub(
        rf"(?si)\s*(?:<sup>)?\({category_names_pattern}:[^)]+\)(?:</sup>)?",
        "",
        v,
    )
    # Some templates create question mark in <sup>, e.g.,
    # some Korean Hanja form
    v = re.sub(r"\^\?", "", v)
    return v
def sense_data_has_value(
    sense_data: dict[str, Any], name: str, value: Any
) -> bool:
    """Report whether ``value`` already occurs under ``name`` in
    ``sense_data``.

    ``sense_data`` may be an arbitrary object carrying a ``name`` attribute
    (checked first) or a plain dictionary keyed by ``name``; anything else
    yields False.
    """
    if hasattr(sense_data, name):
        return value in getattr(sense_data, name)
    if isinstance(sense_data, dict):
        return value in sense_data.get(name, ())  # type:ignore[operator]
    return False
def extract_links_from_node(
    wxr: WiktextractContext,
    nodes: WikiNode | list[WikiNode | str] | str,
    category_ns_names: set[str] | None = None,
    remove_anchor_tags=False,
    expand_nodes=False,
) -> tuple[list[tuple[str, str]], set[str]]:
    """Find link nodes and extract them as a list of tuples. If
    `category_ns_names` is passed, also extract category names separately and
    return them as a list of strings.

    Each link is returned as a ``(text, target)`` tuple.  Links found both as
    raw ``[[...]]`` wikitext in string nodes and as LINK WikiNodes are
    collected.  When ``remove_anchor_tags`` is True, ``#anchor`` suffixes are
    stripped from link targets (unless the target is only an anchor).
    """
    ret: list[tuple[str, str]] = []
    cat_ret: set[str] = set()

    # Sometimes this function may receive nodes that have not been expanded,
    # and like a head template node. Expanding the involves turning the nodes
    # into wikitext and then parsing and expanding them, so it's expensive.
    if expand_nodes is True:
        nodes = wxr.wtp.parse(wxr.wtp.node_to_wikitext(nodes), expand_all=True)
    if not isinstance(nodes, list):
        nodes = [nodes]
    for node in nodes:
        # print(f"{node=}")
        if isinstance(node, str) and node.strip():
            for m in re.finditer(
                r"(?is)\[\[:?(\s*([^][|:]+):)?\s*([^]|]+)(\|([^]|]+))?\]\]",
                # groups: 1="ns:" prefix, 2=namespace, 3=target, 4="|text", 5=text
                node,
            ):
                if (
                    m.group(2)
                    and category_ns_names is not None
                    and m.group(2).strip() in category_ns_names
                ):
                    # A Category link: record the normalized category name.
                    cat = clean_value(wxr, m.group(3))
                    cat = re.sub(r"\s+", " ", cat).strip()
                    if not cat:
                        continue
                    cat_ret.add(cat)
                elif not m.group(1):
                    # A plain (namespace-less) link; "[[target|text]]" uses
                    # the explicit text, "[[target]]" reuses the target.
                    if m.group(5):
                        ltext = clean_value(wxr, m.group(5))
                        ltarget = clean_value(wxr, m.group(3))
                    elif not m.group(3):
                        continue
                    else:
                        txt = clean_value(wxr, m.group(3))
                        ltext = txt
                        ltarget = txt
                    ltarget = re.sub(r"\s+", " ", ltarget).strip()
                    ltext = re.sub(r"\s+", " ", ltext).strip()
                    if not ltext and not ltarget:
                        continue
                    if not ltext and ltarget:
                        ltext = ltarget
                    ret.append((ltext, ltarget))
        if not isinstance(node, WikiNode):
            continue
        # Parsed LINK nodes: largs[0] is the target, the whole node cleans
        # to the display text.
        for link_node in node.find_child_recursively(NodeKind.LINK):
            if len(link_node.largs) > 0:
                ltarget = clean_node(wxr, None, link_node.largs[0]).strip()
                if not ltarget:
                    continue
                ltext = clean_node(wxr, None, link_node)
                ret.append((ltext, ltarget))
    # XXX extract category links
    if category_ns_names is not None:
        # Move text-less links whose target looks like "Category:Name" from
        # the link list into the category set.
        new_ret: list[tuple[str, str]] = []
        for ltext, ltarget in ret:
            if ltext.strip() or not ltarget.strip():
                new_ret.append((ltext, ltarget))
                continue
            m2 = re.match(r"([^:]+):.+", ltarget)
            if m2 is not None and m2.group(1).strip() in category_ns_names:
                cat_ret.add(ltarget[ltarget.index(":") + 1 :])
            else:
                new_ret.append((ltext, ltarget))
        ret = new_ret
    if remove_anchor_tags is True:
        new_ret = []
        for ltext, ltarget in ret:
            if "#" in ltarget and not ltarget.startswith("#"):
                ltarget = ltarget[: ltarget.index("#")]
            new_ret.append((ltext, ltarget))
        ret = new_ret
    return ret, cat_ret