Coverage for src/wiktextract/extractor/en/page.py: 79%
1839 statements
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
« prev ^ index » next coverage.py v7.14.3, created at 2026-06-23 09:14 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Literal,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .descendant import extract_descendant_section
51from .example import extract_example_list_item, extract_template_zh_x
52from .form_descriptions import (
53 classify_desc,
54 decode_tags,
55 distw,
56 parse_alt_or_inflection_of,
57 parse_sense_qualifier,
58 parse_word_head,
59)
60from .inflection import TableContext, parse_inflection_section
61from .info_templates import (
62 INFO_TEMPLATE_FUNCS,
63 parse_info_template_arguments,
64 parse_info_template_node,
65)
66from .linkages import (
67 extract_alt_form_section,
68 parse_linkage,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 FormData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280# Annoying templates that should be in etymology sections, but sometimes
281# are thrown in heads because the etymology section is missing, like at
282# the oldest level of a reconstruction: see wiktextract#1658
283ETYMOLOGY_TEMPLATES_IN_HEADS = {
284 "etymon",
285}
287PROBLEMATIC_TEMPLATES_CLUMP = (
288 WORD_LEVEL_HEAD_TEMPLATES | ETYMOLOGY_TEMPLATES_IN_HEADS
289)
291FLOATING_TABLE_TEMPLATES: set[str] = {
292 # az-suffix-form creates a style=floatright div that is otherwise
293 # deleted; if it is not pre-expanded, we can intercept the template
294 # so we add this set into do_not_pre_expand, and intercept the
295 # templates in parse_part_of_speech
296 "az-suffix-forms",
297 "az-inf-p",
298 "kk-suffix-forms",
299 "ky-suffix-forms",
300 "tr-inf-p",
301 "tr-suffix-forms",
302 "tt-suffix-forms",
303 "uz-suffix-forms",
304}
# These two sets contain names of templates that should either always be
# pre-expanded when *first* processing the tree, or never be pre-expanded
# so that the templates are left in place with their identifying
# names intact for later filtering.
310DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
311DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
313# Additional templates to be expanded in the pre-expand phase
314ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
315 "multitrans",
316 "multitrans-nowiki",
317 "trans-top",
318 "trans-top-also",
319 "trans-bottom",
320 "checktrans-top",
321 "checktrans-bottom",
322 "col",
323 "col1",
324 "col2",
325 "col3",
326 "col4",
327 "col5",
328 "col1-u",
329 "col2-u",
330 "col3-u",
331 "col4-u",
332 "col5-u",
333 "check deprecated lang param usage",
334 "deprecated code",
335 "ru-verb-alt-ё",
336 "ru-noun-alt-ё",
337 "ru-adj-alt-ё",
338 "ru-proper noun-alt-ё",
339 "ru-pos-alt-ё",
340 "ru-alt-ё",
341 "inflection of",
342 "no deprecated lang param usage",
343 "transclude", # these produce sense entries (or other lists)
344 "tcl",
345}
347# Inverse linkage for those that have them
348linkage_inverses: dict[str, str] = {
349 # XXX this is not currently used, move to post-processing
350 "synonyms": "synonyms",
351 "hypernyms": "hyponyms",
352 "hyponyms": "hypernyms",
353 "holonyms": "meronyms",
354 "meronyms": "holonyms",
355 "derived": "derived_from",
356 "coordinate_terms": "coordinate_terms",
357 "troponyms": "hypernyms",
358 "antonyms": "antonyms",
359 "instances": "instance_of",
360 "related": "related",
361}
363# Templates that are used to form panels on pages and that
364# should be ignored in various positions
PANEL_TEMPLATES: set[str] = {
    "Character info",
    "CJKV",
    "French personal pronouns",
    "French possessive adjectives",
    "French possessive pronouns",
    "Han etym",
    "Han etyl",  # this redirects to Han etym and would cause Lua errors,
    # and I don't know why, but I'm putting it here because
    # we should be ignoring it anyhow.
    "Japanese demonstratives",
    "Latn-script",
    "LDL",
    "MW1913Abbr",
    "Number-encoding",
    "Nuttall",
    "Spanish possessive adjectives",
    "Spanish possessive pronouns",
    "USRegionDisputed",
    "Webster 1913",
    "ase-rfr",
    "attention",
    "attn",
    "beer",
    "broken ref",
    "ca-compass",
    "character info",
    "character info/var",
    "checksense",
    "compass-fi",
    "copyvio suspected",
    "delete",
    "dial syn",  # Currently ignore these, but could be useful in Chinese/Korean
    "etystub",
    "examples",
    "hu-corr",
    "hu-suff-pron",
    "interwiktionary",
    "ja-kanjitab",
    "ja-kt",
    "ko-hanja-search",
    "look",
    "maintenance box",
    "maintenance line",
    "mediagenic terms",
    "merge",
    "missing template",
    "morse links",
    "move",
    "multiple images",
    "no inline",
    "picdic",
    "picdicimg",
    "picdiclabel",
    "polyominoes",
    # "predidential nomics" is a historical misspelling; it is kept so that
    # nothing relying on the old entry changes, and the correctly spelled
    # template name is listed next to it so the template is actually matched.
    "predidential nomics",
    "presidential nomics",
    "punctuation",  # This actually gets pre-expanded
    "reconstructed",
    "request box",
    "rf-sound example",
    "rfaccents",
    "rfap",
    "rfaspect",
    "rfc",
    "rfc-auto",
    "rfc-header",
    "rfc-level",
    "rfc-pron-n",
    "rfc-sense",
    "rfclarify",
    "rfd",
    "rfd-redundant",
    "rfd-sense",
    "rfdate",
    "rfdatek",
    "rfdef",
    "rfe",
    "rfe/dowork",
    "rfex",
    "rfexp",
    "rfform",
    "rfgender",
    "rfi",
    "rfinfl",
    "rfm",
    "rfm-sense",
    "rfp",
    "rfp-old",
    "rfquote",
    "rfquote-sense",
    "rfquotek",
    "rfref",
    "rfscript",
    "rft2",
    "rftaxon",
    "rftone",
    "rftranslit",
    "rfv",
    "rfv-etym",
    "rfv-pron",
    "rfv-quote",
    "rfv-sense",
    "selfref",
    "split",
    "stroke order",  # XXX consider capturing this?
    "stub entry",
    "t-needed",
    "tbot entry",
    "tea room",
    "tea room sense",
    # "ttbc", - XXX needed in at least on/Preposition/Translation page
    "unblock",
    "unsupportedpage",
    "video frames",
    "was wotd",
    "wrongtitle",
    "zh-forms",
    "zh-hanzi-box",
    "no entry",
}
486# Template name prefixes used for language-specific panel templates (i.e.,
487# templates that create side boxes or notice boxes or that should generally
488# be ignored).
489PANEL_PREFIXES: set[str] = {
490 "list:compass points/",
491 "list:Gregorian calendar months/",
492 "RQ:",
493}
495# Templates used for wikipedia links.
496wikipedia_templates: set[str] = {
497 "wikipedia",
498 "slim-wikipedia",
499 "w",
500 "W",
501 "swp",
502 "wiki",
503 "Wikipedia",
504 "wtorw",
505}
# Import-time sanity check: warn about any template name that is listed both
# as a panel template and as a Wikipedia-link template.  The comparison uses
# PANEL_TEMPLATES, which is what the warning text reports; PANEL_PREFIXES
# holds name *prefixes* (e.g. "RQ:"), not template names, so intersecting
# with it could never detect such a clash.
for x in PANEL_TEMPLATES & wikipedia_templates:
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )
513# Mapping from a template name (without language prefix) for the main word
514# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
515# it could validly occur. This is used as just a sanity check to give
516# warnings about probably incorrect coding in Wiktionary.
517template_allowed_pos_map: dict[str, list[str]] = {
518 "abbr": ["abbrev"],
519 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
520 "plural noun": ["noun", "name"],
521 "plural-noun": ["noun", "name"],
522 "proper noun": ["noun", "name"],
523 "proper-noun": ["name", "noun"],
524 "prop": ["name", "noun"],
525 "verb": ["verb", "phrase"],
526 "gerund": ["verb"],
527 "particle": ["adv", "particle"],
528 "adj": ["adj", "adj_noun"],
529 "pron": ["pron", "noun"],
530 "name": ["name", "noun"],
531 "adv": ["adv", "intj", "conj", "particle"],
532 "phrase": ["phrase", "prep_phrase"],
533 "noun phrase": ["phrase"],
534 "ordinal": ["num"],
535 "number": ["num"],
536 "pos": ["affix", "name", "num"],
537 "suffix": ["suffix", "affix"],
538 "character": ["character"],
539 "letter": ["character"],
540 "kanji": ["character"],
541 "cont": ["abbrev"],
542 "interj": ["intj"],
543 "con": ["conj"],
544 "part": ["particle"],
545 "prep": ["prep", "postp"],
546 "postp": ["postp"],
547 "misspelling": ["noun", "adj", "verb", "adv"],
548 "part-form": ["verb"],
549}
# Import-time sanity check: every part-of-speech named in
# template_allowed_pos_map must be contained in PARTS_OF_SPEECH.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            msg = (
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
            print(msg)
            # Raise explicitly instead of `assert False`: assert statements
            # are removed when Python runs with -O, which would let a bad
            # table through silently.  The exception type is unchanged.
            raise AssertionError(msg)
560# Templates ignored during etymology extraction, i.e., these will not be listed
561# in the extracted etymology templates.
562ignored_etymology_templates: list[str] = [
563 "...",
564 "IPAchar",
565 "ipachar",
566 "ISBN",
567 "isValidPageName",
568 "redlink category",
569 "deprecated code",
570 "check deprecated lang param usage",
571 "para",
572 "p",
573 "cite",
574 "Cite news",
575 "Cite newsgroup",
576 "cite paper",
577 "cite MLLM 1976",
578 "cite journal",
579 "cite news/documentation",
580 "cite paper/documentation",
581 "cite video game",
582 "cite video game/documentation",
583 "cite newsgroup",
584 "cite newsgroup/documentation",
585 "cite web/documentation",
586 "cite news",
587 "Cite book",
588 "Cite-book",
589 "cite book",
590 "cite web",
591 "cite-usenet",
592 "cite-video/documentation",
593 "Cite-journal",
594 "rfe",
595 "catlangname",
596 "cln",
597 "langname-lite",
598 "no deprecated lang param usage",
599 "mention",
600 "m",
601 "m-self",
602 "link",
603 "l",
604 "ll",
605 "l-self",
606]
607# Regexp for matching ignored etymology template names. This adds certain
608# prefixes to the names listed above.
609ignored_etymology_templates_re = re.compile(
610 r"^((cite-|R:|RQ:).*|"
611 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
612 + r")$"
613)
615# Regexp for matching ignored descendants template names. Right now we just
616# copy the ignored etymology templates
617ignored_descendants_templates_re = ignored_etymology_templates_re
# Set of template names that are used to define usage examples. If the usage
# example contains one of these templates, then its type is set to
# "example".
usex_templates: set[str] = {
    "afex",
    "affixusex",
    "co",  # {{collocation}} acts like an example template, specifically for
    # combinations of words that occur together more often than you'd
    # expect by chance; see hlavní#Czech
    "coi",
    "collocation",
    "el-example",
    "el-x",
    "example",
    "examples",
    "he-usex",
    "he-x",
    "hi-usex",
    "hi-x",
    "ja-usex-inline",
    "ja-usex",
    "ja-x",
    "jbo-example",
    "jbo-x",
    "km-usex",
    "km-x",
    "ko-usex",
    "ko-x",
    "lo-usex",
    "lo-x",
    "ne-x",
    "ne-usex",
    "prefixusex",
    "ryu-usex",
    "ryu-x",
    "shn-usex",
    "shn-x",
    "suffixusex",
    "th-usex",
    "th-x",
    "ur-usex",
    "ur-x",
    "usex",
    "usex-suffix",
    "ux",
    "uxi",
}
667stop_head_at_these_templates: set[str] = {
668 "category",
669 "cat",
670 "topics",
671 "catlangname",
672 "c",
673 "C",
674 "top",
675 "cln",
676}
678# Set of template names that are used to define quotation examples. If the
679# usage example contains one of these templates, then its type is set to
680# "quotation".
681quotation_templates: set[str] = {
682 "collapse-quote",
683 "quote-av",
684 "quote-book",
685 "quote-GYLD",
686 "quote-hansard",
687 "quotei",
688 "quote-journal",
689 "quotelite",
690 "quote-mailing list",
691 "quote-meta",
692 "quote-newsgroup",
693 "quote-song",
694 "quote-text",
695 "quote",
696 "quote-us-patent",
697 "quote-video game",
698 "quote-web",
699 "quote-wikipedia",
700 "wikiquote",
701 "Wikiquote",
702 "Q",
703}
705taxonomy_templates = {
706 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
707 "taxfmt",
708 "taxlink",
709 "taxlink2",
710 "taxlinknew",
711 "taxlook",
712}
# Template names; these were extracted from template_linkage_mappings,
# because the code using template_linkage_mappings was actually not used
# (but not removed).
717template_linkages_to_ignore_in_examples: set[str] = {
718 "syn",
719 "synonyms",
720 "ant",
721 "antonyms",
722 "hyp",
723 "hyponyms",
724 "der",
725 "derived terms",
726 "coordinate terms",
727 "cot",
728 "rel",
729 "col",
730 "inline alt forms",
731 "alti",
732 "comeronyms",
733 "holonyms",
734 "holo",
735 "hypernyms",
736 "hyper",
737 "meronyms",
738 "mero",
739 "troponyms",
740 "perfectives",
741 "pf",
742 "imperfectives",
743 "impf",
744 "syndiff",
745 "synsee",
746 # not linkage nor example templates
747 "sense",
748 "s",
749 "color panel",
750 "colour panel",
751}
753# Maps template name used in a word sense to a linkage field that it adds.
754sense_linkage_templates: dict[str, str] = {
755 "syn": "synonyms",
756 "synonyms": "synonyms",
757 "synsee": "synonyms",
758 "syndiff": "synonyms",
759 "hyp": "hyponyms",
760 "hyponyms": "hyponyms",
761 "ant": "antonyms",
762 "antonyms": "antonyms",
763 "alti": "related",
764 "inline alt forms": "related",
765 "coordinate terms": "coordinate_terms",
766 "cot": "coordinate_terms",
767 "comeronyms": "related",
768 "holonyms": "holonyms",
769 "holo": "holonyms",
770 "hypernyms": "hypernyms",
771 "hyper": "hypernyms",
772 "meronyms": "meronyms",
773 "mero": "meronyms",
774 "troponyms": "troponyms",
775 "perfectives": "related",
776 "pf": "related",
777 "imperfectives": "related",
778 "impf": "related",
779 "parasynonyms": "synonyms",
780 "par": "synonyms",
781 "parasyn": "synonyms",
782 "nearsyn": "synonyms",
783 "near-syn": "synonyms",
784}
786sense_linkage_templates_tags: dict[str, list[str]] = {
787 "alti": ["alternative"],
788 "inline alt forms": ["alternative"],
789 "comeronyms": ["comeronym"],
790 "perfectives": ["perfective"],
791 "pf": ["perfective"],
792 "imperfectives": ["imperfective"],
793 "impf": ["imperfective"],
794}
def decode_html_entities(v: Union[str, int]) -> str:
    """Return ``v`` as text with HTML entities replaced by their characters.

    Integers cannot contain entity references (``&xx;``), so they are simply
    converted with ``str``; strings are passed through ``html.unescape``.
    """
    return html.unescape(v) if not isinstance(v, int) else str(v)
def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
    pos: str,
) -> None:
    """Parses a linkage (synonym, etc) specified in a word sense.

    ``name`` is the linkage template name and must be a key of
    ``sense_linkage_templates``, which gives the field of ``data`` that the
    entries are appended to.  ``ht`` holds the template arguments: argument
    1 is used as the language code for thesaurus lookups, positional
    arguments from 2 upwards are the linked terms, and ``q<N>`` / ``t<N>``
    (with N = position - 1) are read as the qualifier and the English
    translation of the term.  ``pos`` is passed on to the thesaurus search.
    Nothing is returned; the results are added to ``data`` in place.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    # Extra tags attached to every entry produced by this template, if any.
    field_tags = sense_linkage_templates_tags.get(name, [])
    # Walk positional arguments 2..19 and stop at the first missing index.
    for i in range(2, 20):
        if i not in ht:
            break
        w = clean_node(wxr, data, ht[i])
        # Drop everything from the first "#" onwards.
        if "#" in w:
            w = w[: w.index("#")]
        if w in ["", "<"]:  # `<` used in "hypernyms" template
            continue
        # `and` binds tighter than `or`: bare separators are skipped only
        # after the first term, "see also" entries at any position.
        if (
            i > 2
            and w in (",", "or", ";")
            or w.startswith(("see also", "See also"))
        ):
            continue
        is_thesaurus = False
        # A term that starts with a Thesaurus namespace prefix is a reference
        # to a thesaurus page; it is expanded into the entries returned by
        # search_thesaurus instead of being added itself.
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                is_thesaurus = True
                w = w[len(alias) :]
                if w != wxr.wtp.title:
                    from ...thesaurus import search_thesaurus

                    lang_code = clean_node(wxr, None, ht.get(1, ""))
                    # The linkage type given to the search is always
                    # "synonyms", while the results are stored under `field`.
                    for t_data in search_thesaurus(
                        wxr.thesaurus_db_conn,  # type: ignore
                        w,
                        lang_code,
                        pos,
                        "synonyms",  # GH issue #1570
                    ):
                        l_data: LinkageData = {
                            "word": t_data.term,
                            "source": "Thesaurus:" + w,
                        }
                        if len(t_data.tags) > 0:
                            l_data["tags"] = t_data.tags
                        if len(t_data.raw_tags) > 0:
                            l_data["raw_tags"] = t_data.raw_tags
                        data_append(data, field, l_data)
                # Only the first matching prefix is handled.
                break
        if is_thesaurus:
            continue
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        # (numbered from 1, so q1 belongs to positional argument 2).
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # `english` was set to None just above, so only the else
                # branch can run here.
                if english:
                    english += "; " + q
                else:
                    english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        # Assemble the linkage entry and append it to data[field].
        dt = {"word": w}
        if field_tags:
            data_extend(dt, "tags", field_tags)
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english  # DEPRECATED for "translation"
            dt["translation"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)
# Parts of an example line are separated by runs of horizontal-bar / em-dash
# characters, optionally surrounded by whitespace.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# The same pattern inside a capturing group, so that ``split`` keeps the
# separators in its result.
captured_splitters_re = re.compile("(%s)" % EXAMPLE_SPLITTERS)


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """Re-split an example line using the template arguments as a guide.

    The number of separators occurring inside the original-language
    argument (2) and inside the translation argument (3, "translation" or
    "t") tells how many pieces of ``line`` belong to each of them; those
    pieces are glued back together, separators included.  Returns the
    original text, the translation and any remaining parts (without their
    separators), or None when either of the first two comes out empty.
    """
    # Alternating text / separator / text / ... thanks to the capture group.
    pieces = captured_splitters_re.split(line)

    # The original text spans one piece plus two per separator it contains.
    original_end = 2 * len(example_splitter_re.findall(targs.get(2, ""))) + 1
    original_text = "".join(pieces[:original_end])

    translation_arg = (
        targs.get(3) or targs.get("translation") or targs.get("t", "")
    )
    # Skip the separator that follows the original text, then count the
    # translation's pieces the same way.
    translation_end = (
        original_end + 2 + 2 * len(example_splitter_re.findall(translation_arg))
    )
    translation_text = "".join(pieces[original_end + 1 : translation_end])

    if not (original_text and translation_text):
        return None
    # Every second remaining element is a separator; keep only the text.
    return [original_text, translation_text, *pieces[translation_end + 1 :: 2]]
# Matches a leading parenthesised group followed by an optional colon and
# whitespace; one level of nested parentheses is allowed inside the group:
# (...): ... or (...(...)...): ...
QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
QUALIFIERS_RE = re.compile(QUALIFIERS)
951def parse_language(
952 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
953) -> list[WordData]:
954 """Iterates over the text of the page, returning words (parts-of-speech)
955 defined on the page one at a time. (Individual word senses for the
956 same part-of-speech are typically encoded in the same entry.)"""
957 # imported here to avoid circular import
958 from .pronunciation import parse_pronunciation
960 assert isinstance(wxr, WiktextractContext)
961 assert isinstance(langnode, WikiNode)
962 assert isinstance(language, str)
963 assert isinstance(lang_code, str)
964 # print("parse_language", language)
966 is_reconstruction = False
967 word: str = wxr.wtp.title # type: ignore[assignment]
968 unsupported_prefix = "Unsupported titles/"
969 if word.startswith(unsupported_prefix):
970 w = word[len(unsupported_prefix) :]
971 if w in unsupported_title_map: 971 ↛ 974line 971 didn't jump to line 974 because the condition on line 971 was always true
972 word = unsupported_title_map[w]
973 else:
974 wxr.wtp.error(
975 "Unimplemented unsupported title: {}".format(word),
976 sortid="page/870",
977 )
978 word = w
979 elif word.startswith("Reconstruction:"):
980 word = word[word.find("/") + 1 :]
981 is_reconstruction = True
982 elif word.startswith("a/languages"): 982 ↛ 984line 982 didn't jump to line 984 because the condition on line 982 was never true
983 # ATM there's only one "mammoth page" in English wiktionary, 'a'
984 word = "a"
986 base_data: WordData = {
987 "word": word,
988 "lang": language,
989 "lang_code": lang_code,
990 }
991 if is_reconstruction:
992 data_append(base_data, "tags", "reconstruction")
993 sense_data: SenseData = {}
994 pos_data: WordData = {} # For a current part-of-speech
995 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
996 etym_data: WordData = {} # For one etymology
997 sense_datas: list[SenseData] = []
998 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
999 # Never reset, do not use as data
1000 level_four_datas: list[WordData] = []
1001 etym_datas: list[WordData] = []
1002 page_datas: list[WordData] = []
1003 have_etym = False
1004 inside_level_four = False # This is for checking if the etymology section
1005 # or article has a Pronunciation section, for Chinese mostly; because
1006 # Chinese articles can have three level three sections (two etymology
1007 # sections and pronunciation sections) one after another, we need a kludge
1008 # to better keep track of whether we're in a normal "etym" or inside a
1009 # "level four" (which is what we've turned the level three Pron sections
1010 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
1011 # a step.
1012 stack: list[str] = [] # names of items on the "stack"
# Closure defined inside parse_language; it uses `wxr` from that scope.
def merge_base(data: WordData, base: WordData) -> None:
    """Merge the entries of ``base`` into ``data`` in place, then prune
    ``data["sounds"]``.

    Keys missing from ``data`` receive a deep copy of the base value.  When
    both have the key and the values differ, they are concatenated as lists
    if either is a list or tuple; otherwise a warning about the conflict is
    logged and the value in ``data`` is kept.  Afterwards sounds whose
    "form" or "pos" does not fit the merged entry are dropped and the
    temporary "pos" key is removed from the remaining sounds.
    """
    for k, v in base.items():
        # Copy the value to ensure that we don't share lists or
        # dicts between structures (even nested ones).
        v = copy.deepcopy(v)
        if k not in data:
            # The list was copied above, so this will not create shared ref
            data[k] = v  # type: ignore[literal-required]
            continue
        if data[k] == v:  # type: ignore[literal-required]
            continue
        # NOTE(review): with `or`, a non-sequence on one side is passed to
        # list() as well; for a str that yields its individual characters.
        if (
            isinstance(data[k], (list, tuple))  # type: ignore[literal-required]
            or isinstance(
                v,
                (list, tuple),  # Should this be "and"?
            )
        ):
            data[k] = list(data[k]) + list(v)  # type: ignore
        # Equal values were skipped above, so this acts as a plain else.
        elif data[k] != v:  # type: ignore[literal-required]
            wxr.wtp.warning(
                "conflicting values for {} in merge_base: "
                "{!r} vs {!r}".format(k, data[k], v),  # type: ignore[literal-required]
                sortid="page/904",
            )

    def complementary_pop(pron: SoundData, key: str) -> SoundData:
        """Remove unnecessary keys from dict values
        in a list comprehension..."""
        if key in pron:
            pron.pop(key)  # type: ignore
        return pron

    def sound_matches_pos(sound: SoundData, pos: str) -> bool:
        # A sound without a "pos" restriction matches every part of speech.
        if "pos" not in sound:
            return True
        sound_pos = sound["pos"]  # type: ignore[typeddict-item]
        # Containment test; whether this is element or substring matching
        # depends on the type stored under "pos" -- TODO confirm.
        return pos in sound_pos

    def strip_sound_pos(sound: SoundData) -> SoundData:
        # Removes the "pos" key in place and returns the same dict.
        complementary_pop(sound, "pos")
        return sound

    # If the result has sounds, eliminate sounds that have a prefix that
    # does not match "word" or one of "forms"
    if "sounds" in data and "word" in data:
        accepted = [data["word"]]
        accepted.extend(f["form"] for f in data.get("forms", dict()))
        data["sounds"] = list(
            s
            for s in data["sounds"]
            if "form" not in s or s["form"] in accepted
        )
    # If the result has sounds, eliminate sounds that have a pos that
    # does not match "pos"
    if "sounds" in data and "pos" in data:
        data["sounds"] = list(
            strip_sound_pos(s)
            for s in data["sounds"]
            # "pos" is not a field of SoundData, correctly, so we're
            # removing it here. It's a kludge on a kludge on a kludge.
            if sound_matches_pos(s, data["pos"])
        )
    elif "sounds" in data:
        # No part of speech to compare with: keep every sound, but still
        # remove the temporary "pos" key.
        data["sounds"] = [strip_sound_pos(s) for s in data["sounds"]]
# Closure defined inside parse_language; it reads and rebinds that
# function's sense-collection state.
def push_sense(sorting_ordinal: int | None = None) -> bool:
    """Starts collecting data for a new word sense.  This returns True
    if a sense was added.

    The current ``sense_data`` is appended to ``sense_datas`` only when it
    has glosses or carries the "translation-hub" or "no-gloss" tag;
    otherwise nothing changes and False is returned.  ``sorting_ordinal``
    defaults to the enclosing ``sense_ordinal`` and is stored on the sense
    under "__temp_sense_sorting_ordinal".
    """
    nonlocal sense_data
    if sorting_ordinal is None:
        sorting_ordinal = sense_ordinal
    tags = sense_data.get("tags", ())
    # Nothing worth keeping: no glosses and neither special tag.
    if (
        not sense_data.get("glosses")
        and "translation-hub" not in tags
        and "no-gloss" not in tags
    ):
        return False

    # For participle/infinitive senses that have no alt_of/form_of yet, try
    # to derive one from the first sentence of the etymology text.
    if (
        (
            "participle" in sense_data.get("tags", ())
            or "infinitive" in sense_data.get("tags", ())
        )
        and "alt_of" not in sense_data
        and "form_of" not in sense_data
        and "etymology_text" in etym_data
        and etym_data["etymology_text"] != ""
    ):
        etym = etym_data["etymology_text"]
        etym = etym.split(". ")[0]
        ret = parse_alt_or_inflection_of(wxr, etym, set())
        if ret is not None:
            # Note: this rebinds the local `tags` read above.
            tags, lst = ret
            assert isinstance(lst, (list, tuple))
            if "form-of" in tags:
                data_extend(sense_data, "form_of", lst)
                data_extend(sense_data, "tags", tags)
            elif "alt-of" in tags:
                data_extend(sense_data, "alt_of", lst)
                data_extend(sense_data, "tags", tags)

    # A sense that got this far without glosses is marked as "no-gloss"
    # unless it already has that tag.
    if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
        "tags", ()
    ):
        data_append(sense_data, "tags", "no-gloss")

    sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal  # type: ignore
    sense_datas.append(sense_data)
    # Start from an empty dict for the next sense.
    sense_data = {}
    return True
1127 def push_pos(sorting_ordinal: int | None = None) -> None:
1128 """Starts collecting data for a new part-of-speech."""
1129 nonlocal pos_data
1130 nonlocal sense_datas
1131 push_sense(sorting_ordinal)
1132 if wxr.wtp.subsection:
1133 data: WordData = {"senses": sense_datas}
1134 merge_base(data, pos_data)
1135 level_four_datas.append(data)
1136 pos_data = {}
1137 sense_datas = []
1138 wxr.wtp.start_subsection(None)
1140 def push_level_four_section(clear_sound_data: bool) -> None:
1141 """Starts collecting data for a new level four sections, which
1142 is usually virtual and empty, unless the article has Chinese
1143 'Pronunciation' sections that are etymology-section-like but
1144 under etymology, and at the same level in the source. We modify
1145 the source to demote Pronunciation sections like that to level
1146 4, and other sections one step lower."""
1147 nonlocal level_four_data
1148 nonlocal level_four_datas
1149 nonlocal etym_datas
1150 push_pos()
1151 # print(f"======\n{etym_data=}")
1152 # print(f"======\n{etym_datas=}")
1153 # print(f"======\n{level_four_data=}")
1154 # print(f"======\n{level_four_datas=}")
1155 for data in level_four_datas:
1156 merge_base(data, level_four_data)
1157 etym_datas.append(data)
1158 for data in etym_datas:
1159 merge_base(data, etym_data)
1160 page_datas.append(data)
1161 if clear_sound_data:
1162 level_four_data = {}
1163 level_four_datas = []
1164 etym_datas = []
1166 def push_etym() -> None:
1167 """Starts collecting data for a new etymology."""
1168 nonlocal etym_data
1169 nonlocal etym_datas
1170 nonlocal have_etym
1171 nonlocal inside_level_four
1172 have_etym = True
1173 push_level_four_section(False)
1174 inside_level_four = False
1175 # etymology section could under pronunciation section
1176 etym_data = (
1177 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1178 )
def select_data() -> WordData:
    """Return the container that newly parsed data should be stored in.

    Precedence: the part-of-speech data while a subsection is active, then
    the level-four data while inside a level-four section, then the base
    data when the top of ``stack`` is the language itself, and otherwise
    the etymology data.
    """
    if wxr.wtp.subsection is None:
        if inside_level_four:
            return level_four_data
        return base_data if stack[-1] == language else etym_data
    return pos_data
def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parse the subsection for one part-of-speech under a language.

    The children of ``posnode`` are split into *heads* (``pre``: the nodes
    that precede a sense list, normally the head-word line) and the sense
    lists that follow each head (``lists``, kept parallel to ``pre``).
    Each head is passed to ``process_gloss_header`` and each list item to
    ``parse_sense_node``; sections without any list are handled by
    ``process_gloss_without_list``.  Finally the collected ``sense_datas``
    are sorted by the temporary ordinal stored on each sense and that
    ordinal is removed again.

    Args:
        posnode: The level node of the part-of-speech section.
        pos: The normalized part-of-speech name, stored as ``pos_data["pos"]``.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and (
                (
                    isinstance(x, TemplateNode)
                    and x.template_name in FLOATING_TABLE_TEMPLATES
                )
                or (
                    x.kind == NodeKind.LINK
                    # Need to check for stringiness because some links are
                    # broken; for example, if a template is missing an
                    # argument, a link might look like `[[{{{1}}}...]]`
                    and len(x.largs) > 0
                    and len(x.largs[0]) > 0
                    and isinstance(x.largs[0][0], str)
                    and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                )
            )
        ),
    )
    # The extracted floating nodes are parsed as if they were the body of
    # an inflection section.
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    for node in poschildren:
        if isinstance(node, str):
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                if p.startswith("\n\n") and pre:
                    # A blank line ends the first paragraph; the rest of
                    # this string is not collected.
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            if len(node.largs[0]) >= 1 and isinstance(node.largs[0][0], str):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].append(node)
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # Panel templates are skipped elsewhere as well; skipping them
            # here works well for head parsing too.
            if is_panel_template(wxr, node.template_name):
                continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not. Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists (wiktextract issue #314).
    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1 and pairless_pre_index is not None:
            # A list without a head belongs to the last kept head that
            # had no list of its own.
            cleaned_lists[pairless_pre_index] = ls
            pairless_pre_index = None
            continue
        if pre1 and not ls:
            # Only remember heads that are actually kept: recording the
            # index before the whitespace-only skip above could leave it
            # pointing at a slot that never gets appended (IndexError) or
            # at a different head that already owns lists.
            pairless_pre_index = len(cleaned_pre)
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []
    header_topics: list[str] = []
    previous_head_had_list = False

    if not any(g for g in lists):
        process_gloss_without_list(
            poschildren, pos, pos_data, header_tags, header_topics
        )
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            if all(not sl for sl in lists[i:]):
                # NOTE(review): `node` below is whatever the preceding
                # loops left bound, not necessarily part of this head.
                # Exactly one diagnostic is emitted per case.
                if i == 0:
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "first head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1689/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        if node.largs and node.largs[0][0] in [
                            "Han char",
                        ]:
                            # just ignore these templates
                            pass
                        else:
                            wxr.wtp.debug(
                                "first head without "
                                "list of senses, "
                                "template node "
                                "{}, {}/{}".format(
                                    node.largs, word, language
                                ),
                                sortid="page/1694/20221215",
                            )
                    else:
                        wxr.wtp.debug(
                            "first head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1700/20221215",
                        )
                    # no break here so that the first head always
                    # gets processed.
                else:
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1708/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "template node "
                            "{}, {}/{}".format(
                                node.sarg if node.sarg else node.largs,
                                word,
                                language,
                            ),
                            sortid="page/1713/20221215",
                        )
                    else:
                        wxr.wtp.debug(
                            "later head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1719/20221215",
                        )
                    break
            head_group = i + 1 if there_are_many_heads else None

            if previous_head_had_list:
                # We use a boolean flag here because we want to be able
                # let the header_tags data pass through after the loop
                # is over without accidentally emptying it, if there are
                # no pos_datas and we need a dummy data.
                header_tags.clear()
                header_topics.clear()

            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags, header_topics
            )
            for ln in ls:
                # Parse each list associated with this head.
                for node in ln.children:
                    # parse_sense_node recurses into the list item and uses
                    # push_sense() to add to sense_datas; its boolean
                    # result tells higher recursion levels whether a sense
                    # was already pushed further down.
                    common_data: SenseData = {
                        "tags": list(header_tags),
                        "topics": list(header_topics),
                    }
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]

            previous_head_had_list = len(ls) > 0

    # If there are no senses extracted, add a dummy sense. We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(sense_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_extend(sense_data, "topics", header_topics)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()

    sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))  # type: ignore

    for sd in sense_datas:
        if "__temp_sense_sorting_ordinal" in sd:
            del sd["__temp_sense_sorting_ordinal"]  # type: ignore
1535 term_label_templates: list[TemplateData] = []
1536 normal_label_templates: list[TemplateData] = []
def head_post_template_fn(
    name: str, ht: TemplateArgs, expansion: str
) -> Optional[str]:
    """Post-expansion hook for templates found in a word's head section.

    The head section is the text between the part-of-speech title and the
    sense list.  It usually produces the bold head-word line but can also
    carry extra information (often rendered as side boxes), some of which
    is recorded here.

    Returns ``""`` to drop the template's expansion from the head text,
    ``None`` to keep the expansion unchanged, or a replacement string.
    """
    if is_panel_template(wxr, name):
        # Ignored completely; not even recorded in head_templates.
        return ""

    if name == "head":
        # XXX are these also captured in forms? Should this special case
        # be removed?
        head_kind = ht.get(2, "")
        if head_kind == "pinyin":
            data_append(pos_data, "tags", "Pinyin")
        elif head_kind == "romanization":
            data_append(pos_data, "tags", "romanization")

    looks_like_head_template = (
        HEAD_TAG_RE.search(name) is not None
        or name in PROBLEMATIC_TEMPLATES_CLUMP
    )
    if looks_like_head_template:
        head_record: TemplateData = {
            "name": name,
            "args": clean_template_args(wxr, ht),
            "expansion": clean_node(wxr, None, expansion),
        }
        if name in ETYMOLOGY_TEMPLATES_IN_HEADS:
            data_append(pos_data, "etymology_templates", head_record)
        else:
            data_append(pos_data, "head_templates", head_record)
        if name in WORD_LEVEL_HEAD_TEMPLATES:
            # Squashed: their tags apply to the whole word, and some of
            # them (like "term-label") cause problems in the head text.
            term_label_templates.append(head_record)
            return ""

    # Templates below may have been recorded above and are additionally
    # handled one by one.
    if name in wikipedia_templates:
        # Various places expect content from wikipedia templates, so the
        # expansion must not be turned into an empty string.
        parse_wikipedia_template(wxr, pos_data, ht)
        return None

    # XXX none of these are extracted yet; their expansion is dropped.
    # ("cardinalbox" can also occur at top level under the language.)
    if name in (
        "number box",
        "enum",
        "cardinalbox",
        "Han simplified forms",
        "picdic",
        "picdicimg",
        "picdiclabel",
        "defdate",
    ):
        return ""

    if name in ("lb", "lbl", "label"):
        label_record = {
            "name": name,
            "args": clean_template_args(wxr, ht),
            "expansion": clean_node(wxr, None, expansion).strip("()"),
        }
        normal_label_templates.append(label_record)
        # The parentheses around the placeholder are meaningful: label
        # templates generate parenthesized text, so the placeholder is
        # treated like a normal parenthetical, and the stored label is
        # only looked up when that parenthetical is handled.
        placeholder_index = len(normal_label_templates) - 1
        return f"(__LABEL_TEMPLATE_{placeholder_index}__)"

    return None
def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Parse one head (the nodes preceding a sense list) of a POS section.

    Info templates are extracted into ``pos_data["info_templates"]``,
    Japanese ruby and Vietnamese ``vi-readings`` data are collected, the
    remaining nodes are cleaned into head text and passed to
    ``parse_word_head``.  Tags found on ``pos_data`` afterwards, plus tags
    and topics decoded from word-level label templates, are appended to
    ``header_tags`` / ``header_topics`` (both mutated in place).

    Args:
        header_nodes: Nodes making up the head.
        pos_type: Part-of-speech name passed on to ``parse_word_head``.
        header_group: 1-based head number, or ``None`` for a single head.
        pos_data: Word data of the current part-of-speech; updated in place.
        header_tags: Output list for tags that apply to the senses.
        header_topics: Output list for topics that apply to the senses.
    """
    ruby = []

    # Pull info templates out of the head, keeping any replacement node.
    new_nodes = []
    info_template_data = []
    for node in header_nodes:
        info_data, info_out = parse_info_template_node(wxr, node, "head")
        if info_data or info_out:
            if info_data:
                info_template_data.append(info_data)
            if info_out:  # including just the original node
                new_nodes.append(info_out)
        else:
            new_nodes.append(node)
    header_nodes = new_nodes

    if info_template_data:
        if "info_templates" not in pos_data:
            pos_data["info_templates"] = info_template_data
        else:
            pos_data["info_templates"].extend(info_template_data)

    if lang_code == "ja":
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            exp.children,
            lambda x: (
                isinstance(x, WikiNode)
                and x.kind == NodeKind.HTML
                and x.sarg == "ruby"
            ),
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # we know the lambda above in recursively_extract
                    # returns only WikiNodes in rub
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)
    elif lang_code == "vi":
        # Handle vi-readings templates, which have an unusual structure
        # for Chu Nom Vietnamese character heads:
        # https://en.wiktionary.org/wiki/Template:vi-readings
        remaining_nodes = []
        related_readings: list[LinkageData] = []
        for node in header_nodes:
            if not (
                isinstance(node, TemplateNode)
                and node.template_name == "vi-readings"
            ):
                remaining_nodes.append(node)
                continue
            for parameter, tag in (
                ("hanviet", "han-viet-reading"),
                ("nom", "nom-reading"),
                # we ignore the fanqie parameter "phienthiet"
            ):
                arg = node.template_parameters.get(parameter)
                if arg is None:
                    continue
                text = clean_node(wxr, None, arg)
                for w in text.split(","):
                    # Drop "-"-separated references after the reading.
                    w = w.partition("-")[0].strip()
                    if w:
                        # Empty pieces (empty parameter, trailing comma,
                        # reference-only entry) are not readings.
                        related_readings.append(
                            LinkageData(word=w, tags=[tag])
                        )
        if related_readings:
            data_extend(pos_data, "related", related_readings)
        # The vi-readings template is always skipped for the rest of the
        # head parsing, even when it yielded no readings.
        header_nodes = remaining_nodes

    header_text = clean_node(
        wxr,
        pos_data,
        header_nodes,
        post_template_fn=head_post_template_fn,
        collect_links=True,
        remove_anchors_from_links=True,
    )
    if "links" in pos_data:
        # WordData doesn't use `links`, so we can use `collect_links=True`
        # above without special handling and smuggle link data.
        extracted_links = pos_data["links"]  # type: ignore
        del pos_data["links"]  # type: ignore
    else:
        extracted_links = None

    header_text = re.sub(r"\s+", " ", header_text).strip()

    if not header_text:
        return

    term_label_tags: list[str] = []
    term_label_topics: list[str] = []
    if len(term_label_templates) > 0:
        # parse term label templates; if there are other similar kinds
        # of templates in headers that you want to squash and apply as
        # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
        for templ_data in term_label_templates:
            expan = templ_data.get("expansion", "").strip("().,; ")
            if not expan:
                continue
            tlb_tagsets, tlb_topics = decode_tags(expan)
            for tlb_tags in tlb_tagsets:
                if len(tlb_tags) > 0 and not any(
                    t.startswith("error-") for t in tlb_tags
                ):
                    term_label_tags.extend(tlb_tags)
            term_label_topics.extend(tlb_topics)

    parse_word_head(
        wxr,
        word,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        header_nodes,
        ruby=ruby,
        links=extracted_links,
        label_templates=normal_label_templates,
    )
    if "tags" in pos_data:
        # pos_data can get "tags" data from some source; type-checkers
        # don't like it, so let's ignore it.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    if len(term_label_tags) > 0:
        header_tags.extend(term_label_tags)
    if len(term_label_topics) > 0:
        header_topics.extend(term_label_topics)
def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Handle a POS section whose gloss text is not inside a list.

    The stripped nodes are divided into head nodes (``head`` templates and
    templates whose name starts with the language code plus ``-``) and
    gloss nodes (everything else up to the first level heading).  Soft
    redirect templates are dropped.  The head is processed first so that
    the tags and topics it yields are copied into the gloss's sense base.
    """
    soft_redirect_names = ("zh-see", "ja-see", "ja-see-kango")
    header_nodes: list[Union[str, WikiNode]] = []
    gloss_nodes: list[Union[str, WikiNode]] = []
    for item in strip_nodes(nodes):
        if isinstance(item, WikiNode):
            if isinstance(item, TemplateNode):
                if item.template_name in soft_redirect_names:
                    continue  # soft redirect
                if (
                    item.template_name == "head"
                    or item.template_name.startswith(f"{lang_code}-")
                ):
                    header_nodes.append(item)
                    continue
            elif item.kind in LEVEL_KINDS:
                # Nodes from here on are not gloss content.
                break
        gloss_nodes.append(item)

    if header_nodes:
        process_gloss_header(
            header_nodes,
            pos_type,
            None,
            pos_data,
            header_tags,
            header_topics,
        )
    if gloss_nodes:
        sense_base = {
            "tags": list(header_tags),
            "topics": list(header_topics),
        }
        process_gloss_contents(gloss_nodes, pos_type, sense_base)
1834 def parse_sense_node(
1835 node: Union[str, WikiNode], # never receives str
1836 sense_base: SenseData,
1837 pos: str,
1838 ) -> bool:
1839 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1840 Uses push_sense() to attempt adding data to pos_data in the scope
1841 of parse_language() when it reaches deep in the recursion. push_sense()
1842 returns True if it succeeds, and that is bubbled up the stack; if
1843 a sense was added downstream, the higher levels (whose shared data
1844 was already added by a subsense) do not push_sense(), unless it
1845 has examples that need to be put somewhere.
1846 """
1847 assert isinstance(sense_base, dict) # Added to every sense deeper in
1849 nonlocal sense_ordinal
1850 my_ordinal = sense_ordinal # copies, not a reference
1851 sense_ordinal += 1 # only use for sorting
1853 if not isinstance(node, WikiNode): 1853 ↛ 1855line 1853 didn't jump to line 1855 because the condition on line 1853 was never true
1854 # This doesn't seem to ever happen in practice.
1855 wxr.wtp.debug(
1856 "{}: parse_sense_node called with"
1857 "something that isn't a WikiNode".format(pos),
1858 sortid="page/1287/20230119",
1859 )
1860 return False
1862 if node.kind != NodeKind.LIST_ITEM: 1862 ↛ 1863line 1862 didn't jump to line 1863 because the condition on line 1862 was never true
1863 wxr.wtp.debug(
1864 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1865 )
1866 return False
1868 if node.sarg == ":":
1869 # Skip example entries at the highest level, ones without
1870 # a sense ("...#") above them.
1871 # If node.sarg is exactly and only ":", then it's at
1872 # the highest level; lower levels would have more
1873 # "indentation", like "#:" or "##:"
1874 return False
1876 # If a recursion call succeeds in push_sense(), bubble it up with
1877 # `added`.
1878 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1879 added = False
1881 gloss_template_args: set[str] = set()
1883 # For LISTs and LIST_ITEMS, their argument is something like
1884 # "##" or "##:", and using that we can rudimentally determine
1885 # list 'depth' if need be, and also what kind of list or
1886 # entry it is; # is for normal glosses, : for examples (indent)
1887 # and * is used for quotations on wiktionary.
1888 current_depth = node.sarg
1890 children = node.children
1892 # subentries, (presumably) a list
1893 # of subglosses below this. The list's
1894 # argument ends with #, and its depth should
1895 # be bigger than parent node.
1896 subentries = [
1897 x
1898 for x in children
1899 if isinstance(x, WikiNode)
1900 and x.kind == NodeKind.LIST
1901 and x.sarg == current_depth + "#"
1902 ]
1904 # sublists of examples and quotations. .sarg
1905 # does not end with "#".
1906 others = [
1907 x
1908 for x in children
1909 if isinstance(x, WikiNode)
1910 and x.kind == NodeKind.LIST
1911 and x.sarg != current_depth + "#"
1912 ]
1914 # the actual contents of this particular node.
1915 # can be a gloss (or a template that expands into
1916 # many glosses which we can't easily pre-expand)
1917 # or could be an "outer gloss" with more specific
1918 # subglosses, or could be a qualfier for the subglosses.
1919 contents = [
1920 x
1921 for x in children
1922 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1923 ]
1924 # If this entry has sublists of entries, we should combine
1925 # gloss information from both the "outer" and sublist content.
1926 # Sometimes the outer gloss
1927 # is more non-gloss or tags, sometimes it is a coarse sense
1928 # and the inner glosses are more specific. The outer one
1929 # does not seem to have qualifiers.
1931 # If we have one sublist with one element, treat it
1932 # specially as it may be a Wiktionary error; raise
1933 # that nested element to the same level.
1934 # XXX If need be, this block can be easily removed in
1935 # the current recursive logicand the result is one sense entry
1936 # with both glosses in the glosses list, as you would
1937 # expect. If the higher entry has examples, there will
1938 # be a higher entry with some duplicated data.
1939 if len(subentries) == 1:
1940 slc = subentries[0].children
1941 if len(slc) == 1:
1942 # copy current node and modify it so it doesn't
1943 # loop infinitely.
1944 cropped_node = copy.copy(node)
1945 cropped_node.children = [
1946 x
1947 for x in children
1948 if not (
1949 isinstance(x, WikiNode)
1950 and x.kind == NodeKind.LIST
1951 and x.sarg == current_depth + "#"
1952 )
1953 ]
1954 added |= parse_sense_node(cropped_node, sense_base, pos)
1955 nonlocal sense_data # this kludge causes duplicated raw_
1956 # glosses data if this is not done;
1957 # if the top-level (cropped_node)
1958 # does not push_sense() properly or
1959 # parse_sense_node() returns early,
1960 # sense_data is not reset. This happens
1961 # for example when you have a no-gloss
1962 # string like "(intransitive)":
1963 # no gloss, push_sense() returns early
1964 # and sense_data has duplicate data with
1965 # sense_base
1966 sense_data = {}
1967 added |= parse_sense_node(slc[0], sense_base, pos)
1968 return added
1970 return process_gloss_contents(
1971 contents,
1972 pos,
1973 sense_base,
1974 subentries,
1975 others,
1976 gloss_template_args,
1977 added,
1978 my_ordinal,
1979 )
1981 def process_gloss_contents(
1982 contents: list[Union[str, WikiNode]],
1983 pos: str,
1984 sense_base: SenseData,
1985 subentries: list[WikiNode] = [],
1986 others: list[WikiNode] = [],
1987 gloss_template_args: Set[str] = set(),
1988 added: bool = False,
1989 sorting_ordinal: int | None = None,
1990 ) -> bool:
def sense_template_fn(
    name: str, ht: TemplateArgs, is_gloss: bool = False
) -> Optional[str]:
    """Template hook used while cleaning the contents of a sense.

    Records information from selected templates on ``sense_base`` and
    decides what happens to the template: ``""`` drops it, a string
    replaces it, and ``None`` lets it expand normally.  For templates
    that expand normally, the non-empty argument values without ``<`` are
    added to ``gloss_template_args``.
    """
    if name in wikipedia_templates:
        return None
    if is_panel_template(wxr, name):
        return ""

    if name in INFO_TEMPLATE_FUNCS:
        info_data, info_exp = parse_info_template_arguments(
            wxr, name, ht, "sense"
        )
        if info_data or info_exp:
            if info_data:
                data_append(sense_base, "info_templates", info_data)
            if info_exp and isinstance(info_exp, str):
                return info_exp
            return ""

    if name in ("defdate",):
        date = clean_node(wxr, None, ht.get(1, ()))
        part_two = ht.get(2)
        if part_two:
            # Joined with a Unicode dash, not '-'
            date += "–" + clean_node(wxr, None, part_two)
        # Parameters: ref, refn, ref2, ref2n, ref3, ref3n (ref1 not valid)
        refs: dict[str, ReferenceData] = {}
        named_args = sorted(
            (key, value) for key, value in ht.items() if isinstance(key, str)
        )
        for key, value in named_args:
            m = re.match(r"ref(\d?)(n?)", key)
            if not m:
                continue
            ref_text = clean_node(wxr, None, value)
            ref_nr = m.group(1)  # empty string or digit
            if ref_nr not in refs:
                refs[ref_nr] = ReferenceData()
            if m.group(2):
                refs[ref_nr]["refn"] = ref_text
            else:
                refs[ref_nr]["text"] = ref_text
        data_append(
            sense_base,
            "attestations",
            AttestationData(date=date, references=list(refs.values())),
        )
        return ""

    if name == "senseid":
        langid = clean_node(wxr, None, ht.get(1, ()))
        arg = clean_node(wxr, sense_base, ht.get(2, ()))
        if re.match(r"Q\d+$", arg):
            data_append(sense_base, "wikidata", arg)
        data_append(sense_base, "senseid", langid + ":" + arg)
        # No return here: the checks below still apply.

    if name in sense_linkage_templates:
        parse_sense_linkage(wxr, sense_base, name, ht, pos)
        return ""

    if name in ("†", "zh-obsolete"):
        data_append(sense_base, "tags", "obsolete")
        return ""

    if name in {
        "ux",
        "uxi",
        "usex",
        "afex",
        "prefixusex",
        "ko-usex",
        "ko-x",
        "hi-x",
        "ja-usex-inline",
        "ja-x",
        "quotei",
        "he-x",
        "km-x",
        "ne-x",
        "shn-x",
        "th-x",
        "ur-x",
    }:
        # Usage examples are captured separately, so they are not expanded
        # into glosses even when an entry uses unusual coding.  They may
        # still slip through inside another item.
        if not is_gloss:
            return ""
        wxr.wtp.wiki_notice(
            "Example template is used for gloss text",
            sortid="extractor.en.page.sense_template_fn/1415",
        )

    if name == "w" and ht.get(2) == "Wp":
        return ""

    for value in ht.values():
        value = value.strip()
        if value and "<" not in value:
            gloss_template_args.add(value)
    return None
2090 def extract_link_texts(item: GeneralNode) -> None:
2091 """Recursively extracts link texts from the gloss source. This
2092 information is used to select whether to remove final "." from
2093 form_of/alt_of (e.g., ihm/Hunsrik)."""
2094 if isinstance(item, (list, tuple)):
2095 for x in item:
2096 extract_link_texts(x)
2097 return
2098 if isinstance(item, str):
2099 # There seem to be HTML sections that may futher contain
2100 # unparsed links.
2101 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2101 ↛ 2102line 2101 didn't jump to line 2102 because the loop on line 2101 never started
2102 print("ITER:", m.group(0))
2103 v = m.group(1).split("|")[-1].strip()
2104 if v:
2105 gloss_template_args.add(v)
2106 return
2107 if not isinstance(item, WikiNode): 2107 ↛ 2108line 2107 didn't jump to line 2108 because the condition on line 2107 was never true
2108 return
2109 if item.kind == NodeKind.LINK:
2110 v = item.largs[-1]
2111 if ( 2111 ↛ 2117line 2111 didn't jump to line 2117 because the condition on line 2111 was always true
2112 isinstance(v, list)
2113 and len(v) == 1
2114 and isinstance(v[0], str)
2115 ):
2116 gloss_template_args.add(v[0].strip())
2117 for x in item.children:
2118 extract_link_texts(x)
2120 extract_link_texts(contents)
2122 # get the raw text of non-list contents of this node, and other stuff
2123 # like tag and category data added to sense_base
2124 # cast = no-op type-setter for the type-checker
2125 partial_template_fn = cast(
2126 TemplateFnCallable,
2127 partial(sense_template_fn, is_gloss=True),
2128 )
2129 rawgloss = clean_node(
2130 wxr,
2131 sense_base,
2132 contents,
2133 template_fn=partial_template_fn,
2134 collect_links=True,
2135 )
2137 if not rawgloss: 2137 ↛ 2138line 2137 didn't jump to line 2138 because the condition on line 2137 was never true
2138 return False
2140 # remove manually typed ordered list text at the start("1. ")
2141 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2143 # get stuff like synonyms and categories from "others",
2144 # maybe examples and quotations
2145 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2147 # The gloss could contain templates that produce more list items.
2148 # This happens commonly with, e.g., {{inflection of|...}}. Split
2149 # to parts. However, e.g. Interlingua generates multiple glosses
2150 # in HTML directly without Wikitext markup, so we must also split
2151 # by just newlines.
2152 subglosses = rawgloss.splitlines()
2154 if len(subglosses) == 0: 2154 ↛ 2155line 2154 didn't jump to line 2155 because the condition on line 2154 was never true
2155 return False
2157 if any(s.startswith("#") for s in subglosses):
2158 subtree = wxr.wtp.parse(rawgloss)
2159 # from wikitextprocessor.parser import print_tree
2160 # print("SUBTREE GENERATED BY TEMPLATE:")
2161 # print_tree(subtree)
2162 new_subentries = [
2163 x
2164 for x in subtree.children
2165 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2166 ]
2168 new_others = [
2169 x
2170 for x in subtree.children
2171 if isinstance(x, WikiNode)
2172 and x.kind == NodeKind.LIST
2173 and not x.sarg.endswith("#")
2174 ]
2176 new_contents = [
2177 clean_node(wxr, [], x)
2178 for x in subtree.children
2179 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2180 ]
2182 subentries = subentries or new_subentries
2183 others = others or new_others
2184 subglosses = new_contents
2185 rawgloss = "".join(subglosses)
2186 # Generate no gloss for translation hub pages, but add the
2187 # "translation-hub" tag for them
2188 if rawgloss == "(This entry is a translation hub.)": 2188 ↛ 2189line 2188 didn't jump to line 2189 because the condition on line 2188 was never true
2189 data_append(sense_data, "tags", "translation-hub")
2190 return push_sense(sorting_ordinal)
2192 # Remove certain substrings specific to outer glosses
2193 strip_ends = [", particularly:"]
2194 for x in strip_ends:
2195 if rawgloss.endswith(x):
2196 rawgloss = rawgloss[: -len(x)].strip()
2197 break
2199 # A single gloss, or possibly an outer gloss.
2200 # Check if the possible outer gloss starts with
2201 # parenthesized tags/topics
2203 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2204 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2205 m = QUALIFIERS_RE.match(rawgloss)
2206 # (...): ... or (...(...)...): ...
2207 if m:
2208 q = m.group(1)
2209 rawgloss = rawgloss[m.end() :].strip()
2210 parse_sense_qualifier(wxr, q, sense_base)
2211 if rawgloss == "A pejorative:": 2211 ↛ 2212line 2211 didn't jump to line 2212 because the condition on line 2211 was never true
2212 data_append(sense_base, "tags", "pejorative")
2213 rawgloss = ""
2214 elif rawgloss == "Short forms.": 2214 ↛ 2215line 2214 didn't jump to line 2215 because the condition on line 2214 was never true
2215 data_append(sense_base, "tags", "abbreviation")
2216 rawgloss = ""
2217 elif rawgloss == "Technical or specialized senses.": 2217 ↛ 2218line 2217 didn't jump to line 2218 because the condition on line 2217 was never true
2218 rawgloss = ""
2219 elif rawgloss.startswith("inflection of "):
2220 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2221 if parsed is not None: 2221 ↛ 2230line 2221 didn't jump to line 2230 because the condition on line 2221 was always true
2222 tags, origins = parsed
2223 if origins is not None: 2223 ↛ 2225line 2223 didn't jump to line 2225 because the condition on line 2223 was always true
2224 data_extend(sense_base, "form_of", origins)
2225 if tags is not None: 2225 ↛ 2228line 2225 didn't jump to line 2228 because the condition on line 2225 was always true
2226 data_extend(sense_base, "tags", tags)
2227 else:
2228 data_append(sense_base, "tags", "form-of")
2229 else:
2230 data_append(sense_base, "tags", "form-of")
2231 if rawgloss: 2231 ↛ 2262line 2231 didn't jump to line 2262 because the condition on line 2231 was always true
2232 # Code duplicating a lot of clean-up operations from later in
2233 # this block. We want to clean up the "supergloss" as much as
2234 # possible, in almost the same way as a normal gloss.
2235 supergloss = rawgloss
2237 if supergloss.startswith("; "): 2237 ↛ 2238line 2237 didn't jump to line 2238 because the condition on line 2237 was never true
2238 supergloss = supergloss[1:].strip()
2240 if supergloss.startswith(("^†", "†")):
2241 data_append(sense_base, "tags", "obsolete")
2242 supergloss = supergloss[2:].strip()
2243 elif supergloss.startswith("^‡"): 2243 ↛ 2244line 2243 didn't jump to line 2244 because the condition on line 2243 was never true
2244 data_extend(sense_base, "tags", ["obsolete", "historical"])
2245 supergloss = supergloss[2:].strip()
2247 # remove [14th century...] style brackets at the end
2248 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2250 if supergloss.startswith((",", ":")):
2251 supergloss = supergloss[1:]
2252 supergloss = supergloss.strip()
2253 if supergloss.startswith("N. of "): 2253 ↛ 2254line 2253 didn't jump to line 2254 because the condition on line 2253 was never true
2254 supergloss = "Name of " + supergloss[6:]
2255 supergloss = supergloss[2:]
2256 data_append(sense_base, "glosses", supergloss)
2257 if supergloss in ("A person:",):
2258 data_append(sense_base, "tags", "g-person")
2260 # The main recursive call (except for the exceptions at the
2261 # start of this function).
2262 for sublist in subentries:
2263 if not ( 2263 ↛ 2266line 2263 didn't jump to line 2266 because the condition on line 2263 was never true
2264 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2265 ):
2266 wxr.wtp.debug(
2267 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2268 f"with items that are not LISTs",
2269 sortid="page/1511/20230119",
2270 )
2271 continue
2272 for item in sublist.children:
2273 if not ( 2273 ↛ 2277line 2273 didn't jump to line 2277 because the condition on line 2273 was never true
2274 isinstance(item, WikiNode)
2275 and item.kind == NodeKind.LIST_ITEM
2276 ):
2277 continue
2278 # copy sense_base to prevent cross-contamination between
2279 # subglosses and other subglosses and superglosses
2280 sense_base2 = copy.deepcopy(sense_base)
2281 if parse_sense_node(item, sense_base2, pos): 2281 ↛ 2272line 2281 didn't jump to line 2272 because the condition on line 2281 was always true
2282 added = True
2284 # Capture examples.
2285 # This is called after the recursive calls above so that
2286 # sense_base is not contaminated with meta-data from
2287 # example entries for *this* gloss.
2288 examples = []
2289 if wxr.config.capture_examples: 2289 ↛ 2293line 2289 didn't jump to line 2293 because the condition on line 2289 was always true
2290 examples = extract_examples(others, sense_base)
2292 # push_sense() succeeded somewhere down-river, so skip this level
2293 if added:
2294 if examples:
2295 # this higher-up gloss has examples that we do not want to skip
2296 wxr.wtp.debug(
2297 "'{}[...]' gloss has examples we want to keep, "
2298 "but there are subglosses.".format(repr(rawgloss[:30])),
2299 sortid="page/1498/20230118",
2300 )
2301 else:
2302 return True
2304 # Some entries, e.g., "iacebam", have weird sentences in quotes
2305 # after the gloss, but these sentences don't seem to be intended
2306 # as glosses. Skip them.
2307 indexed_subglosses = list(
2308 (i, gl)
2309 for i, gl in enumerate(subglosses)
2310 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2311 )
2313 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2313 ↛ 2314line 2313 didn't jump to line 2314 because the condition on line 2313 was never true
2314 gl = indexed_subglosses[0][1].strip()
2315 if gl.endswith(":"):
2316 gl = gl[:-1].strip()
2317 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2318 if parsed is not None:
2319 infl_tags, infl_dts = parsed
2320 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2321 # Interpret others as a particular form under
2322 # "inflection of"
2323 data_extend(sense_base, "tags", infl_tags)
2324 data_extend(sense_base, "form_of", infl_dts)
2325 indexed_subglosses = indexed_subglosses[1:]
2326 elif not infl_dts:
2327 data_extend(sense_base, "tags", infl_tags)
2328 indexed_subglosses = indexed_subglosses[1:]
2330 # Create senses for remaining subglosses
2331 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2332 gloss = gloss.strip()
2333 if not gloss and len(indexed_subglosses) > 1: 2333 ↛ 2334line 2333 didn't jump to line 2334 because the condition on line 2333 was never true
2334 continue
2335 # Push a new sense (if the last one is not empty)
2336 if push_sense(sorting_ordinal): 2336 ↛ 2337line 2336 didn't jump to line 2337 because the condition on line 2336 was never true
2337 added = True
2338 # if gloss not in sense_data.get("raw_glosses", ()):
2339 # data_append(sense_data, "raw_glosses", gloss)
2340 if i == 0 and examples:
2341 # In a multi-line gloss, associate examples
2342 # with only one of them.
2343 # XXX or you could use gloss_i == len(indexed_subglosses)
2344 # to associate examples with the *last* one.
2345 data_extend(sense_data, "examples", examples)
2346 if gloss.startswith("; ") and gloss_i > 0: 2346 ↛ 2347line 2346 didn't jump to line 2347 because the condition on line 2346 was never true
2347 gloss = gloss[1:].strip()
2348 # If the gloss starts with †, mark as obsolete
2349 if gloss.startswith("^†"): 2349 ↛ 2350line 2349 didn't jump to line 2350 because the condition on line 2349 was never true
2350 data_append(sense_data, "tags", "obsolete")
2351 gloss = gloss[2:].strip()
2352 elif gloss.startswith("^‡"): 2352 ↛ 2353line 2352 didn't jump to line 2353 because the condition on line 2352 was never true
2353 data_extend(sense_data, "tags", ["obsolete", "historical"])
2354 gloss = gloss[2:].strip()
2355 # Copy data for all senses to this sense
2356 for k, v in sense_base.items():
2357 if isinstance(v, (list, tuple)):
2358 if k != "tags":
2359 # Tags handled below (countable/uncountable special)
2360 data_extend(sense_data, k, v)
2361 else:
2362 assert k not in ("tags", "categories", "topics")
2363 sense_data[k] = v # type:ignore[literal-required]
2364 # Parse the gloss for this particular sense
2365 m = QUALIFIERS_RE.match(gloss)
2366 # (...): ... or (...(...)...): ...
2367 if m:
2368 parse_sense_qualifier(wxr, m.group(1), sense_data)
2369 gloss = gloss[m.end() :].strip()
2371 # Remove common suffix "[from 14th c.]" and similar
2372 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2374 # Check to make sure we don't have unhandled list items in gloss
2375 ofs = max(gloss.find("#"), gloss.find("* "))
2376 if ofs > 10 and "(#)" not in gloss:
2377 wxr.wtp.debug(
2378 "gloss may contain unhandled list items: {}".format(gloss),
2379 sortid="page/1412",
2380 )
2381 elif "\n" in gloss: 2381 ↛ 2382line 2381 didn't jump to line 2382 because the condition on line 2381 was never true
2382 wxr.wtp.debug(
2383 "gloss contains newline: {}".format(gloss),
2384 sortid="page/1416",
2385 )
2387 # Kludge, some glosses have a comma after initial qualifiers in
2388 # parentheses
2389 if gloss.startswith((",", ":")):
2390 gloss = gloss[1:]
2391 gloss = gloss.strip()
2392 if gloss.endswith(":"):
2393 gloss = gloss[:-1].strip()
2394 if gloss.startswith("N. of "): 2394 ↛ 2395line 2394 didn't jump to line 2395 because the condition on line 2394 was never true
2395 gloss = "Name of " + gloss[6:]
2396 if gloss.startswith("†"): 2396 ↛ 2397line 2396 didn't jump to line 2397 because the condition on line 2396 was never true
2397 data_append(sense_data, "tags", "obsolete")
2398 gloss = gloss[1:]
2399 elif gloss.startswith("^†"): 2399 ↛ 2400line 2399 didn't jump to line 2400 because the condition on line 2399 was never true
2400 data_append(sense_data, "tags", "obsolete")
2401 gloss = gloss[2:]
2403 # Copy tags from sense_base if any. This will not copy
2404 # countable/uncountable if either was specified in the sense,
2405 # as sometimes both are specified in word head but only one
2406 # in individual senses.
2407 countability_tags = []
2408 base_tags = sense_base.get("tags", ())
2409 sense_tags = sense_data.get("tags", ())
2410 for tag in base_tags:
2411 if tag in ("countable", "uncountable"):
2412 if tag not in countability_tags: 2412 ↛ 2414line 2412 didn't jump to line 2414 because the condition on line 2412 was always true
2413 countability_tags.append(tag)
2414 continue
2415 if tag not in sense_tags:
2416 data_append(sense_data, "tags", tag)
2417 if countability_tags:
2418 if ( 2418 ↛ 2427line 2418 didn't jump to line 2427 because the condition on line 2418 was always true
2419 "countable" not in sense_tags
2420 and "uncountable" not in sense_tags
2421 ):
2422 data_extend(sense_data, "tags", countability_tags)
2424 # If outer gloss specifies a form-of ("inflection of", see
2425 # aquamarine/German), try to parse the inner glosses as
2426 # tags for an inflected form.
2427 if "form-of" in sense_base.get("tags", ()):
2428 parsed = parse_alt_or_inflection_of(
2429 wxr, gloss, gloss_template_args
2430 )
2431 if parsed is not None: 2431 ↛ 2437line 2431 didn't jump to line 2437 because the condition on line 2431 was always true
2432 infl_tags, infl_dts = parsed
2433 if not infl_dts and infl_tags: 2433 ↛ 2437line 2433 didn't jump to line 2437 because the condition on line 2433 was always true
2434 # Interpret as a particular form under "inflection of"
2435 data_extend(sense_data, "tags", infl_tags)
2437 if not gloss: 2437 ↛ 2438line 2437 didn't jump to line 2438 because the condition on line 2437 was never true
2438 data_append(sense_data, "tags", "empty-gloss")
2439 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2440 if ( 2440 ↛ 2451line 2440 didn't jump to line 2451 because the condition on line 2440 was always true
2441 gloss_i == 0
2442 and len(sense_data.get("glosses", tuple())) >= 1
2443 ):
2444 # If we added a "high-level gloss" from rawgloss, but this
2445 # is that same gloss_i, add this instead of the raw_gloss
2446 # from before if they're different: the rawgloss was not
2447 # cleaned exactly the same as this later gloss
2448 sense_data["glosses"][-1] = gloss
2449 else:
2450 # Add the gloss for the sense.
2451 data_append(sense_data, "glosses", gloss)
2453 # Kludge: there are cases (e.g., etc./Swedish) where there are
2454 # two abbreviations in the same sense, both generated by the
2455 # {{abbreviation of|...}} template. Handle these with some magic.
2456 position = 0
2457 split_glosses = []
2458 for m in re.finditer(r"Abbreviation of ", gloss):
2459 if m.start() != position: 2459 ↛ 2458line 2459 didn't jump to line 2458 because the condition on line 2459 was always true
2460 split_glosses.append(gloss[position : m.start()])
2461 position = m.start()
2462 split_glosses.append(gloss[position:])
2463 for gloss in split_glosses:
2464 # Check if this gloss describes an alt-of or inflection-of
2465 if (
2466 lang_code != "en"
2467 and " " not in gloss
2468 and distw([word], gloss) < 0.3
2469 ):
2470 # Don't try to parse gloss if it is one word
2471 # that is close to the word itself for non-English words
2472 # (probable translations of a tag/form name)
2473 continue
2474 parsed = parse_alt_or_inflection_of(
2475 wxr, gloss, gloss_template_args
2476 )
2477 if parsed is None:
2478 continue
2479 tags, dts = parsed
2480 if not dts and tags:
2481 data_extend(sense_data, "tags", tags)
2482 continue
2483 for dt in dts: # type:ignore[union-attr]
2484 ftags = list(tag for tag in tags if tag != "form-of")
2485 if "alt-of" in tags:
2486 data_extend(sense_data, "tags", ftags)
2487 data_append(sense_data, "alt_of", dt)
2488 elif "compound-of" in tags: 2488 ↛ 2489line 2488 didn't jump to line 2489 because the condition on line 2488 was never true
2489 data_extend(sense_data, "tags", ftags)
2490 data_append(sense_data, "compound_of", dt)
2491 elif "synonym-of" in tags: 2491 ↛ 2492line 2491 didn't jump to line 2492 because the condition on line 2491 was never true
2492 data_extend(dt, "tags", ftags)
2493 data_append(sense_data, "synonyms", dt)
2494 elif tags and dt.get("word", "").startswith("of "): 2494 ↛ 2495line 2494 didn't jump to line 2495 because the condition on line 2494 was never true
2495 dt["word"] = dt["word"][3:]
2496 data_append(sense_data, "tags", "form-of")
2497 data_extend(sense_data, "tags", ftags)
2498 data_append(sense_data, "form_of", dt)
2499 elif "form-of" in tags: 2499 ↛ 2483line 2499 didn't jump to line 2483 because the condition on line 2499 was always true
2500 data_extend(sense_data, "tags", tags)
2501 data_append(sense_data, "form_of", dt)
2503 if len(sense_data) == 0:
2504 if len(sense_base.get("tags", [])) == 0: 2504 ↛ 2506line 2504 didn't jump to line 2506 because the condition on line 2504 was always true
2505 del sense_base["tags"]
2506 sense_data.update(sense_base)
2507 if push_sense(sorting_ordinal): 2507 ↛ 2511line 2507 didn't jump to line 2511 because the condition on line 2507 was always true
2508 # push_sense succeded in adding a sense to pos_data
2509 added = True
2510 # print("PARSE_SENSE DONE:", pos_datas[-1])
2511 return added
def parse_inflection(
    node: WikiNode, section: str, pos: Optional[str]
) -> None:
    """Parses inflection data (declension, conjugation) from the given
    section node.

    The node's children are converted back to wikitext, split into one
    chunk per top-level template or table, and every non-empty chunk is
    expanded and parsed.  While expanding, templates whose name looks like
    an inflection template are recorded (name and cleaned arguments) under
    ``inflection_templates`` in the enclosing ``pos_data``.  When
    ``wxr.config.capture_inflections`` is set, each parsed chunk is also
    handed to ``parse_inflection_section``.

    Parameters:
        node: section node whose children hold the inflection markup.
        section: section title, passed on and used in error messages.
        pos: current part-of-speech; when None a debug message is logged
            and nothing is parsed.
    """
    assert isinstance(node, WikiNode)
    assert isinstance(section, str)
    assert pos is None or isinstance(pos, str)
    # print("parse_inflection:", node)

    if pos is None:
        wxr.wtp.debug(
            "inflection table outside part-of-speech", sortid="page/1812"
        )
        return

    def inflection_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        """Record inflection-like templates in ``pos_data``; panel
        templates are replaced with the empty string."""
        # print("decl_conj_template_fn", name, ht)
        if is_panel_template(wxr, name):
            return ""
        if name in ("is-u-mutation",):
            # These are not to be captured as an exception to the
            # generic code below
            return None
        m = re.search(
            r"-(conj|decl|ndecl|adecl|infl|conjugation|"
            r"declension|inflection|mut|mutation)($|-)",
            name,
        )
        if m:
            args_ht = clean_template_args(wxr, ht)
            dt = {"name": name, "args": args_ht}
            data_append(pos_data, "inflection_templates", dt)

        return None

    # Convert the subtree back to Wikitext, then expand all and parse,
    # capturing templates in the process
    text = wxr.wtp.node_to_wikitext(node.children)

    # Split text into separate sections for each top-level template
    brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
    # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
    # The (?:...) creates a non-capturing regex group; if it was capturing,
    # like the group around it, it would create elements in brace_matches,
    # including None if it doesn't match.
    # 20250114: Added {| and |} into the regex because tables were being
    # cut into pieces by this code.  Issue #973, introduction of two-part
    # book-end templates similar to trans-top and tran-bottom.
    template_sections: list[list[str]] = []
    # Nesting depth counted per brace *group*: every "{{" / "{{{" run
    # adds one and every "}}" / "}}}" run removes one.  Only whether we
    # are back at the outermost level matters here.
    template_nesting = 0
    # A stray table ({| ... |}) should always be its own section, and
    # should prevent templates from cutting it into sections.
    table_nesting = 0
    # Set when the delimiters turn out to be unbalanced; the whole text is
    # then parsed as a single chunk.
    split_failed = False

    # print(f"Parse inflection: {text=}")
    # print(f"Brace matches: {repr('///'.join(brace_matches))}")
    if len(brace_matches) > 1:
        tsection: list[str] = []
        # kludge to keep any text before first template with the first
        # template; otherwise, text goes with preceding template
        after_templates = False
        for piece in brace_matches:
            if piece.startswith("\n; ") and after_templates:
                after_templates = False
                template_sections.append(tsection)
                tsection = [piece]
            elif piece.startswith("{{") or piece.endswith("{|"):
                if (
                    template_nesting == 0
                    and after_templates
                    and table_nesting == 0
                ):
                    # start new section
                    template_sections.append(tsection)
                    tsection = []
                after_templates = True
                if piece.startswith("{{"):
                    template_nesting += 1
                else:
                    # piece.endswith("{|")
                    table_nesting += 1
                tsection.append(piece)
            elif piece.startswith("}}") or piece.endswith("|}"):
                if piece.startswith("}}"):
                    template_nesting -= 1
                    if template_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested braces, "
                            "couldn't split inflection templates, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/1871",
                        )
                        split_failed = True
                        break
                else:
                    table_nesting -= 1
                    if table_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested table braces, "
                            "couldn't split inflection section, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/20250114",
                        )
                        split_failed = True
                        break
                tsection.append(piece)
            else:
                tsection.append(piece)
        if split_failed:
            # Use the whole text.  The partially collected chunk must not
            # be kept, otherwise only a truncated prefix would be parsed.
            template_sections = []
        elif tsection:  # dangling tsection
            template_sections.append(tsection)
            # Why do it this way around?  The parser has a preference
            # to associate bits outside of tables with the preceding
            # table (`after`-variable), so a new tsection begins
            # at {{ and everything before it belongs to the previous
            # template.

    if not template_sections:
        texts = [text]
    else:
        texts = ["".join(tsection) for tsection in template_sections]
    if template_nesting != 0:
        wxr.wtp.error(
            "Template nesting error: "
            "template_nesting = {} "
            "couldn't split inflection templates, "
            "{}/{} section {}".format(
                template_nesting, word, language, section
            ),
            sortid="page/1896",
        )
        texts = [text]
    for chunk in texts:
        # Nothing to expand or parse in an empty chunk.
        if not chunk.strip():
            continue

        tree = wxr.wtp.parse(
            chunk, expand_all=True, template_fn=inflection_template_fn
        )

        # Parse inflection tables from the section.  The data is stored
        # under "forms".
        if wxr.config.capture_inflections:
            tablecontext = None
            m = re.search(r"{{([^}{|]+)\|?", chunk)
            if m:
                template_name = m.group(1).strip()
                tablecontext = TableContext(template_name)

            parse_inflection_section(
                wxr,
                pos_data,
                word,
                language,
                pos,
                section,
                tree,
                tablecontext=tablecontext,
            )
def get_subpage_section(
    title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
) -> Optional[Union[WikiNode, str]]:
    """Loads a subpage of the current word and finds a section in it.

    The subpage is ``word + "/" + subtitle``.  Each entry of ``seqs`` is a
    sequence of section titles to be matched, case-insensitively and in
    order, against nested level (heading) nodes of the parsed subpage.
    The sequences are tried in the order given and the first section found
    is returned, so later entries act as fallbacks.  This is used for
    finding translations and other sections on subpages.

    Parameters:
        title: page title; only used in the debug message.
        subtitle: name of the subpage below the current word.
        seqs: candidate section-title paths.

    Returns:
        The node matching the last title of the first path that could be
        followed, or None if the subpage does not exist or no path
        matches (an empty path matches the root itself).
    """
    assert isinstance(language, str)
    assert isinstance(title, str)
    assert isinstance(subtitle, str)
    assert isinstance(seqs, (list, tuple))
    for seq in seqs:
        for x in seq:
            assert isinstance(x, str)
    subpage_title = word + "/" + subtitle
    subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
    if subpage_content is None:
        wxr.wtp.error(
            "/translations not found despite "
            "{{see translation subpage|...}}",
            sortid="page/1934",
        )
        return None

    def recurse(
        node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
    ) -> Optional[Union[str, WikiNode]]:
        """Depth-first search for the heading path ``seq`` below
        ``node``; returns the node of the final heading or None."""
        # print(f"seq: {seq}")
        if not seq:
            return node
        if not isinstance(node, WikiNode):
            return None
        # print(f"node.kind: {node.kind}")
        if node.kind in LEVEL_KINDS:
            t = clean_node(wxr, None, node.largs[0])
            # print(f"t: {t} == seq[0]: {seq[0]}?")
            if t.lower() == seq[0].lower():
                # This heading matched; look for the rest below it.
                seq = seq[1:]
                if not seq:
                    return node
        for n in node.children:
            ret = recurse(n, seq)
            if ret is not None:
                return ret
        return None

    tree = wxr.wtp.parse(
        subpage_content,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    assert tree.kind == NodeKind.ROOT
    for seq in seqs:
        ret = recurse(tree, seq)
        if ret is not None:
            return ret
        wxr.wtp.debug(
            "Failed to find subpage section {}/{} seq {}".format(
                title, subtitle, seq
            ),
            sortid="page/1963",
        )
    # Nothing matched (or `seqs` was empty).
    return None
2760 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2761 """Parses translations for a word. This may also pull in translations
2762 from separate translation subpages."""
2763 assert isinstance(data, dict)
2764 assert isinstance(xlatnode, WikiNode)
2765 # print("===== PARSE_TRANSLATIONS {} {} {}"
2766 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2767 # print("parse_translations xlatnode={}".format(xlatnode))
2768 if not wxr.config.capture_translations: 2768 ↛ 2769line 2768 didn't jump to line 2769 because the condition on line 2768 was never true
2769 return
2770 sense_parts: list[Union[WikiNode, str]] = []
2771 sense: Optional[str] = None
2773 def parse_translation_item(
2774 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2775 ) -> None:
2776 nonlocal sense
2777 assert isinstance(contents, list)
2778 assert lang is None or isinstance(lang, str)
2779 # print("PARSE_TRANSLATION_ITEM:", contents)
2781 langcode: Optional[str] = None
2782 if sense is None:
2783 sense = clean_node(wxr, data, sense_parts).strip()
2784 # print("sense <- clean_node: ", sense)
2785 idx = sense.find("See also translations at")
2786 if idx > 0: 2786 ↛ 2787line 2786 didn't jump to line 2787 because the condition on line 2786 was never true
2787 wxr.wtp.debug(
2788 "Skipping translation see also: {}".format(sense),
2789 sortid="page/2361",
2790 )
2791 sense = sense[:idx].strip()
2792 if sense.endswith(":"): 2792 ↛ 2793line 2792 didn't jump to line 2793 because the condition on line 2792 was never true
2793 sense = sense[:-1].strip()
2794 if sense.endswith("—"): 2794 ↛ 2795line 2794 didn't jump to line 2795 because the condition on line 2794 was never true
2795 sense = sense[:-1].strip()
2796 translations_from_template: list[str] = []
2798 def translation_item_template_fn(
2799 name: str, ht: TemplateArgs
2800 ) -> Optional[str]:
2801 nonlocal langcode
2802 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2803 if is_panel_template(wxr, name):
2804 return ""
2805 if name in ("t+check", "t-check", "t-needed"):
2806 # We ignore these templates. They seem to have outright
2807 # garbage in some entries, and very varying formatting in
2808 # others. These should be transitory and unreliable
2809 # anyway.
2810 return "__IGNORE__"
2811 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2812 code = ht.get(1)
2813 if code: 2813 ↛ 2823line 2813 didn't jump to line 2823 because the condition on line 2813 was always true
2814 if langcode and code != langcode:
2815 wxr.wtp.debug(
2816 "inconsistent language codes {} vs "
2817 "{} in translation item: {!r} {}".format(
2818 langcode, code, name, ht
2819 ),
2820 sortid="page/2386",
2821 )
2822 langcode = code
2823 tr = ht.get(2)
2824 if tr:
2825 tr = clean_node(wxr, None, [tr])
2826 translations_from_template.append(tr)
2827 return None
2828 if name == "t-egy":
2829 langcode = "egy"
2830 return None
2831 if name == "ttbc":
2832 code = ht.get(1)
2833 if code: 2833 ↛ 2835line 2833 didn't jump to line 2835 because the condition on line 2833 was always true
2834 langcode = code
2835 return None
2836 if name == "trans-see": 2836 ↛ 2837line 2836 didn't jump to line 2837 because the condition on line 2836 was never true
2837 wxr.wtp.error(
2838 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2839 )
2840 return ""
2841 if name.endswith("-top"): 2841 ↛ 2842line 2841 didn't jump to line 2842 because the condition on line 2841 was never true
2842 return ""
2843 if name.endswith("-bottom"): 2843 ↛ 2844line 2843 didn't jump to line 2844 because the condition on line 2843 was never true
2844 return ""
2845 if name.endswith("-mid"): 2845 ↛ 2846line 2845 didn't jump to line 2846 because the condition on line 2845 was never true
2846 return ""
2847 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2848 # .format(name),
2849 # sortid="page/2414")
2850 return None
2852 sublists = list(
2853 x
2854 for x in contents
2855 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2856 )
2857 contents = list(
2858 x
2859 for x in contents
2860 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2861 )
2863 item = clean_node(
2864 wxr, data, contents, template_fn=translation_item_template_fn
2865 )
2866 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2868 # Parse the translation item.
2869 if item: 2869 ↛ exitline 2869 didn't return from function 'parse_translation_item' because the condition on line 2869 was always true
2870 lang = parse_translation_item_text(
2871 wxr,
2872 word,
2873 data,
2874 item,
2875 sense,
2876 lang,
2877 langcode,
2878 translations_from_template,
2879 is_reconstruction,
2880 )
2882 # Handle sublists. They are frequently used for different
2883 # scripts for the language and different variants of the
2884 # language. We will include the lower-level header as a
2885 # tag in those cases.
2886 for listnode in sublists:
2887 assert listnode.kind == NodeKind.LIST
2888 for node in listnode.children:
2889 if not isinstance(node, WikiNode): 2889 ↛ 2890line 2889 didn't jump to line 2890 because the condition on line 2889 was never true
2890 continue
2891 if node.kind == NodeKind.LIST_ITEM: 2891 ↛ 2888line 2891 didn't jump to line 2888 because the condition on line 2891 was always true
2892 parse_translation_item(node.children, lang=lang)
2894 def parse_translation_template(node: WikiNode) -> None:
2895 assert isinstance(node, WikiNode)
2897 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2898 nonlocal sense_parts
2899 nonlocal sense
2900 if is_panel_template(wxr, name):
2901 return ""
2902 if name == "see also":
2903 # XXX capture
2904 # XXX for example, "/" has top-level list containing
2905 # see also items. So also should parse those.
2906 return ""
2907 if name == "trans-see":
2908 # XXX capture
2909 return ""
2910 if name == "see translation subpage": 2910 ↛ 2911line 2910 didn't jump to line 2911 because the condition on line 2910 was never true
2911 sense_parts = []
2912 sense = None
2913 sub = ht.get(1, "")
2914 if sub:
2915 m = re.match(
2916 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2917 )
2918 else:
2919 m = None
2920 etym = ""
2921 etym_numbered = ""
2922 pos = ""
2923 if m:
2924 etym_numbered = m.group(1)
2925 etym = m.group(2)
2926 pos = m.group(3)
2927 if not sub:
2928 wxr.wtp.debug(
2929 "no part-of-speech in "
2930 "{{see translation subpage|...}}, "
2931 "defaulting to just wxr.wtp.section "
2932 "(= language)",
2933 sortid="page/2468",
2934 )
2935 # seq sent to get_subpage_section without sub and pos
2936 seq = [
2937 language,
2938 TRANSLATIONS_TITLE,
2939 ]
2940 elif (
2941 m
2942 and etym.lower().strip() in ETYMOLOGY_TITLES
2943 and pos.lower() in POS_TITLES
2944 ):
2945 seq = [
2946 language,
2947 etym_numbered,
2948 pos,
2949 TRANSLATIONS_TITLE,
2950 ]
2951 elif sub.lower() in POS_TITLES:
2952 # seq with sub but not pos
2953 seq = [
2954 language,
2955 sub,
2956 TRANSLATIONS_TITLE,
2957 ]
2958 else:
2959 # seq with sub and pos
2960 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2961 if pos.lower() not in POS_TITLES:
2962 wxr.wtp.debug(
2963 "unhandled see translation subpage: "
2964 "language={} sub={} "
2965 "wxr.wtp.subsection={}".format(
2966 language, sub, wxr.wtp.subsection
2967 ),
2968 sortid="page/2478",
2969 )
2970 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2971 subnode = get_subpage_section(
2972 wxr.wtp.title or "MISSING_TITLE",
2973 TRANSLATIONS_TITLE,
2974 [seq],
2975 )
2976 if subnode is None or not isinstance(subnode, WikiNode):
2977 # Failed to find the normal subpage section
2978 # seq with sub and pos
2979 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2980 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2981 seqs: list[list[str] | tuple[str, ...]] = [
2982 [TRANSLATIONS_TITLE],
2983 [language, pos],
2984 ]
2985 subnode = get_subpage_section(
2986 wxr.wtp.title or "MISSING_TITLE",
2987 TRANSLATIONS_TITLE,
2988 seqs,
2989 )
2990 if subnode is not None and isinstance(subnode, WikiNode):
2991 parse_translations(data, subnode)
2992 return ""
2993 if name in (
2994 "c",
2995 "C",
2996 "categorize",
2997 "cat",
2998 "catlangname",
2999 "topics",
3000 "top",
3001 "qualifier",
3002 "cln",
3003 ):
3004 # These are expanded in the default way
3005 return None
3006 if name in (
3007 "trans-top",
3008 "trans-top-see",
3009 ):
3010 # XXX capture id from trans-top? Capture sense here
3011 # instead of trying to parse it from expanded content?
3012 if ht.get(1):
3013 sense_parts = []
3014 sense = ht.get(1)
3015 else:
3016 sense_parts = []
3017 sense = None
3018 return None
3019 if name in (
3020 "trans-bottom",
3021 "trans-mid",
3022 "checktrans-mid",
3023 "checktrans-bottom",
3024 ):
3025 return None
3026 if name == "checktrans-top":
3027 sense_parts = []
3028 sense = None
3029 return ""
3030 if name == "trans-top-also":
3031 # XXX capture?
3032 sense_parts = []
3033 sense = None
3034 return ""
3035 wxr.wtp.error(
3036 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3037 name, ht
3038 ),
3039 sortid="page/2517",
3040 )
3041 return ""
3043 wxr.wtp.expand(
3044 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3045 )
3047 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3048 nonlocal sense
3049 nonlocal sense_parts
3050 for node in xlatnode.children:
3051 # print(node)
3052 if isinstance(node, str):
3053 if sense:
3054 if not node.isspace():
3055 wxr.wtp.debug(
3056 "skipping string in the middle of "
3057 "translations: {}".format(node),
3058 sortid="page/2530",
3059 )
3060 continue
3061 # Add a part to the sense
3062 sense_parts.append(node)
3063 sense = None
3064 continue
3065 assert isinstance(node, WikiNode)
3066 kind = node.kind
3067 if kind == NodeKind.LIST:
3068 for item in node.children:
3069 if not isinstance(item, WikiNode): 3069 ↛ 3070line 3069 didn't jump to line 3070 because the condition on line 3069 was never true
3070 continue
3071 if item.kind != NodeKind.LIST_ITEM: 3071 ↛ 3072line 3071 didn't jump to line 3072 because the condition on line 3071 was never true
3072 continue
3073 if item.sarg == ":": 3073 ↛ 3074line 3073 didn't jump to line 3074 because the condition on line 3073 was never true
3074 continue
3075 parse_translation_item(item.children)
3076 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3076 ↛ 3080line 3076 didn't jump to line 3080 because the condition on line 3076 was never true
3077 # Silently skip list items that are just indented; these
3078 # are used for text between translations, such as indicating
3079 # translations that need to be checked.
3080 pass
3081 elif kind == NodeKind.TEMPLATE:
3082 parse_translation_template(node)
3083 elif kind in ( 3083 ↛ 3088line 3083 didn't jump to line 3088 because the condition on line 3083 was never true
3084 NodeKind.TABLE,
3085 NodeKind.TABLE_ROW,
3086 NodeKind.TABLE_CELL,
3087 ):
3088 parse_translation_recurse(node)
3089 elif kind == NodeKind.HTML:
3090 if node.attrs.get("class") == "NavFrame": 3090 ↛ 3096line 3090 didn't jump to line 3096 because the condition on line 3090 was never true
3091 # Reset ``sense_parts`` (and force recomputing
3092 # by clearing ``sense``) as each NavFrame specifies
3093 # its own sense. This helps eliminate garbage coming
3094 # from text at the beginning at the translations
3095 # section.
3096 sense_parts = []
3097 sense = None
3098 # for item in node.children:
3099 # if not isinstance(item, WikiNode):
3100 # continue
3101 # parse_translation_recurse(item)
3102 parse_translation_recurse(node)
3103 elif kind in LEVEL_KINDS: 3103 ↛ 3105line 3103 didn't jump to line 3105 because the condition on line 3103 was never true
3104 # Sub-levels will be recursed elsewhere
3105 pass
3106 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3107 parse_translation_recurse(node)
3108 elif kind == NodeKind.PREFORMATTED: 3108 ↛ 3109line 3108 didn't jump to line 3109 because the condition on line 3108 was never true
3109 print("parse_translation_recurse: PREFORMATTED:", node)
3110 elif kind == NodeKind.LINK: 3110 ↛ 3164line 3110 didn't jump to line 3164 because the condition on line 3110 was always true
3111 arg0 = node.largs[0]
3112 # Kludge: I've seen occasional normal links to translation
3113 # subpages from main pages (e.g., language/English/Noun
3114 # in July 2021) instead of the normal
3115 # {{see translation subpage|...}} template. This should
3116 # handle them. Note: must be careful not to read other
3117 # links, particularly things like in "human being":
3118 # "a human being -- see [[man/translations]]" (group title)
3119 if ( 3119 ↛ 3127line 3119 didn't jump to line 3127 because the condition on line 3119 was never true
3120 isinstance(arg0, (list, tuple))
3121 and arg0
3122 and isinstance(arg0[0], str)
3123 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3124 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3125 == wxr.wtp.title
3126 ):
3127 wxr.wtp.debug(
3128 "translations subpage link found on main "
3129 "page instead "
3130 "of normal {{see translation subpage|...}}",
3131 sortid="page/2595",
3132 )
3133 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3134 if sub.lower() in POS_TITLES:
3135 seq = [
3136 language,
3137 sub,
3138 TRANSLATIONS_TITLE,
3139 ]
3140 subnode = get_subpage_section(
3141 wxr.wtp.title,
3142 TRANSLATIONS_TITLE,
3143 [seq],
3144 )
3145 if subnode is not None and isinstance(
3146 subnode, WikiNode
3147 ):
3148 parse_translations(data, subnode)
3149 else:
3150 wxr.wtp.error(
3151 "/translations link outside part-of-speech"
3152 )
3154 if (
3155 len(arg0) >= 1
3156 and isinstance(arg0[0], str)
3157 and not arg0[0].lower().startswith("category:")
3158 ):
3159 for x in node.largs[-1]:
3160 if isinstance(x, str): 3160 ↛ 3163line 3160 didn't jump to line 3163 because the condition on line 3160 was always true
3161 sense_parts.append(x)
3162 else:
3163 parse_translation_recurse(x)
3164 elif not sense:
3165 sense_parts.append(node)
3166 else:
3167 wxr.wtp.debug(
3168 "skipping text between translation items/senses: "
3169 "{}".format(node),
3170 sortid="page/2621",
3171 )
3173 # Main code of parse_translations(). We want ``sense`` to be assigned
3174 # regardless of recursion levels, and thus the code is structured
3175 # to define it at this level and recurse in parse_translation_recurse().
3176 parse_translation_recurse(xlatnode)
def parse_etymology(data: WordData, node: LevelNode) -> None:
    """Parse an etymology section into ``data``.

    Stores the cleaned section text under ``etymology_text``, the
    templates captured during expansion under ``etymology_templates``,
    and examples extracted from ``zh-x`` / ``zh-q`` templates under
    ``etymology_examples``.  Subsections of ``node`` are not included.
    """
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)

    captured: list[TemplateData] = []

    # Counter for preventing the capture of etymology templates
    # when we are inside templates that we want to ignore (i.e.,
    # not capture).
    ignore_count = 0

    def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        """Called before a template is expanded."""
        nonlocal ignore_count
        if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
            return ""
        if re.match(ignored_etymology_templates_re, name):
            ignore_count += 1
        return None

    def etym_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> None:
        """Called after a template is expanded; records the template."""
        nonlocal ignore_count
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return
        if re.match(ignored_etymology_templates_re, name):
            ignore_count -= 1
            return
        if ignore_count != 0:
            return
        cleaned_args = clean_template_args(wxr, ht)
        cleaned_expansion = clean_node(wxr, None, expansion)
        captured.append(
            {
                "name": name,
                "args": cleaned_args,
                "expansion": cleaned_expansion,
            }
        )

    # Remove any subsections
    own_nodes = [
        child
        for child in node.children
        if not (isinstance(child, WikiNode) and child.kind in LEVEL_KINDS)
    ]
    # Convert to text, also capturing templates using post_template_fn
    raw_text = clean_node(
        wxr,
        None,
        own_nodes,
        template_fn=etym_template_fn,
        post_template_fn=etym_post_template_fn,
    )
    # remove ":" indent wikitext before zh-x template
    text = raw_text.strip(": \n")

    # Save the collected information.
    if text:
        data["etymology_text"] = text
    if captured:
        # Some etymology templates, like Template:root do not generate
        # text, so they should be added here. Elsewhere, we check
        # for Template:root and add some text to the expansion to please
        # the validation.
        data["etymology_templates"] = captured

    # Collect zh-x / zh-q examples, stopping at the first nested level.
    for child_node in node.find_child_recursively(
        LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
    ):
        if child_node.kind in LEVEL_KIND_FLAGS:
            break
        if not isinstance(child_node, TemplateNode):
            continue
        if child_node.template_name not in ["zh-x", "zh-q"]:
            continue
        if "etymology_examples" not in data:
            data["etymology_examples"] = []
        new_examples = extract_template_zh_x(
            wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
        )
        data["etymology_examples"].extend(new_examples)
3258 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3259 """This recurses into a subtree in the parse tree for a page."""
3260 nonlocal etym_data
3261 nonlocal pos_data
3262 nonlocal inside_level_four
3264 redirect_list: list[str] = [] # for `zh-see` template
3266 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3267 """This is called for otherwise unprocessed parts of the page.
3268 We still expand them so that e.g. Category links get captured."""
3269 if name in wikipedia_templates:
3270 data = select_data()
3271 parse_wikipedia_template(wxr, data, ht)
3272 return None
3273 if is_panel_template(wxr, name):
3274 return ""
3275 return None
3277 for node in treenode.children:
3278 if not isinstance(node, WikiNode):
3279 # print(" X{}".format(repr(node)[:40]))
3280 continue
3281 if isinstance(node, TemplateNode):
3282 if process_soft_redirect_template(wxr, node, redirect_list):
3283 continue
3284 elif node.template_name == "zh-forms":
3285 extract_zh_forms_template(wxr, node, select_data())
3286 elif (
3287 node.template_name.endswith("-kanjitab")
3288 or node.template_name == "ja-kt"
3289 ):
3290 extract_ja_kanjitab_template(wxr, node, select_data())
3292 if not isinstance(node, LevelNode):
3293 # XXX handle e.g. wikipedia links at the top of a language
3294 # XXX should at least capture "also" at top of page
3295 if node.kind in (
3296 NodeKind.HLINE,
3297 NodeKind.LIST,
3298 NodeKind.LIST_ITEM,
3299 ):
3300 continue
3301 # print(" UNEXPECTED: {}".format(node))
3302 # Clean the node to collect category links
3303 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3304 continue
3305 t = clean_node(
3306 wxr, etym_data, node.sarg if node.sarg else node.largs
3307 )
3308 t = t.lower()
3309 # XXX these counts were never implemented fully, and even this
3310 # gets discarded: Search STATISTICS_IMPLEMENTATION
3311 wxr.config.section_counts[t] += 1
3312 # print("PROCESS_CHILDREN: T:", repr(t))
3313 if t in IGNORED_TITLES:
3314 pass
3315 elif t.startswith(PRONUNCIATION_TITLE):
3316 # Chinese Pronunciation section kludge; we demote these to
3317 # be level 4 instead of 3 so that they're part of a larger
3318 # etymology hierarchy; usually the data here is empty and
3319 # acts as an inbetween between POS and Etymology data
3320 if lang_code in ("zh",):
3321 inside_level_four = True
3322 if t.startswith(PRONUNCIATION_TITLE + " "):
3323 # Pronunciation 1, etc, are used in Chinese Glyphs,
3324 # and each of them may have senses under Definition
3325 push_level_four_section(True)
3326 wxr.wtp.start_subsection(None)
3327 if wxr.config.capture_pronunciation: 3327 ↛ 3435line 3327 didn't jump to line 3435 because the condition on line 3327 was always true
3328 data = select_data()
3329 parse_pronunciation(
3330 wxr,
3331 node,
3332 data,
3333 etym_data,
3334 have_etym,
3335 base_data,
3336 lang_code,
3337 )
3338 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3339 push_etym()
3340 wxr.wtp.start_subsection(None)
3341 if wxr.config.capture_etymologies: 3341 ↛ 3435line 3341 didn't jump to line 3435 because the condition on line 3341 was always true
3342 m = re.search(r"\s(\d+(\.\d+)?)$", t)
3343 if m:
3344 etym_data["etymology_number"] = m.group(1)
3345 parse_etymology(etym_data, node)
3346 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3347 data = select_data()
3348 extract_descendant_section(wxr, data, node, False)
3349 elif (
3350 t in PROTO_ROOT_DERIVED_TITLES
3351 and pos == "root"
3352 and is_reconstruction
3353 and wxr.config.capture_descendants
3354 ):
3355 data = select_data()
3356 extract_descendant_section(wxr, data, node, True)
3357 elif t == TRANSLATIONS_TITLE:
3358 data = select_data()
3359 parse_translations(data, node)
3360 elif t in INFLECTION_TITLES:
3361 parse_inflection(node, t, pos)
3362 elif t == "alternative forms":
3363 extract_alt_form_section(wxr, select_data(), node)
3364 else:
3365 lst = t.split()
3366 while len(lst) > 1 and lst[-1].isdigit():
3367 lst = lst[:-1]
3368 t_no_number = " ".join(lst).lower()
3369 if t_no_number in POS_TITLES:
3370 push_pos()
3371 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3372 pos = dt["pos"] or "MISSING_POS"
3373 wxr.wtp.start_subsection(t)
3374 if "debug" in dt:
3375 wxr.wtp.debug(
3376 "{} in section {}".format(dt["debug"], t),
3377 sortid="page/2755",
3378 )
3379 if "warning" in dt: 3379 ↛ 3380line 3379 didn't jump to line 3380 because the condition on line 3379 was never true
3380 wxr.wtp.wiki_notice(
3381 "{} in section {}".format(dt["warning"], t),
3382 sortid="page/2759",
3383 )
3384 if "error" in dt: 3384 ↛ 3385line 3384 didn't jump to line 3385 because the condition on line 3384 was never true
3385 wxr.wtp.error(
3386 "{} in section {}".format(dt["error"], t),
3387 sortid="page/2763",
3388 )
3389 if "note" in dt: 3389 ↛ 3390line 3389 didn't jump to line 3390 because the condition on line 3389 was never true
3390 wxr.wtp.note(
3391 "{} in section {}".format(dt["note"], t),
3392 sortid="page/20251017a",
3393 )
3394 if "wiki_notice" in dt: 3394 ↛ 3395line 3394 didn't jump to line 3395 because the condition on line 3394 was never true
3395 wxr.wtp.wiki_notice(
3396 "{} in section {}".format(dt["wiki_notices"], t),
3397 sortid="page/20251017b",
3398 )
3399 # Parse word senses for the part-of-speech
3400 parse_part_of_speech(node, pos)
3401 if "tags" in dt:
3402 for pdata in sense_datas:
3403 data_extend(pdata, "tags", dt["tags"])
3404 elif t_no_number in LINKAGE_TITLES:
3405 # print(f"LINKAGE_TITLES NODE {node=}")
3406 rel = LINKAGE_TITLES[t_no_number]
3407 data = select_data()
3408 parse_linkage(
3409 wxr,
3410 data,
3411 rel,
3412 node,
3413 word,
3414 sense_datas,
3415 is_reconstruction,
3416 )
3417 elif t_no_number == COMPOUNDS_TITLE:
3418 data = select_data()
3419 if wxr.config.capture_compounds: 3419 ↛ 3435line 3419 didn't jump to line 3435 because the condition on line 3419 was always true
3420 parse_linkage(
3421 wxr,
3422 data,
3423 "derived",
3424 node,
3425 word,
3426 sense_datas,
3427 is_reconstruction,
3428 )
3430 # XXX parse interesting templates also from other sections. E.g.,
3431 # {{Letter|...}} in ===See also===
3432 # Also <gallery>
3434 # Recurse to children of this node, processing subtitles therein
3435 stack.append(t)
3436 process_children(node, pos)
3437 stack.pop()
3439 if len(redirect_list) > 0:
3440 if len(pos_data) > 0:
3441 pos_data["redirects"] = redirect_list
3442 if "pos" not in pos_data: 3442 ↛ 3443line 3442 didn't jump to line 3443 because the condition on line 3442 was never true
3443 pos_data["pos"] = "soft-redirect"
3444 else:
3445 new_page_data = copy.deepcopy(base_data)
3446 new_page_data["redirects"] = redirect_list
3447 if "pos" not in new_page_data: 3447 ↛ 3449line 3447 didn't jump to line 3449 because the condition on line 3447 was always true
3448 new_page_data["pos"] = "soft-redirect"
3449 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3450 page_datas.append(new_page_data)
3452 def extract_examples(
3453 others: list[WikiNode], sense_base: SenseData
3454 ) -> list[ExampleData]:
3455 """Parses through a list of definitions and quotes to find examples.
3456 Returns a list of example dicts to be added to sense data. Adds
3457 meta-data, mostly categories, into sense_base."""
3458 assert isinstance(others, list)
3459 examples: list[ExampleData] = []
3461 for sub in others:
3462 if not sub.sarg.endswith((":", "*")): 3462 ↛ 3463line 3462 didn't jump to line 3463 because the condition on line 3462 was never true
3463 continue
3464 for item in sub.children:
3465 if not isinstance(item, WikiNode): 3465 ↛ 3466line 3465 didn't jump to line 3466 because the condition on line 3465 was never true
3466 continue
3467 if item.kind != NodeKind.LIST_ITEM: 3467 ↛ 3468line 3467 didn't jump to line 3468 because the condition on line 3467 was never true
3468 continue
3469 usex_type = None
3470 example_template_args = []
3471 example_template_names = []
3472 taxons = set()
3474 # Bypass this function when parsing Chinese, Japanese and
3475 # quotation templates.
3476 new_example_lists = extract_example_list_item(
3477 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3478 )
3479 if len(new_example_lists) > 0:
3480 examples.extend(new_example_lists)
3481 continue
3483 def usex_template_fn(
3484 name: str, ht: TemplateArgs
3485 ) -> Optional[str]:
3486 nonlocal usex_type
3487 if is_panel_template(wxr, name):
3488 return ""
3489 if name in usex_templates:
3490 usex_type = "example"
3491 example_template_args.append(ht)
3492 example_template_names.append(name)
3493 elif name in quotation_templates:
3494 usex_type = "quotation"
3495 elif name in taxonomy_templates: 3495 ↛ 3496line 3495 didn't jump to line 3496 because the condition on line 3495 was never true
3496 taxons.update(ht.get(1, "").split())
3497 for prefix in template_linkages_to_ignore_in_examples:
3498 if re.search(
3499 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3500 ):
3501 return ""
3502 return None
3504 # bookmark
3505 ruby: list[tuple[str, str]] = []
3506 contents = item.children
3507 if lang_code == "ja":
3508 # Capture ruby contents if this is a Japanese language
3509 # example.
3510 # print(contents)
3511 if ( 3511 ↛ 3516line 3511 didn't jump to line 3516 because the condition on line 3511 was never true
3512 contents
3513 and isinstance(contents, str)
3514 and re.match(r"\s*$", contents[0])
3515 ):
3516 contents = contents[1:]
3517 exp = wxr.wtp.parse(
3518 wxr.wtp.node_to_wikitext(contents),
3519 # post_template_fn=head_post_template_fn,
3520 expand_all=True,
3521 )
3522 rub, rest = extract_ruby(wxr, exp.children)
3523 if rub:
3524 for rtup in rub:
3525 ruby.append(rtup)
3526 contents = rest
3527 subtext = clean_node(
3528 wxr, sense_base, contents, template_fn=usex_template_fn
3529 )
3531 frozen_taxons = frozenset(taxons)
3532 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3534 # print(f"{subtext=}")
3535 subtext = re.sub(
3536 r"\s*\(please add an English "
3537 r"translation of this "
3538 r"(example|usage example|quote)\)",
3539 "",
3540 subtext,
3541 ).strip()
3542 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3543 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3544 # print("subtext:", repr(subtext))
3546 lines = subtext.splitlines()
3547 # print(lines)
3549 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3550 lines = list(
3551 x
3552 for x in lines
3553 if not re.match(
3554 r"(Synonyms: |Antonyms: |Hyponyms: |"
3555 r"Synonym: |Antonym: |Hyponym: |"
3556 r"Hypernyms: |Derived terms: |"
3557 r"Related terms: |"
3558 r"Hypernym: |Derived term: |"
3559 r"Coordinate terms:|"
3560 r"Related term: |"
3561 r"For more quotations using )",
3562 x,
3563 )
3564 )
3565 tr = ""
3566 ref = ""
3567 roman = ""
3568 # for line in lines:
3569 # print("LINE:", repr(line))
3570 # print(classify_desc(line))
3571 if len(lines) == 1 and lang_code != "en":
3572 parts = example_splitter_re.split(lines[0])
3573 if ( 3573 ↛ 3581line 3573 didn't jump to line 3581 because the condition on line 3573 was never true
3574 len(parts) > 2
3575 and len(example_template_args) == 1
3576 and any(
3577 ("―" in s) or ("—" in s)
3578 for s in example_template_args[0].values()
3579 )
3580 ):
3581 if nparts := synch_splits_with_args(
3582 lines[0], example_template_args[0]
3583 ):
3584 parts = nparts
3585 if ( 3585 ↛ 3590line 3585 didn't jump to line 3590 because the condition on line 3585 was never true
3586 len(example_template_args) == 1
3587 and "lit" in example_template_args[0]
3588 ):
3589 # ugly brute-force kludge in case there's a lit= arg
3590 literally = example_template_args[0].get("lit", "")
3591 if literally:
3592 literally = (
3593 " (literally, “"
3594 + clean_value(wxr, literally)
3595 + "”)"
3596 )
3597 else:
3598 literally = ""
3599 if ( 3599 ↛ 3638line 3599 didn't jump to line 3638 because the condition on line 3599 was never true
3600 len(example_template_args) == 1
3601 and len(parts) == 2
3602 and len(example_template_args[0])
3603 - (
3604 # horrible kludge to ignore these arguments
3605 # when calculating how many there are
3606 sum(
3607 s in example_template_args[0]
3608 for s in (
3609 "lit", # generates text, but we handle it
3610 "inline",
3611 "noenum",
3612 "nocat",
3613 "sort",
3614 )
3615 )
3616 )
3617 == 3
3618 and clean_value(
3619 wxr, example_template_args[0].get(2, "")
3620 )
3621 == parts[0].strip()
3622 and clean_value(
3623 wxr,
3624 (
3625 example_template_args[0].get(3)
3626 or example_template_args[0].get("translation")
3627 or example_template_args[0].get("t", "")
3628 )
3629 + literally, # in case there's a lit= argument
3630 )
3631 == parts[1].strip()
3632 ):
3633 # {{exampletemplate|ex|Foo bar baz|English translation}}
3634 # is a pretty reliable 'heuristic', so we use it here
3635 # before the others. To be extra sure the template
3636 # doesn't do anything weird, we compare the arguments
3637 # and the output to each other.
3638 lines = [parts[0].strip()]
3639 tr = parts[1].strip()
3640 elif (
3641 len(parts) == 2
3642 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3643 ):
3644 # These other branches just do some simple heuristics w/
3645 # the expanded output of the template (if applicable).
3646 lines = [parts[0].strip()]
3647 tr = parts[1].strip()
3648 elif ( 3648 ↛ 3654line 3648 didn't jump to line 3654 because the condition on line 3648 was never true
3649 len(parts) == 3
3650 and classify_desc2(parts[1])
3651 in ("romanization", "english")
3652 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3653 ):
3654 lines = [parts[0].strip()]
3655 roman = parts[1].strip()
3656 tr = parts[2].strip()
3657 else:
3658 parts = re.split(r"\s+-\s+", lines[0])
3659 if ( 3659 ↛ 3663line 3659 didn't jump to line 3663 because the condition on line 3659 was never true
3660 len(parts) == 2
3661 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3662 ):
3663 lines = [parts[0].strip()]
3664 tr = parts[1].strip()
3665 elif len(lines) > 1:
3666 if any(
3667 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3668 ) and not (len(example_template_names) == 1):
3669 refs: list[str] = []
3670 for i in range(len(lines)): 3670 ↛ 3676line 3670 didn't jump to line 3676 because the loop on line 3670 didn't complete
3671 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3671 ↛ 3672line 3671 didn't jump to line 3672 because the condition on line 3671 was never true
3672 break
3673 refs.append(lines[i].strip())
3674 if re.search(r"[]\d:)]\s*$", lines[i]):
3675 break
3676 ref = " ".join(refs)
3677 lines = lines[i + 1 :]
3678 if (
3679 lang_code != "en"
3680 and len(lines) >= 2
3681 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3682 ):
3683 i = len(lines) - 1
3684 while ( 3684 ↛ 3689line 3684 didn't jump to line 3689 because the condition on line 3684 was never true
3685 i > 1
3686 and classify_desc2(lines[i - 1])
3687 in ENGLISH_TEXTS
3688 ):
3689 i -= 1
3690 tr = "\n".join(lines[i:])
3691 lines = lines[:i]
3692 if len(lines) >= 2:
3693 if classify_desc2(lines[-1]) == "romanization":
3694 roman = lines[-1].strip()
3695 lines = lines[:-1]
3697 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3698 ref = lines[0]
3699 lines = lines[1:]
3700 elif lang_code != "en" and len(lines) == 2:
3701 cls1 = classify_desc2(lines[0])
3702 cls2 = classify_desc2(lines[1])
3703 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3704 tr = lines[1]
3705 lines = [lines[0]]
3706 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3706 ↛ 3707line 3706 didn't jump to line 3707 because the condition on line 3706 was never true
3707 tr = lines[0]
3708 lines = [lines[1]]
3709 elif ( 3709 ↛ 3716line 3709 didn't jump to line 3716 because the condition on line 3709 was never true
3710 re.match(r"^[#*]*:+", lines[1])
3711 and classify_desc2(
3712 re.sub(r"^[#*:]+\s*", "", lines[1])
3713 )
3714 in ENGLISH_TEXTS
3715 ):
3716 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3717 lines = [lines[0]]
3718 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3719 # Both were classified as English, but
3720 # presumably one is not. Assume first is
3721 # non-English, as that seems more common.
3722 tr = lines[1]
3723 lines = [lines[0]]
3724 elif (
3725 usex_type != "quotation"
3726 and lang_code != "en"
3727 and len(lines) == 3
3728 ):
3729 cls1 = classify_desc2(lines[0])
3730 cls2 = classify_desc2(lines[1])
3731 cls3 = classify_desc2(lines[2])
3732 if (
3733 cls3 == "english"
3734 and cls2 in ("english", "romanization")
3735 and cls1 != "english"
3736 ):
3737 tr = lines[2].strip()
3738 roman = lines[1].strip()
3739 lines = [lines[0].strip()]
3740 elif ( 3740 ↛ 3748line 3740 didn't jump to line 3748 because the condition on line 3740 was never true
3741 usex_type == "quotation"
3742 and lang_code != "en"
3743 and len(lines) > 2
3744 ):
3745 # for x in lines:
3746 # print(" LINE: {}: {}"
3747 # .format(classify_desc2(x), x))
3748 if re.match(r"^[#*]*:+\s*$", lines[1]):
3749 ref = lines[0]
3750 lines = lines[2:]
3751 cls1 = classify_desc2(lines[-1])
3752 if cls1 == "english":
3753 i = len(lines) - 1
3754 while (
3755 i > 1
3756 and classify_desc2(lines[i - 1])
3757 == ENGLISH_TEXTS
3758 ):
3759 i -= 1
3760 tr = "\n".join(lines[i:])
3761 lines = lines[:i]
3763 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3764 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3765 tr = re.sub(r"^[#*:]+\s*", "", tr)
3766 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3767 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3768 ref = re.sub(r"^[#*:]+\s*", "", ref)
3769 ref = re.sub(
3770 r", (volume |number |page )?“?"
3771 r"\(please specify ([^)]|\(s\))*\)”?|"
3772 ", text here$",
3773 "",
3774 ref,
3775 )
3776 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3777 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3778 subtext = "\n".join(x for x in lines if x)
3779 if not tr and lang_code != "en":
3780 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3781 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3781 ↛ 3782line 3781 didn't jump to line 3782 because the condition on line 3781 was never true
3782 tr = m.group(2)
3783 subtext = subtext[: m.start()] + m.group(1)
3784 elif lines:
3785 parts = re.split(r"\s*[―—]+\s*", lines[0])
3786 if ( 3786 ↛ 3790line 3786 didn't jump to line 3790 because the condition on line 3786 was never true
3787 len(parts) == 2
3788 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3789 ):
3790 subtext = parts[0].strip()
3791 tr = parts[1].strip()
3792 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3793 subtext = re.sub(
3794 r"(please add an English translation of "
3795 r"this (quote|usage example))",
3796 "",
3797 subtext,
3798 )
3799 subtext = re.sub(
3800 r"\s*→New International Version " "translation$",
3801 "",
3802 subtext,
3803 ) # e.g. pis/Tok Pisin (Bible)
3804 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3805 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3806 note = None
3807 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3808 if ( 3808 ↛ 3816line 3808 didn't jump to line 3816 because the condition on line 3808 was never true
3809 m is not None
3810 and lang_code != "en"
3811 and (
3812 m.group(1).startswith("with ")
3813 or classify_desc2(m.group(1)) == "english"
3814 )
3815 ):
3816 note = m.group(1)
3817 subtext = subtext[m.end() :]
3818 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3819 ref = re.sub(r",\s*→ISBN", "", ref)
3820 ref = ref.strip()
3821 if ref.endswith(":") or ref.endswith(","):
3822 ref = ref[:-1].strip()
3823 ref = re.sub(r"\s+,\s+", ", ", ref)
3824 ref = re.sub(r"\s+", " ", ref)
3825 if ref and not subtext: 3825 ↛ 3826line 3825 didn't jump to line 3826 because the condition on line 3825 was never true
3826 subtext = ref
3827 ref = ""
3828 if subtext:
3829 dt: ExampleData = {"text": subtext}
3830 if ref:
3831 dt["ref"] = ref
3832 if tr:
3833 dt["english"] = tr # DEPRECATED for "translation"
3834 dt["translation"] = tr
3835 if usex_type:
3836 dt["type"] = usex_type
3837 if note: 3837 ↛ 3838line 3837 didn't jump to line 3838 because the condition on line 3837 was never true
3838 dt["note"] = note
3839 if roman:
3840 dt["roman"] = roman
3841 if ruby:
3842 dt["ruby"] = ruby
3843 examples.append(dt)
3845 return examples
3847 # Main code of parse_language()
3848 # Process the section
3849 stack.append(language)
3850 process_children(langnode, None)
3851 stack.pop()
3853 # Finalize word entries
3854 push_etym()
3855 ret = []
3856 for data in page_datas:
3857 merge_base(data, base_data)
3858 ret.append(data)
3860 # Copy all tags to word senses
3861 for data in ret:
3862 if "senses" not in data: 3862 ↛ 3863line 3862 didn't jump to line 3863 because the condition on line 3862 was never true
3863 continue
3864 # WordData should not have a 'tags' field, but if it does, it's
3865 # deleted and its contents removed and placed in each sense;
3866 # that's why the type ignores.
3867 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3868 if "tags" in data:
3869 del data["tags"] # type: ignore[typeddict-item]
3870 for sense in data["senses"]:
3871 data_extend(sense, "tags", tags)
3873 return ret
def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Record the article referenced by a {{wikipedia|...}}-style template.

    The article name is taken from positional argument 1; when that is
    empty the current page title is used, and as a last resort the
    placeholder "MISSING_PAGE_TITLE".  If the template has a non-empty
    ``lang`` argument, the stored value is prefixed with it
    (``<lang>:<article>``).  The result is appended to ``data["wikipedia"]``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)
    lang_prefix = clean_node(wxr, data, ht.get("lang", ()))
    article = clean_node(wxr, data, ht.get(1, ()))
    if not article:
        # No explicit article given: fall back to the page being parsed.
        article = wxr.wtp.title or "MISSING_PAGE_TITLE"
    target = lang_prefix + ":" + article if lang_prefix else article
    data_append(data, "wikipedia", target)
def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Handle a template that appears at the top of a page, ahead of any
    language heading.

    Wikipedia-link templates and {{wikidata}} contribute to ``data``; a
    number of other known templates are recognised and dropped; anything
    else is dropped too, but reported through a debug message.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Templates that are recognised but whose content is currently thrown
    # away.
    # XXX "see also", "cardinalbox" and "character info" should be captured;
    # XXX "commonscat" should capture the link to Wikimedia commons;
    # XXX "wrongtitle" should be captured to replace the page title with the
    # correct title. E.g. ⿰亻革家
    discarded_names = (
        "reconstruction",
        "see also",
        "cardinalbox",
        "character info",
        "commonscat",
        "wrongtitle",
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        lowered = name.lower()
        # XXX "also" templates show related words that might really have
        # been the intended word, capture them
        if (
            name in discarded_names
            or lowered == "also"
            or lowered.startswith("also/")
        ):
            return ""
        if name == "wikidata":
            arg = clean_node(wxr, data, ht.get(1, ()))
            # Only item ids (Q...) and lexeme ids (Lexeme:L...) are kept.
            if arg.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", arg)
            return ""
        wxr.wtp.debug(
            "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
            sortid="page/2870",
        )
        return ""

    clean_node(wxr, None, [node], template_fn=top_template_fn)
def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Pronunciation -> Part-of-Speech -> Translation/Linkage. Also merge
    Etymology sections that are next to each other.

    ``text`` is the raw wikitext of a page.  Every ``==Title==`` style
    heading is rewritten with a level chosen from its title:

    * language names (as recognised by ``name_to_code``): level 2
    * titles starting with one of ``ETYMOLOGY_TITLES``: level 3
    * titles starting with ``PRONUNCIATION_TITLE``: level 4
    * titles in ``POS_TITLES``: level 5
    * everything else: level 6

    Surrounding ``[[``/``]]`` are stripped from heading titles.  The text
    between headings is passed through unchanged.  Returns the rewritten
    wikitext.
    """

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    # Known lowercase PoS names are in part_of_speech_map
    # Known lowercase linkage section names are in linkage_map

    old = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
    )

    parts = []
    npar = 4  # Number of parentheses in above expression
    parts.append(old[0])
    prev_level = None
    level = None
    # Index in `parts` of the most recently emitted heading.  Needed when
    # merging adjacent etymology sections: after a merge the previous
    # heading is no longer at parts[-2] (that slot holds body text).
    last_title_index = None
    for i in range(1, len(old), npar + 1):
        left = old[i]
        right = old[i + npar - 1]
        # remove Wikilinks in title
        title = re.sub(r"^\[\[", "", old[i + 1])
        title = re.sub(r"\]\]$", "", title)
        prev_level = level
        level = len(left)
        part = old[i + npar]
        merge_into_previous = False  # When combining etymology sections
        if level != len(right):
            wxr.wtp.debug(
                "subtitle has unbalanced levels: "
                "{!r} has {} on the left and {} on the right".format(
                    title, left, right
                ),
                sortid="page/2904",
            )
        lc = title.lower()
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    "subtitle has language name {} at level {}".format(
                        title, level
                    ),
                    sortid="page/2911",
                )
            level = 2
        elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
            if level > 3:
                wxr.wtp.debug(
                    "etymology section {} at level {}".format(title, level),
                    sortid="page/2917",
                )
            # Normalize *before* building the merged heading so that the
            # replacement heading is written at level 3 no matter how many
            # '=' the source used.
            level = 3
            if prev_level == 3 and last_title_index is not None:
                # Two etymology (Glyph Origin + Etymology) sections
                # cheek-to-cheek: reuse the previous heading slot, in case
                # we have a meaningful title like "Etymology 1".
                merge_into_previous = True
        elif lc.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lc in POS_TITLES:
            level = 5
        else:
            # Translations, linkages, compounds, inflections, descendants,
            # proto-root derived sections, ignored titles and any unknown
            # title all live at the deepest level.
            level = 6
        heading = "{}{}{}".format("=" * level, title, "=" * level)
        if merge_into_previous:
            # Replace the previous heading's text; this section's body is
            # simply appended after the previous section's body.
            parts[last_title_index] = heading
        else:
            last_title_index = len(parts)
            parts.append(heading)
        parts.append(part)
        # print("=" * level, title)
        # if level != len(left):
        #     print("  FIXED LEVEL OF {} {} -> {}"
        #           .format(title, len(left), level))

    text = "".join(parts)
    # print(text)
    return text
def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse the wikitext of one page and return its word entries.

    ``word`` is the page title and ``text`` the raw wikitext.  Pages whose
    title ends in "/" + ``TRANSLATIONS_TITLE`` yield an empty list.  Each
    level-2 (language) section is handed to ``parse_language``; data
    collected from templates above the first language heading is added to
    every resulting entry.  Entries are returned grouped by their "lang"
    value, in order of first appearance of each language.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages.  They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    for tag_name in ("noinclude", "onlyinclude", "includeonly"):
        text = re.sub(r"(?si)<(/)?" + tag_name + r"\s*>", "", text)

    # Fix up the subtitle hierarchy.  There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun.  Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    # from wikitextprocessor.parser import print_tree
    # print("PAGE PARSE:", print_tree(tree))

    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for langnode in tree.children:
        if not isinstance(langnode, WikiNode):
            continue
        node_kind = langnode.kind
        if node_kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, langnode, top_data)
            continue
        if node_kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if node_kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {langnode}", sortid="page/3014"
            )
            continue
        lang = clean_node(
            wxr, None, langnode.sarg if langnode.sarg else langnode.largs
        )
        lang_code = name_to_code(lang, "en")
        if lang_code == "":
            wxr.wtp.debug(
                f"unrecognized language name: {lang}", sortid="page/3019"
            )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang)

        # Collect all words from the page.
        # print(f"{langnode=}")
        # Propagate fields resulting from top-level templates to each
        # part-of-speech entry.
        for data in parse_language(wxr, langnode, lang, lang_code):
            if "lang" not in data:
                wxr.wtp.debug(
                    "internal error -- no lang in data: {}".format(data),
                    sortid="page/3034",
                )
                continue
            for k, v in top_data.items():
                assert isinstance(v, (list, tuple))
                data_extend(data, k, v)
            by_lang[data["lang"]].append(data)

    # XXX this code is clearly out of date.  There is no longer a
    # "conjugation" field.  FIX OR REMOVE.
    # Do some post-processing on the words.  For example, we may distribute
    # conjugation information to all the words.
    ret = []
    for lang_datas in by_lang.values():
        ret.extend(lang_datas)

    for entry in ret:
        if entry["word"] != word:
            if word.startswith("Unsupported titles/"):
                message = f"UNSUPPORTED TITLE: '{word}' -> '{entry['word']}'"
                sortid = "20231101/3578page.py"
            else:
                message = (
                    f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{entry['word']}'"
                )
                sortid = "20231101/3582page.py"
            wxr.wtp.debug(message, sortid=sortid)
            entry["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, entry)  # type:ignore[arg-type]
    return ret
def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Split ``data["tags"]`` into valid tags and raw tags, in place.

    Entries of ``data["tags"]`` that are members of ``valid_tags`` stay in
    "tags"; all others are appended to "raw_tags".  "tags" is removed when
    nothing valid remains.  Any list value whose first element is a dict is
    processed recursively, element by element.  A non-dict ``data`` is
    reported through ``wxr.wtp.error`` and otherwise ignored.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return
    accepted: list[str] = []
    # An existing raw_tags list is reused, so rejected tags are added to it.
    rejected: list[str] = data.get("raw_tags", [])
    for field, val in data.items():
        if field == "tags":
            for tag in val:
                (accepted if tag in valid_tags else rejected).append(tag)
        if isinstance(val, list) and len(val) > 0 and isinstance(val[0], dict):
            for child in val:
                recursively_separate_raw_tags(wxr, child)
    if accepted:
        data["tags"] = accepted
    elif "tags" in data:
        del data["tags"]
    if rejected:
        data["raw_tags"] = rejected
def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect redirect targets from a soft-redirect template.

    Returns ``True`` when ``template_node`` is one of the soft-redirect
    templates ("zh-see", "ja-see", "ja-see-kango"), ``False`` otherwise.
    Non-empty target titles are appended to ``redirect_pages``.
    """
    name = template_node.template_name
    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        title = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if title != "":
            redirect_pages.append(title)
        return True
    if name in ["ja-see", "ja-see-kango"]:
        # https://en.wiktionary.org/wiki/Template:ja-see
        for key, value in template_node.template_parameters.items():
            # Only positional arguments name redirect targets.
            if not isinstance(key, int):
                continue
            title = clean_node(wxr, None, value)
            if title != "":
                redirect_pages.append(title)
        return True
    return False
# Maps row-header labels of the {{zh-forms}} table to the tag stored on the
# extracted form; labels not listed here are kept as raw tags.
ZH_FORMS_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "alternative forms": "alternative",
    "2nd round simp.": "Second-Round-Simplified-Chinese",
}
def extract_zh_forms_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract data from a {{zh-forms}} template into ``base_data``.

    A non-empty ``lit`` argument is stored as "literal_meaning".  The
    template is then expanded and every table row is walked: header cells
    go through ``extract_zh_forms_header_cell`` and, unless the row's most
    recent header cell contained a <span>, data cells go through
    ``extract_zh_forms_data_cell``.  An empty "forms" list left behind is
    removed at the end.
    """
    # https://en.wiktionary.org/wiki/Template:zh-forms
    lit_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    if lit_meaning != "":
        base_data["literal_meaning"] = lit_meaning

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            # Header state is reset for every row.
            row_header = ""
            row_header_tags: list[str] = []
            header_has_span = False
            for cell in row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                    continue
                if not header_has_span:
                    extract_zh_forms_data_cell(
                        wxr, base_data, cell, row_header, row_header_tags
                    )

    if "forms" in base_data and len(base_data["forms"]) == 0:
        del base_data["forms"]
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process one header cell of the expanded {{zh-forms}} table.

    Returns ``(row_header, row_header_tags, header_has_span)``:
    the cleaned text that precedes the first direct <span> child, that text
    split on " and " into non-empty labels, and whether the cell has any
    direct <span> child.  Forms found in (possibly nested) <span> elements
    whose ``lang`` is "zh-Hant" or "zh-Hans" are appended to
    ``base_data["forms"]``.
    """
    span_indexes = [
        index for index, _ in header_cell.find_html("span", with_index=True)
    ]
    header_has_span = len(span_indexes) > 0
    # With no span at all, the whole cell is the header text.
    first_span_index = min(span_indexes + [len(header_cell.children)])
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_header_tags = [
        label
        for label in (piece.strip() for piece in row_header.split(" and "))
        if label != ""
    ]

    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # The <sup> marker is not part of the form; only the title
                # of its inner <span> is kept.
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang not in ["zh-Hant", "zh-Hans"]:
            continue
        for word in clean_node(wxr, None, form_nodes).split("/"):
            if word in [wxr.wtp.title, ""]:
                continue
            form = {"form": word}
            for raw_tag in row_header_tags:
                if raw_tag in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
                else:
                    data_append(form, "raw_tags", raw_tag)
            if sup_title != "":
                data_append(form, "raw_tags", sup_title)
            data_append(base_data, "forms", form)
    return row_header, row_header_tags, header_has_span
# The two tag-carrying keys that are copied from a form onto linkage data.
TagLiteral = Literal["tags", "raw_tags"]
TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordData,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
) -> None:
    """Process one data cell (or nested span) of the {{zh-forms}} table.

    Direct <span> children are handled by kind:

    * ``style="white-space:nowrap;"``: processed recursively as a cell;
    * style containing "font-size:80%": its text is a tag applied to every
      form collected so far in this cell;
    * ``lang`` of "zh-Hant", "zh-Hans" or "zh": its text is a form.

    Collected forms are added to ``base_data["forms"]``, or, when
    ``row_header`` is "anagram", to ``base_data["anagrams"]`` as linkage
    data.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    forms: list[FormData] = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")

        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
            continue

        if "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag == "":
                continue
            for form in forms:
                if raw_tag in ZH_PRON_TAGS:
                    tr_tag = ZH_PRON_TAGS[raw_tag]
                    if isinstance(tr_tag, list):
                        data_extend(form, "tags", tr_tag)
                    elif isinstance(tr_tag, str):
                        data_append(form, "tags", tr_tag)
                elif raw_tag in valid_tags:
                    data_append(form, "tags", raw_tag)
                else:
                    data_append(form, "raw_tags", raw_tag)
            continue

        if span_lang not in ["zh-Hant", "zh-Hans", "zh"]:
            continue
        word = clean_node(wxr, None, top_span_tag)
        if word in ["", "/", wxr.wtp.title]:
            continue
        form = {"form": word}
        # Anagram rows do not pass their header labels on as tags.
        if row_header != "anagram":
            for raw_tag in row_header_tags:
                if raw_tag in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
                else:
                    data_append(form, "raw_tags", raw_tag)
        if span_lang == "zh-Hant":
            data_append(form, "tags", "Traditional-Chinese")
        elif span_lang == "zh-Hans":
            data_append(form, "tags", "Simplified-Chinese")
        forms.append(form)

    if row_header != "anagram":
        data_extend(base_data, "forms", forms)
        return
    for form in forms:
        l_data: LinkageData = {"word": form["form"]}
        for key in TAG_LITERALS_TUPLE:
            if key in form:
                l_data[key] = form[key]
        data_append(base_data, "anagrams", l_data)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract alternative spellings from a {{ja-kanjitab}} template.

    The template is expanded; only tables that have a header cell whose
    text starts with "Alternative spelling" are used.  In those tables each
    <span> directly inside a data cell yields a form tagged "alternative"
    and "kanji", and a following <small> element (parentheses stripped)
    adds a tag or raw tag to the most recent form.  Top-level links of the
    expansion are cleaned with ``base_data`` as the data target.
    """
    # https://en.wiktionary.org/wiki/Template:ja-kanjitab
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        is_alt_form_table = False
        for row in table.find_child(NodeKind.TABLE_ROW):
            for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if clean_node(wxr, None, header_node).startswith(
                    "Alternative spelling"
                ):
                    is_alt_form_table = True
        if not is_alt_form_table:
            continue

        forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                for child_node in cell_node.children:
                    if not isinstance(child_node, HTMLNode):
                        continue
                    if child_node.tag == "span":
                        word = clean_node(wxr, None, child_node)
                        if word != "":
                            forms.append(
                                {
                                    "form": word,
                                    "tags": ["alternative", "kanji"],
                                }
                            )
                    elif child_node.tag == "small":
                        raw_tag = clean_node(wxr, None, child_node).strip(
                            "()"
                        )
                        if raw_tag != "" and len(forms) > 0:
                            tag_key = (
                                "tags" if raw_tag in valid_tags else "raw_tags"
                            )
                            data_append(forms[-1], tag_key, raw_tag)
        data_extend(base_data, "forms", forms)
    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)