Coverage for src/wiktextract/extractor/en/page.py: 79%
1834 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 06:55 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Literal,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .descendant import extract_descendant_section
51from .example import extract_example_list_item, extract_template_zh_x
52from .form_descriptions import (
53 classify_desc,
54 decode_tags,
55 distw,
56 parse_alt_or_inflection_of,
57 parse_sense_qualifier,
58 parse_word_head,
59)
60from .inflection import TableContext, parse_inflection_section
61from .info_templates import (
62 INFO_TEMPLATE_FUNCS,
63 parse_info_template_arguments,
64 parse_info_template_node,
65)
66from .linkages import (
67 extract_alt_form_section,
68 parse_linkage,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 FormData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280FLOATING_TABLE_TEMPLATES: set[str] = {
281 # az-suffix-form creates a style=floatright div that is otherwise
282 # deleted; if it is not pre-expanded, we can intercept the template
283 # so we add this set into do_not_pre_expand, and intercept the
284 # templates in parse_part_of_speech
285 "az-suffix-forms",
286 "az-inf-p",
287 "kk-suffix-forms",
288 "ky-suffix-forms",
289 "tr-inf-p",
290 "tr-suffix-forms",
291 "tt-suffix-forms",
292 "uz-suffix-forms",
293}
294# These two should contain template names that should always be
295# pre-expanded when *first* processing the tree, or not pre-expanded
296# so that the templates are left in place with their identifying
297# name intact for later filtering.
299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
302# Additional templates to be expanded in the pre-expand phase
303ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
304 "multitrans",
305 "multitrans-nowiki",
306 "trans-top",
307 "trans-top-also",
308 "trans-bottom",
309 "checktrans-top",
310 "checktrans-bottom",
311 "col",
312 "col1",
313 "col2",
314 "col3",
315 "col4",
316 "col5",
317 "col1-u",
318 "col2-u",
319 "col3-u",
320 "col4-u",
321 "col5-u",
322 "check deprecated lang param usage",
323 "deprecated code",
324 "ru-verb-alt-ё",
325 "ru-noun-alt-ё",
326 "ru-adj-alt-ё",
327 "ru-proper noun-alt-ё",
328 "ru-pos-alt-ё",
329 "ru-alt-ё",
330 "inflection of",
331 "no deprecated lang param usage",
332 "transclude", # these produce sense entries (or other lists)
333 "tcl",
334}
336# Inverse linkage for those that have them
337linkage_inverses: dict[str, str] = {
338 # XXX this is not currently used, move to post-processing
339 "synonyms": "synonyms",
340 "hypernyms": "hyponyms",
341 "hyponyms": "hypernyms",
342 "holonyms": "meronyms",
343 "meronyms": "holonyms",
344 "derived": "derived_from",
345 "coordinate_terms": "coordinate_terms",
346 "troponyms": "hypernyms",
347 "antonyms": "antonyms",
348 "instances": "instance_of",
349 "related": "related",
350}
352# Templates that are used to form panels on pages and that
353# should be ignored in various positions
354PANEL_TEMPLATES: set[str] = {
355 "Character info",
356 "CJKV",
357 "French personal pronouns",
358 "French possessive adjectives",
359 "French possessive pronouns",
360 "Han etym",
361 "Japanese demonstratives",
362 "Latn-script",
363 "LDL",
364 "MW1913Abbr",
365 "Number-encoding",
366 "Nuttall",
367 "Spanish possessive adjectives",
368 "Spanish possessive pronouns",
369 "USRegionDisputed",
370 "Webster 1913",
371 "ase-rfr",
372 "attention",
373 "attn",
374 "beer",
375 "broken ref",
376 "ca-compass",
377 "character info",
378 "character info/var",
379 "checksense",
380 "compass-fi",
381 "copyvio suspected",
382 "delete",
383 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
384 "etystub",
385 "examples",
386 "hu-corr",
387 "hu-suff-pron",
388 "interwiktionary",
389 "ja-kanjitab",
390 "ja-kt",
391 "ko-hanja-search",
392 "look",
393 "maintenance box",
394 "maintenance line",
395 "mediagenic terms",
396 "merge",
397 "missing template",
398 "morse links",
399 "move",
400 "multiple images",
401 "no inline",
402 "picdic",
403 "picdicimg",
404 "picdiclabel",
405 "polyominoes",
406 "predidential nomics",
407 "punctuation", # This actually gets pre-expanded
408 "reconstructed",
409 "request box",
410 "rf-sound example",
411 "rfaccents",
412 "rfap",
413 "rfaspect",
414 "rfc",
415 "rfc-auto",
416 "rfc-header",
417 "rfc-level",
418 "rfc-pron-n",
419 "rfc-sense",
420 "rfclarify",
421 "rfd",
422 "rfd-redundant",
423 "rfd-sense",
424 "rfdate",
425 "rfdatek",
426 "rfdef",
427 "rfe",
428 "rfe/dowork",
429 "rfex",
430 "rfexp",
431 "rfform",
432 "rfgender",
433 "rfi",
434 "rfinfl",
435 "rfm",
436 "rfm-sense",
437 "rfp",
438 "rfp-old",
439 "rfquote",
440 "rfquote-sense",
441 "rfquotek",
442 "rfref",
443 "rfscript",
444 "rft2",
445 "rftaxon",
446 "rftone",
447 "rftranslit",
448 "rfv",
449 "rfv-etym",
450 "rfv-pron",
451 "rfv-quote",
452 "rfv-sense",
453 "selfref",
454 "split",
455 "stroke order", # XXX consider capturing this?
456 "stub entry",
457 "t-needed",
458 "tbot entry",
459 "tea room",
460 "tea room sense",
461 # "ttbc", - XXX needed in at least on/Preposition/Translation page
462 "unblock",
463 "unsupportedpage",
464 "video frames",
465 "was wotd",
466 "wrongtitle",
467 "zh-forms",
468 "zh-hanzi-box",
469 "no entry",
470}
472# Template name prefixes used for language-specific panel templates (i.e.,
473# templates that create side boxes or notice boxes or that should generally
474# be ignored).
475PANEL_PREFIXES: set[str] = {
476 "list:compass points/",
477 "list:Gregorian calendar months/",
478 "RQ:",
479}
481# Templates used for wikipedia links.
482wikipedia_templates: set[str] = {
483 "wikipedia",
484 "slim-wikipedia",
485 "w",
486 "W",
487 "swp",
488 "wiki",
489 "Wikipedia",
490 "wtorw",
491}
# Import-time sanity check: warn about any name that is listed both as a
# panel template and as a Wikipedia-link template.  The warning text talks
# about panel_templates, so the comparison has to include PANEL_TEMPLATES;
# PANEL_PREFIXES is kept in the check so nothing previously compared is
# dropped.  Sorting makes the order of any warnings deterministic.
for x in sorted((PANEL_TEMPLATES | PANEL_PREFIXES) & wikipedia_templates):
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )
499# Mapping from a template name (without language prefix) for the main word
500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
501# it could validly occur. This is used as just a sanity check to give
502# warnings about probably incorrect coding in Wiktionary.
503template_allowed_pos_map: dict[str, list[str]] = {
504 "abbr": ["abbrev"],
505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
506 "plural noun": ["noun", "name"],
507 "plural-noun": ["noun", "name"],
508 "proper noun": ["noun", "name"],
509 "proper-noun": ["name", "noun"],
510 "prop": ["name", "noun"],
511 "verb": ["verb", "phrase"],
512 "gerund": ["verb"],
513 "particle": ["adv", "particle"],
514 "adj": ["adj", "adj_noun"],
515 "pron": ["pron", "noun"],
516 "name": ["name", "noun"],
517 "adv": ["adv", "intj", "conj", "particle"],
518 "phrase": ["phrase", "prep_phrase"],
519 "noun phrase": ["phrase"],
520 "ordinal": ["num"],
521 "number": ["num"],
522 "pos": ["affix", "name", "num"],
523 "suffix": ["suffix", "affix"],
524 "character": ["character"],
525 "letter": ["character"],
526 "kanji": ["character"],
527 "cont": ["abbrev"],
528 "interj": ["intj"],
529 "con": ["conj"],
530 "part": ["particle"],
531 "prep": ["prep", "postp"],
532 "postp": ["postp"],
533 "misspelling": ["noun", "adj", "verb", "adv"],
534 "part-form": ["verb"],
535}
# Import-time validation: every part-of-speech listed as a value in
# template_allowed_pos_map must be present in PARTS_OF_SPEECH.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            msg = (
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
            print(msg)
            # Raise explicitly instead of `assert False`: assert statements
            # are removed when Python runs with -O, which would let a bad
            # table slip through with nothing but a printed line.
            raise AssertionError(msg)
546# Templates ignored during etymology extraction, i.e., these will not be listed
547# in the extracted etymology templates.
548ignored_etymology_templates: list[str] = [
549 "...",
550 "IPAchar",
551 "ipachar",
552 "ISBN",
553 "isValidPageName",
554 "redlink category",
555 "deprecated code",
556 "check deprecated lang param usage",
557 "para",
558 "p",
559 "cite",
560 "Cite news",
561 "Cite newsgroup",
562 "cite paper",
563 "cite MLLM 1976",
564 "cite journal",
565 "cite news/documentation",
566 "cite paper/documentation",
567 "cite video game",
568 "cite video game/documentation",
569 "cite newsgroup",
570 "cite newsgroup/documentation",
571 "cite web/documentation",
572 "cite news",
573 "Cite book",
574 "Cite-book",
575 "cite book",
576 "cite web",
577 "cite-usenet",
578 "cite-video/documentation",
579 "Cite-journal",
580 "rfe",
581 "catlangname",
582 "cln",
583 "langname-lite",
584 "no deprecated lang param usage",
585 "mention",
586 "m",
587 "m-self",
588 "link",
589 "l",
590 "ll",
591 "l-self",
592]
593# Regexp for matching ignored etymology template names. This adds certain
594# prefixes to the names listed above.
595ignored_etymology_templates_re = re.compile(
596 r"^((cite-|R:|RQ:).*|"
597 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
598 + r")$"
599)
601# Regexp for matching ignored descendants template names. Right now we just
602# copy the ignored etymology templates
603ignored_descendants_templates_re = ignored_etymology_templates_re
605# Set of template names that are used to define usage examples. If the usage
606# example contains one of these templates, then its type is set to
607# "example"
608usex_templates: set[str] = {
609 "afex",
610 "affixusex",
611 "co", # {{collocation}} acts like a example template, specifically for
612 # pairs of combinations of words that are more common than you'd
613 # expect to occur randomly; hlavní#Czech
614 "coi",
615 "collocation",
616 "el-example",
617 "el-x",
618 "example",
619 "examples",
620 "he-usex",
621 "he-x",
622 "hi-usex",
623 "hi-x",
624 "ja-usex-inline",
625 "ja-usex",
626 "ja-x",
627 "jbo-example",
628 "jbo-x",
629 "km-usex",
630 "km-x",
631 "ko-usex",
632 "ko-x",
633 "lo-usex",
634 "lo-x",
635 "ne-x",
636 "ne-usex",
637 "prefixusex",
638 "ryu-usex",
639 "ryu-x",
640 "shn-usex",
641 "shn-x",
642 "suffixusex",
643 "th-usex",
644 "th-x",
645 "ur-usex",
646 "ur-x",
647 "usex",
648 "usex-suffix",
649 "ux",
650 "uxi",
651}
653stop_head_at_these_templates: set[str] = {
654 "category",
655 "cat",
656 "topics",
657 "catlangname",
658 "c",
659 "C",
660 "top",
661 "cln",
662}
664# Set of template names that are used to define quotation examples. If the
665# usage example contains one of these templates, then its type is set to
666# "quotation".
667quotation_templates: set[str] = {
668 "collapse-quote",
669 "quote-av",
670 "quote-book",
671 "quote-GYLD",
672 "quote-hansard",
673 "quotei",
674 "quote-journal",
675 "quotelite",
676 "quote-mailing list",
677 "quote-meta",
678 "quote-newsgroup",
679 "quote-song",
680 "quote-text",
681 "quote",
682 "quote-us-patent",
683 "quote-video game",
684 "quote-web",
685 "quote-wikipedia",
686 "wikiquote",
687 "Wikiquote",
688 "Q",
689}
691taxonomy_templates = {
692 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
693 "taxfmt",
694 "taxlink",
695 "taxlink2",
696 "taxlinknew",
697 "taxlook",
698}
700# Template names, this was extracted from template_linkage_mappings,
701# because the code using template_linkage_mappings was actually not used
702# (but not removed).
703template_linkages_to_ignore_in_examples: set[str] = {
704 "syn",
705 "synonyms",
706 "ant",
707 "antonyms",
708 "hyp",
709 "hyponyms",
710 "der",
711 "derived terms",
712 "coordinate terms",
713 "cot",
714 "rel",
715 "col",
716 "inline alt forms",
717 "alti",
718 "comeronyms",
719 "holonyms",
720 "holo",
721 "hypernyms",
722 "hyper",
723 "meronyms",
724 "mero",
725 "troponyms",
726 "perfectives",
727 "pf",
728 "imperfectives",
729 "impf",
730 "syndiff",
731 "synsee",
732 # not linkage nor example templates
733 "sense",
734 "s",
735 "color panel",
736 "colour panel",
737}
739# Maps template name used in a word sense to a linkage field that it adds.
740sense_linkage_templates: dict[str, str] = {
741 "syn": "synonyms",
742 "synonyms": "synonyms",
743 "synsee": "synonyms",
744 "syndiff": "synonyms",
745 "hyp": "hyponyms",
746 "hyponyms": "hyponyms",
747 "ant": "antonyms",
748 "antonyms": "antonyms",
749 "alti": "related",
750 "inline alt forms": "related",
751 "coordinate terms": "coordinate_terms",
752 "cot": "coordinate_terms",
753 "comeronyms": "related",
754 "holonyms": "holonyms",
755 "holo": "holonyms",
756 "hypernyms": "hypernyms",
757 "hyper": "hypernyms",
758 "meronyms": "meronyms",
759 "mero": "meronyms",
760 "troponyms": "troponyms",
761 "perfectives": "related",
762 "pf": "related",
763 "imperfectives": "related",
764 "impf": "related",
765 "parasynonyms": "synonyms",
766 "par": "synonyms",
767 "parasyn": "synonyms",
768 "nearsyn": "synonyms",
769 "near-syn": "synonyms",
770}
772sense_linkage_templates_tags: dict[str, list[str]] = {
773 "alti": ["alternative"],
774 "inline alt forms": ["alternative"],
775 "comeronyms": ["comeronym"],
776 "perfectives": ["perfective"],
777 "pf": ["perfective"],
778 "imperfectives": ["imperfective"],
779 "impf": ["imperfective"],
780}
def decode_html_entities(v: Union[str, int]) -> str:
    """Decode HTML entities in ``v`` into the corresponding Unicode text.

    Args:
        v: A string possibly containing HTML entities, or an integer.

    Returns:
        For a string, the result of ``html.unescape``; for an integer, its
        ``str()`` form.
    """
    if isinstance(v, int):
        # html.unescape only acts on entity syntax (&xx;), so an integer
        # has nothing to decode; just stringify it.
        return str(v)
    return html.unescape(v)
def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
    pos: str,
) -> None:
    """Parses a linkage (synonym, etc) specified in a word sense.

    The linkage field to fill is looked up from ``sense_linkage_templates``
    by template ``name``; extra tags for that template come from
    ``sense_linkage_templates_tags``.  Positional arguments 2..19 of ``ht``
    are processed in order, stopping at the first missing index, and each
    resulting entry is appended to ``data[field]``.

    Args:
        wxr: The extraction context.
        data: Sense data dict that receives the linkage entries.
        name: Template name; must be a key of ``sense_linkage_templates``.
        ht: Template arguments.  Argument 1 is cleaned and used as the
            language code for thesaurus lookups; ``q<N>`` and ``t<N>``
            (N = positional index - 1) supply a qualifier and a translation
            for the corresponding word.
        pos: Part of speech passed on to the thesaurus search.

    Raises:
        KeyError: If ``name`` is not in ``sense_linkage_templates``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    field_tags = sense_linkage_templates_tags.get(name, [])
    for i in range(2, 20):
        if i not in ht:
            break
        w = clean_node(wxr, data, ht[i])
        # Drop anything from the first "#" onwards.
        if "#" in w:
            w = w[: w.index("#")]
        if w in ["", "<"]:  # `<` used in "hypernyms" template
            continue
        # Separator tokens only count as such after the first word; a
        # "see also" argument is skipped at any position.  The parentheses
        # make the and/or precedence explicit.
        if (i > 2 and w in (",", "or", ";")) or w.startswith(
            ("see also", "See also")
        ):
            continue
        is_thesaurus = False
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                is_thesaurus = True
                w = w[len(alias) :]
                if w != wxr.wtp.title:
                    from ...thesaurus import search_thesaurus

                    lang_code = clean_node(wxr, None, ht.get(1, ""))
                    for t_data in search_thesaurus(
                        wxr.thesaurus_db_conn,  # type: ignore
                        w,
                        lang_code,
                        pos,
                        "synonyms",  # GH issue #1570
                    ):
                        l_data: LinkageData = {
                            "word": t_data.term,
                            "source": "Thesaurus:" + w,
                        }
                        if len(t_data.tags) > 0:
                            l_data["tags"] = t_data.tags
                        if len(t_data.raw_tags) > 0:
                            l_data["raw_tags"] = t_data.raw_tags
                        data_append(data, field, l_data)
                # Only the first matching namespace alias is considered.
                break
        if is_thesaurus:
            # A thesaurus reference never becomes a plain linkage entry.
            continue
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # `english` is always None here (reset just above), so the
                # qualifier simply becomes the initial English text.
                english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        dt = {"word": w}
        if field_tags:
            data_extend(dt, "tags", field_tags)
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english  # DEPRECATED for "translation"
            dt["translation"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)
# Separator between the parts of an example line: one or more horizontal
# bars / em dashes with optional surrounding whitespace.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# Same pattern wrapped in a capturing group so that re.split() keeps the
# separators in its result.
captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """If it looks like there's something weird with how a line of example
    text has been split, this function will do the splitting after counting
    occurrences of the splitting regex inside the two main template arguments
    containing the string data for the original language example and the
    English translations.

    Args:
        line: The example line to split.
        targs: Template arguments; argument 2 is read as the original
            language text and argument 3, "translation" or "t" as the
            translation.

    Returns:
        ``[original, translation, *remaining_parts]``, or ``None`` when
        either the original or the translation part comes out empty.
    """
    # Previously, we split without capturing groups, but here we want to
    # keep the original splitting hyphen regex intact.
    fparts = captured_splitters_re.split(line)
    new_parts = []
    # ["First", " ― ", "second", " ― ", "third..."] from OL argument
    first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
    new_parts.append("".join(fparts[:first]))
    # Translation argument
    tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
    # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
    second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
    new_parts.append("".join(fparts[first + 1 : second]))

    if not all(new_parts):
        # One of the two main parts came out empty: the line does not match
        # the template arguments, so give up.
        return None
    new_parts.extend(fparts[second + 1 :: 2])  # skip rest of hyphens
    return new_parts
# Matches a leading parenthesized qualifier (allowing one level of nested
# parentheses), followed by an optional colon and any whitespace:
# (...): ... or (...(...)...): ...
QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
QUALIFIERS_RE = re.compile(QUALIFIERS)
937def parse_language(
938 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
939) -> list[WordData]:
940 """Iterates over the text of the page, returning words (parts-of-speech)
941 defined on the page one at a time. (Individual word senses for the
942 same part-of-speech are typically encoded in the same entry.)"""
943 # imported here to avoid circular import
944 from .pronunciation import parse_pronunciation
946 assert isinstance(wxr, WiktextractContext)
947 assert isinstance(langnode, WikiNode)
948 assert isinstance(language, str)
949 assert isinstance(lang_code, str)
950 # print("parse_language", language)
952 is_reconstruction = False
953 word: str = wxr.wtp.title # type: ignore[assignment]
954 unsupported_prefix = "Unsupported titles/"
955 if word.startswith(unsupported_prefix):
956 w = word[len(unsupported_prefix) :]
957 if w in unsupported_title_map: 957 ↛ 960line 957 didn't jump to line 960 because the condition on line 957 was always true
958 word = unsupported_title_map[w]
959 else:
960 wxr.wtp.error(
961 "Unimplemented unsupported title: {}".format(word),
962 sortid="page/870",
963 )
964 word = w
965 elif word.startswith("Reconstruction:"):
966 word = word[word.find("/") + 1 :]
967 is_reconstruction = True
969 base_data: WordData = {
970 "word": word,
971 "lang": language,
972 "lang_code": lang_code,
973 }
974 if is_reconstruction:
975 data_append(base_data, "tags", "reconstruction")
976 sense_data: SenseData = {}
977 pos_data: WordData = {} # For a current part-of-speech
978 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
979 etym_data: WordData = {} # For one etymology
980 sense_datas: list[SenseData] = []
981 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
982 # Never reset, do not use as data
983 level_four_datas: list[WordData] = []
984 etym_datas: list[WordData] = []
985 page_datas: list[WordData] = []
986 have_etym = False
987 inside_level_four = False # This is for checking if the etymology section
988 # or article has a Pronunciation section, for Chinese mostly; because
989 # Chinese articles can have three level three sections (two etymology
990 # sections and pronunciation sections) one after another, we need a kludge
991 # to better keep track of whether we're in a normal "etym" or inside a
992 # "level four" (which is what we've turned the level three Pron sections
993 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
994 # a step.
995 stack: list[str] = [] # names of items on the "stack"
997 def merge_base(data: WordData, base: WordData) -> None:
998 for k, v in base.items():
999 # Copy the value to ensure that we don't share lists or
1000 # dicts between structures (even nested ones).
1001 v = copy.deepcopy(v)
1002 if k not in data:
1003 # The list was copied above, so this will not create shared ref
1004 data[k] = v # type: ignore[literal-required]
1005 continue
1006 if data[k] == v: # type: ignore[literal-required]
1007 continue
1008 if ( 1008 ↛ 1016line 1008 didn't jump to line 1016 because the condition on line 1008 was always true
1009 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1010 or isinstance(
1011 v,
1012 (list, tuple), # Should this be "and"?
1013 )
1014 ):
1015 data[k] = list(data[k]) + list(v) # type: ignore
1016 elif data[k] != v: # type: ignore[literal-required]
1017 wxr.wtp.warning(
1018 "conflicting values for {} in merge_base: "
1019 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1020 sortid="page/904",
1021 )
1023 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1024 """Remove unnecessary keys from dict values
1025 in a list comprehension..."""
1026 if key in pron:
1027 pron.pop(key) # type: ignore
1028 return pron
1030 def sound_matches_pos(sound: SoundData, pos: str) -> bool:
1031 if "pos" not in sound:
1032 return True
1033 sound_pos = sound["pos"] # type: ignore[typeddict-item]
1034 return pos in sound_pos
1036 def strip_sound_pos(sound: SoundData) -> SoundData:
1037 complementary_pop(sound, "pos")
1038 return sound
1040 # If the result has sounds, eliminate sounds that have a prefix that
1041 # does not match "word" or one of "forms"
1042 if "sounds" in data and "word" in data:
1043 accepted = [data["word"]]
1044 accepted.extend(f["form"] for f in data.get("forms", dict()))
1045 data["sounds"] = list(
1046 s
1047 for s in data["sounds"]
1048 if "form" not in s or s["form"] in accepted
1049 )
1050 # If the result has sounds, eliminate sounds that have a pos that
1051 # does not match "pos"
1052 if "sounds" in data and "pos" in data:
1053 data["sounds"] = list(
1054 strip_sound_pos(s)
1055 for s in data["sounds"]
1056 # "pos" is not a field of SoundData, correctly, so we're
1057 # removing it here. It's a kludge on a kludge on a kludge.
1058 if sound_matches_pos(s, data["pos"])
1059 )
1060 elif "sounds" in data: 1060 ↛ 1061line 1060 didn't jump to line 1061 because the condition on line 1060 was never true
1061 data["sounds"] = [strip_sound_pos(s) for s in data["sounds"]]
1063 def push_sense(sorting_ordinal: int | None = None) -> bool:
1064 """Starts collecting data for a new word sense. This returns True
1065 if a sense was added."""
1066 nonlocal sense_data
1067 if sorting_ordinal is None:
1068 sorting_ordinal = sense_ordinal
1069 tags = sense_data.get("tags", ())
1070 if (
1071 not sense_data.get("glosses")
1072 and "translation-hub" not in tags
1073 and "no-gloss" not in tags
1074 ):
1075 return False
1077 if ( 1077 ↛ 1087line 1077 didn't jump to line 1087 because the condition on line 1077 was never true
1078 (
1079 "participle" in sense_data.get("tags", ())
1080 or "infinitive" in sense_data.get("tags", ())
1081 )
1082 and "alt_of" not in sense_data
1083 and "form_of" not in sense_data
1084 and "etymology_text" in etym_data
1085 and etym_data["etymology_text"] != ""
1086 ):
1087 etym = etym_data["etymology_text"]
1088 etym = etym.split(". ")[0]
1089 ret = parse_alt_or_inflection_of(wxr, etym, set())
1090 if ret is not None:
1091 tags, lst = ret
1092 assert isinstance(lst, (list, tuple))
1093 if "form-of" in tags:
1094 data_extend(sense_data, "form_of", lst)
1095 data_extend(sense_data, "tags", tags)
1096 elif "alt-of" in tags:
1097 data_extend(sense_data, "alt_of", lst)
1098 data_extend(sense_data, "tags", tags)
1100 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1100 ↛ 1103line 1100 didn't jump to line 1103 because the condition on line 1100 was never true
1101 "tags", ()
1102 ):
1103 data_append(sense_data, "tags", "no-gloss")
1105 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore
1106 sense_datas.append(sense_data)
1107 sense_data = {}
1108 return True
1110 def push_pos(sorting_ordinal: int | None = None) -> None:
1111 """Starts collecting data for a new part-of-speech."""
1112 nonlocal pos_data
1113 nonlocal sense_datas
1114 push_sense(sorting_ordinal)
1115 if wxr.wtp.subsection:
1116 data: WordData = {"senses": sense_datas}
1117 merge_base(data, pos_data)
1118 level_four_datas.append(data)
1119 pos_data = {}
1120 sense_datas = []
1121 wxr.wtp.start_subsection(None)
1123 def push_level_four_section(clear_sound_data: bool) -> None:
1124 """Starts collecting data for a new level four sections, which
1125 is usually virtual and empty, unless the article has Chinese
1126 'Pronunciation' sections that are etymology-section-like but
1127 under etymology, and at the same level in the source. We modify
1128 the source to demote Pronunciation sections like that to level
1129 4, and other sections one step lower."""
1130 nonlocal level_four_data
1131 nonlocal level_four_datas
1132 nonlocal etym_datas
1133 push_pos()
1134 # print(f"======\n{etym_data=}")
1135 # print(f"======\n{etym_datas=}")
1136 # print(f"======\n{level_four_data=}")
1137 # print(f"======\n{level_four_datas=}")
1138 for data in level_four_datas:
1139 merge_base(data, level_four_data)
1140 etym_datas.append(data)
1141 for data in etym_datas:
1142 merge_base(data, etym_data)
1143 page_datas.append(data)
1144 if clear_sound_data:
1145 level_four_data = {}
1146 level_four_datas = []
1147 etym_datas = []
1149 def push_etym() -> None:
1150 """Starts collecting data for a new etymology."""
1151 nonlocal etym_data
1152 nonlocal etym_datas
1153 nonlocal have_etym
1154 nonlocal inside_level_four
1155 have_etym = True
1156 push_level_four_section(False)
1157 inside_level_four = False
1158 # etymology section could under pronunciation section
1159 etym_data = (
1160 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1161 )
def select_data() -> WordData:
    """Return the accumulator that newly parsed data should be stored in.

    Precedence: the part-of-speech data while a subsection is active,
    then the level-four data while inside a level-four section, then the
    base data when the innermost stack entry is the language itself, and
    the etymology data otherwise.
    """
    # print(f"{wxr.wtp.subsection=}")
    # print(f"{stack=}")
    if wxr.wtp.subsection is not None:
        target = pos_data
    elif inside_level_four:
        target = level_four_data
    elif stack[-1] == language:
        target = base_data
    else:
        target = etym_data
    return target
def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parse the subsection for a part-of-speech under a language.

    The children of ``posnode`` are split into "heads" (text, links and
    templates that precede a list of senses) and the sense lists that
    follow them; ``pre`` and ``lists`` are kept parallel, one slot per
    head.  Each head is passed to process_gloss_header() and each list
    item to parse_sense_node().  When the section contains no list at
    all, process_gloss_without_list() handles it instead.

    Floating-table templates and ``File:`` links are extracted up front
    and parsed through parse_inflection().  If no sense was produced, a
    dummy sense tagged "no-gloss" carrying the header tags/topics is
    pushed.  Finally the collected senses are sorted by the temporary
    ordinal stored by push_sense(), and that key is removed.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    # print("parse_part_of_speech", pos)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    # XXX bookmark
    # print("===================")
    # print(posnode.children)

    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and (
                (
                    isinstance(x, TemplateNode)
                    and x.template_name in FLOATING_TABLE_TEMPLATES
                )
                or (
                    x.kind == NodeKind.LINK
                    # Need to check for stringiness because some links are
                    # broken; for example, if a template is missing an
                    # argument, a link might look like `[[{{{1}}}...]]`
                    and len(x.largs) > 0
                    and len(x.largs[0]) > 0
                    and isinstance(x.largs[0][0], str)
                    and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                )
            )
        ),
    )
    # Wrap the extracted nodes in a synthetic section node so that they
    # can go through the regular inflection parser.
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)
    # print(poschildren)
    # XXX new above

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    for node in poschildren:
        if isinstance(node, str):
            # Split the text into runs of newlines and runs of other
            # characters; a run starting with a blank line ends the
            # first paragraph and stops processing of this string.
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                if p.startswith("\n\n") and pre:
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            # `node.largs` is checked first so that a link without any
            # arguments cannot raise IndexError here.
            if (
                node.largs
                and len(node.largs[0]) >= 1
                and isinstance(node.largs[0][0], str)
            ):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].append(node)
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.
            # There's head_tag_re that seems like a regex meant
            # to identify head templates. Too bad it's None.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # skip these templates; panel_templates is already used
            # to skip certain templates else, but it also applies to
            # head parsing quite well.
            # node.largs[0][0] should always be str, but can't type-check
            # that.
            if is_panel_template(wxr, node.template_name):
                continue
            # skip these templates
            # if node.largs[0][0] in skip_these_templates_in_head:
            # first_head_tmplt = False # no first_head_tmplt at all
            # start_of_paragraph = False
            # continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not. Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists wiktextract issue #314

    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1:
            if pairless_pre_index is not None:
                # Attach this head-less list to the earlier list-less head.
                cleaned_lists[pairless_pre_index] = ls
                pairless_pre_index = None
                continue
        if pre1 and not ls:
            # Remember where this list-less head is stored.  This is
            # recorded only for heads that are actually kept: recording
            # it before the skip checks above left the index pointing at
            # a slot that was never appended, which either raised
            # IndexError or overwrote the lists of a later head.
            pairless_pre_index = len(cleaned_pre)
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []
    header_topics: list[str] = []
    previous_head_had_list = False

    if not any(g for g in lists):
        process_gloss_without_list(
            poschildren, pos, pos_data, header_tags, header_topics
        )
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            # if len(ls) == 0:
            # # don't have gloss list
            # # XXX add code here to filter out 'garbage', like text
            # # that isn't a head template or head.
            # continue

            if all(not sl for sl in lists[i:]):
                # Neither this head nor any later one has a list.
                # `node` below is whatever the most recent loop over
                # nodes left behind; it is only used for diagnostics.
                if i == 0:
                    # NOTE(review): this branch looks unreachable, since
                    # the enclosing `else` is only entered when at least
                    # one list is non-empty.
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "first head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1689/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        if (
                            node.largs
                            and node.largs[0]
                            and node.largs[0][0]
                            in [
                                "Han char",
                            ]
                        ):
                            # just ignore these templates
                            pass
                        else:
                            wxr.wtp.debug(
                                "first head without "
                                "list of senses, "
                                "template node "
                                "{}, {}/{}".format(
                                    node.largs, word, language
                                ),
                                sortid="page/1694/20221215",
                            )
                    else:
                        wxr.wtp.debug(
                            "first head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1700/20221215",
                        )
                    # no break here so that the first head always
                    # gets processed.
                else:
                    # One message per case; previously a string node
                    # triggered both the string and the generic message.
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1708/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "template node "
                            "{}, {}/{}".format(
                                node.sarg if node.sarg else node.largs,
                                word,
                                language,
                            ),
                            sortid="page/1713/20221215",
                        )
                    else:
                        wxr.wtp.debug(
                            "later head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1719/20221215",
                        )
                    break
            head_group = i + 1 if there_are_many_heads else None
            # print("parse_part_of_speech: {}: {}: pre={}"
            # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))

            if previous_head_had_list:
                # We use a boolean flag here because we want to be able
                # let the header_tags data pass through after the loop
                # is over without accidentally emptying it, if there are
                # no pos_datas and we need a dummy data.
                header_tags.clear()
                header_topics.clear()

            # print(f"{pre1=}")
            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags, header_topics
            )
            for ln in ls:
                # Parse each list associated with this head.
                for node in ln.children:
                    # Parse nodes in l.children recursively.
                    # The recursion function uses push_sense() to
                    # add stuff into sense_datas, and returns True or
                    # False if something is added, which bubbles upward.
                    # If the bubble is "True", then higher levels of
                    # the recursion will not push_sense(), because
                    # the data is already pushed into a sub-gloss
                    # downstream, unless the higher level has examples
                    # that need to be put somewhere.
                    common_data: SenseData = {
                        "tags": list(header_tags),
                        "topics": list(header_topics),
                    }
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]

            if len(ls) > 0:
                previous_head_had_list = True
            else:
                previous_head_had_list = False

    # If there are no senses extracted, add a dummy sense. We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(sense_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_extend(sense_data, "topics", header_topics)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()

    # Restore the order the senses were encountered in, then drop the
    # temporary sorting key.
    sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))  # type: ignore

    for sd in sense_datas:
        if "__temp_sense_sorting_ordinal" in sd:
            del sd["__temp_sense_sorting_ordinal"]  # type: ignore
# Accumulators shared by the nested head-parsing helpers below:
# head_post_template_fn() appends the template data it captures here, and
# process_gloss_header() reads term_label_templates to derive tags/topics
# and passes normal_label_templates on to parse_word_head().
1518 term_label_templates: list[TemplateData] = []
1519 normal_label_templates: list[TemplateData] = []
def head_post_template_fn(
    name: str, ht: TemplateArgs, expansion: str
) -> Optional[str]:
    """Post-expansion hook for templates found in a word's head section.

    The head section is the text after the part-of-speech subtitle and
    before the sense list.  Typically it generates the bold line for the
    word, but it may also contain other useful information that often
    ends up in side boxes; some of that is captured here.

    Returns "" to drop the template's expansion from the head text, a
    replacement string to substitute it, or None to keep the expansion
    unchanged.
    """
    # print("HEAD_POST_TEMPLATE_FN", name, ht)
    if is_panel_template(wxr, name):
        # Completely ignore these templates (not even recorded in
        # head_templates)
        return ""

    if name == "head":
        # XXX are these also captured in forms? Should this special case
        # be removed?
        head_kind = ht.get(2, "")
        if head_kind == "pinyin":
            data_append(pos_data, "tags", "Pinyin")
        elif head_kind == "romanization":
            data_append(pos_data, "tags", "romanization")

    is_word_level = name in WORD_LEVEL_HEAD_TEMPLATES
    if HEAD_TAG_RE.search(name) is not None or is_word_level:
        cleaned_args = clean_template_args(wxr, ht)
        head_template: TemplateData = {
            "name": name,
            "args": cleaned_args,
            "expansion": clean_node(wxr, None, expansion),
        }
        data_append(pos_data, "head_templates", head_template)
        if is_word_level:
            term_label_templates.append(head_template)
            # Squash these, their tags are applied to the whole word,
            # and some cause problems like "term-label"
            return ""

    # The following are both captured in head_templates and parsed
    # separately

    if name in wikipedia_templates:
        # Note: various places expect to have content from wikipedia
        # templates, so cannot convert this to empty
        parse_wikipedia_template(wxr, pos_data, ht)
        return None

    # Templates whose expansion is simply removed from the head text.
    # XXX several of these could be extracted instead: "number box"
    # (numeric value?), "enum", "cardinalbox" (similar to enum; can also
    # occur at top level under the language), "Han simplified forms" and
    # the picdic family.  The one example of "defdate" seen in a head
    # was weird.  "ja-kanji forms", "vi-readings" and "ja-kanji" are
    # deliberately not in this list.
    if name in (
        "number box",
        "enum",
        "cardinalbox",
        "Han simplified forms",
        "picdic",
        "picdicimg",
        "picdiclabel",
        "defdate",
    ):
        return ""

    if name in ("lb", "lbl", "label"):
        cleaned_args = clean_template_args(wxr, ht)
        label_text = clean_node(wxr, None, expansion).strip("()")
        normal_label_templates.append(
            {
                "name": name,
                "args": cleaned_args,
                "expansion": label_text,
            }
        )
        # The parens around __LABEL... below is meaningful: label
        # templates generate text with parens, so if we add the magical
        # phrase here with parens, it will look like a normal label that
        # will be handled as a parenthetical text; only when handling
        # parenthetical text do we need to actually access the contents
        # of the label.
        label_index = len(normal_label_templates) - 1
        return f"(__LABEL_TEMPLATE_{label_index}__)"

    return None
def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Parse one head (the nodes preceding a sense list) of a POS section.

    Steps, in order:

    * info templates are parsed out of ``header_nodes`` and stored under
      ``pos_data["info_templates"]``;
    * for Japanese, ruby annotations are extracted from the expanded head;
    * for Vietnamese, ``vi-readings`` templates are converted to
      ``related`` entries and removed from the head;
    * the remaining nodes are cleaned into text (collecting links) and
      passed to parse_word_head();
    * tags found on ``pos_data`` plus tags/topics decoded from the
      term-label templates are appended to ``header_tags`` and
      ``header_topics``, which the caller owns and reads afterwards.

    Returns early, before parse_word_head(), when the head text is empty.
    """
    ruby = []

    # process template parse nodes here
    new_nodes = []
    info_template_data = []
    for node in header_nodes:
        # print(f"{node=}")
        info_data, info_out = parse_info_template_node(wxr, node, "head")
        if info_data or info_out:
            if info_data:
                info_template_data.append(info_data)
            if info_out:  # including just the original node
                new_nodes.append(info_out)
        else:
            new_nodes.append(node)
    header_nodes = new_nodes

    if info_template_data:
        if "info_templates" not in pos_data:
            pos_data["info_templates"] = info_template_data
        else:
            pos_data["info_templates"].extend(info_template_data)

    if lang_code == "ja":
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            exp.children,
            lambda x: (
                isinstance(x, WikiNode)
                and x.kind == NodeKind.HTML
                and x.sarg == "ruby"
            ),
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # we know the lambda above in recursively_extract
                    # returns only WikiNodes in rub
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)
    elif lang_code == "vi":
        # Handle vi-readings templates that have a weird structures for
        # Chu Nom vietnamese characters heads
        # https://en.wiktionary.org/wiki/Template:vi-readings
        new_header_nodes = []
        related_readings: list[LinkageData] = []
        for node in header_nodes:
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "vi-readings"
            ):
                for parameter, tag in (
                    ("hanviet", "han-viet-reading"),
                    ("nom", "nom-reading"),
                    # we ignore the fanqie parameter "phienthiet"
                ):
                    arg = node.template_parameters.get(parameter)
                    if arg is not None:
                        text = clean_node(wxr, None, arg)
                        for w in text.split(","):
                            # ignore - separated references
                            if "-" in w:
                                w = w[: w.index("-")]
                            w = w.strip()
                            if not w:
                                # Nothing left (e.g. trailing comma or
                                # an item that was only a reference).
                                continue
                            related_readings.append(
                                LinkageData(word=w, tags=[tag])
                            )
                # Skip the vi-reading template for the rest of the head
                # parsing
                continue

            new_header_nodes.append(node)
        if len(related_readings) > 0:
            data_extend(pos_data, "related", related_readings)
            header_nodes = new_header_nodes

    header_text = clean_node(
        wxr,
        pos_data,
        header_nodes,
        post_template_fn=head_post_template_fn,
        collect_links=True,
        remove_anchors_from_links=True,
    )
    if "links" in pos_data:
        # WordData doesn't use `links`, so we can use `collect_links=True`
        # above without special handling and smuggle link data.
        extracted_links = pos_data["links"]  # type: ignore
        del pos_data["links"]  # type: ignore
    else:
        extracted_links = None
    # print(f"{header_text=}, {extracted_links=}")

    # Collapse all whitespace runs into single spaces.
    header_text = re.sub(r"\s+", " ", header_text).strip()

    if not header_text:
        return

    term_label_tags: list[str] = []
    term_label_topics: list[str] = []
    if len(term_label_templates) > 0:
        # parse term label templates; if there are other similar kinds
        # of templates in headers that you want to squash and apply as
        # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
        for templ_data in term_label_templates:
            # print(templ_data)
            expan = templ_data.get("expansion", "").strip("().,; ")
            if not expan:
                continue
            tlb_tagsets, tlb_topics = decode_tags(expan)
            for tlb_tags in tlb_tagsets:
                # Tag sets containing any "error-" tag are not used.
                if len(tlb_tags) > 0 and not any(
                    t.startswith("error-") for t in tlb_tags
                ):
                    term_label_tags.extend(tlb_tags)
            term_label_topics.extend(tlb_topics)
        # print(f"{tlb_tagsets=}, {tlb_topicsets=}")

    # print(f"{header_text=}")
    parse_word_head(
        wxr,
        word,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        header_nodes,
        ruby=ruby,
        links=extracted_links,
        label_templates=normal_label_templates,
    )
    if "tags" in pos_data:
        # pos_data can get "tags" data from some source; type-checkers
        # doesn't like it, so let's ignore it.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    if len(term_label_tags) > 0:
        header_tags.extend(term_label_tags)
    if len(term_label_topics) > 0:
        header_topics.extend(term_label_topics)
def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Handle a POS section whose gloss text is not inside a list.

    The stripped nodes are split into head templates (``head`` or any
    template whose name starts with ``"<lang_code>-"``) and gloss nodes.
    Soft-redirect templates are dropped and scanning stops at the first
    level (heading) node.  The head, if any, is parsed first so that the
    tags and topics it yields can seed the sense built from the gloss.
    """
    soft_redirects = ("zh-see", "ja-see", "ja-see-kango")
    head_prefix = f"{lang_code}-"

    header_nodes: list[Union[str, WikiNode]] = []
    gloss_nodes: list[Union[str, WikiNode]] = []
    for item in strip_nodes(nodes):
        if isinstance(item, TemplateNode):
            template_name = item.template_name
            if template_name in soft_redirects:
                continue  # soft redirect
            if template_name == "head" or template_name.startswith(
                head_prefix
            ):
                header_nodes.append(item)
                continue
        elif isinstance(item, WikiNode) and item.kind in LEVEL_KINDS:
            # following nodes are not gloss
            break
        gloss_nodes.append(item)

    if header_nodes:
        process_gloss_header(
            header_nodes,
            pos_type,
            None,
            pos_data,
            header_tags,
            header_topics,
        )
    if gloss_nodes:
        sense_seed = {
            "tags": list(header_tags),
            "topics": list(header_topics),
        }
        process_gloss_contents(gloss_nodes, pos_type, sense_seed)
1815 def parse_sense_node(
1816 node: Union[str, WikiNode], # never receives str
1817 sense_base: SenseData,
1818 pos: str,
1819 ) -> bool:
1820 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1821 Uses push_sense() to attempt adding data to pos_data in the scope
1822 of parse_language() when it reaches deep in the recursion. push_sense()
1823 returns True if it succeeds, and that is bubbled up the stack; if
1824 a sense was added downstream, the higher levels (whose shared data
1825 was already added by a subsense) do not push_sense(), unless it
1826 has examples that need to be put somewhere.
1827 """
1828 assert isinstance(sense_base, dict) # Added to every sense deeper in
1830 nonlocal sense_ordinal
1831 my_ordinal = sense_ordinal # copies, not a reference
1832 sense_ordinal += 1 # only use for sorting
1834 if not isinstance(node, WikiNode): 1834 ↛ 1836line 1834 didn't jump to line 1836 because the condition on line 1834 was never true
1835 # This doesn't seem to ever happen in practice.
1836 wxr.wtp.debug(
1837 "{}: parse_sense_node called with"
1838 "something that isn't a WikiNode".format(pos),
1839 sortid="page/1287/20230119",
1840 )
1841 return False
1843 if node.kind != NodeKind.LIST_ITEM: 1843 ↛ 1844line 1843 didn't jump to line 1844 because the condition on line 1843 was never true
1844 wxr.wtp.debug(
1845 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1846 )
1847 return False
1849 if node.sarg == ":":
1850 # Skip example entries at the highest level, ones without
1851 # a sense ("...#") above them.
1852 # If node.sarg is exactly and only ":", then it's at
1853 # the highest level; lower levels would have more
1854 # "indentation", like "#:" or "##:"
1855 return False
1857 # If a recursion call succeeds in push_sense(), bubble it up with
1858 # `added`.
1859 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1860 added = False
1862 gloss_template_args: set[str] = set()
1864 # For LISTs and LIST_ITEMS, their argument is something like
1865 # "##" or "##:", and using that we can rudimentally determine
1866 # list 'depth' if need be, and also what kind of list or
1867 # entry it is; # is for normal glosses, : for examples (indent)
1868 # and * is used for quotations on wiktionary.
1869 current_depth = node.sarg
1871 children = node.children
1873 # subentries, (presumably) a list
1874 # of subglosses below this. The list's
1875 # argument ends with #, and its depth should
1876 # be bigger than parent node.
1877 subentries = [
1878 x
1879 for x in children
1880 if isinstance(x, WikiNode)
1881 and x.kind == NodeKind.LIST
1882 and x.sarg == current_depth + "#"
1883 ]
1885 # sublists of examples and quotations. .sarg
1886 # does not end with "#".
1887 others = [
1888 x
1889 for x in children
1890 if isinstance(x, WikiNode)
1891 and x.kind == NodeKind.LIST
1892 and x.sarg != current_depth + "#"
1893 ]
1895 # the actual contents of this particular node.
1896 # can be a gloss (or a template that expands into
1897 # many glosses which we can't easily pre-expand)
1898 # or could be an "outer gloss" with more specific
1899 # subglosses, or could be a qualfier for the subglosses.
1900 contents = [
1901 x
1902 for x in children
1903 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1904 ]
1905 # If this entry has sublists of entries, we should combine
1906 # gloss information from both the "outer" and sublist content.
1907 # Sometimes the outer gloss
1908 # is more non-gloss or tags, sometimes it is a coarse sense
1909 # and the inner glosses are more specific. The outer one
1910 # does not seem to have qualifiers.
1912 # If we have one sublist with one element, treat it
1913 # specially as it may be a Wiktionary error; raise
1914 # that nested element to the same level.
1915 # XXX If need be, this block can be easily removed in
1916 # the current recursive logicand the result is one sense entry
1917 # with both glosses in the glosses list, as you would
1918 # expect. If the higher entry has examples, there will
1919 # be a higher entry with some duplicated data.
1920 if len(subentries) == 1:
1921 slc = subentries[0].children
1922 if len(slc) == 1:
1923 # copy current node and modify it so it doesn't
1924 # loop infinitely.
1925 cropped_node = copy.copy(node)
1926 cropped_node.children = [
1927 x
1928 for x in children
1929 if not (
1930 isinstance(x, WikiNode)
1931 and x.kind == NodeKind.LIST
1932 and x.sarg == current_depth + "#"
1933 )
1934 ]
1935 added |= parse_sense_node(cropped_node, sense_base, pos)
1936 nonlocal sense_data # this kludge causes duplicated raw_
1937 # glosses data if this is not done;
1938 # if the top-level (cropped_node)
1939 # does not push_sense() properly or
1940 # parse_sense_node() returns early,
1941 # sense_data is not reset. This happens
1942 # for example when you have a no-gloss
1943 # string like "(intransitive)":
1944 # no gloss, push_sense() returns early
1945 # and sense_data has duplicate data with
1946 # sense_base
1947 sense_data = {}
1948 added |= parse_sense_node(slc[0], sense_base, pos)
1949 return added
1951 return process_gloss_contents(
1952 contents,
1953 pos,
1954 sense_base,
1955 subentries,
1956 others,
1957 gloss_template_args,
1958 added,
1959 my_ordinal,
1960 )
1962 def process_gloss_contents(
1963 contents: list[Union[str, WikiNode]],
1964 pos: str,
1965 sense_base: SenseData,
1966 subentries: list[WikiNode] = [],
1967 others: list[WikiNode] = [],
1968 gloss_template_args: Set[str] = set(),
1969 added: bool = False,
1970 sorting_ordinal: int | None = None,
1971 ) -> bool:
1972 def sense_template_fn(
1973 name: str, ht: TemplateArgs, is_gloss: bool = False
1974 ) -> Optional[str]:
1975 # print(f"sense_template_fn: {name}, {ht}")
1976 if name in wikipedia_templates:
1977 # parse_wikipedia_template(wxr, pos_data, ht)
1978 return None
1979 if is_panel_template(wxr, name):
1980 return ""
1981 if name in INFO_TEMPLATE_FUNCS:
1982 info_data, info_exp = parse_info_template_arguments(
1983 wxr, name, ht, "sense"
1984 )
1985 if info_data or info_exp: 1985 ↛ 1991line 1985 didn't jump to line 1991 because the condition on line 1985 was always true
1986 if info_data: 1986 ↛ 1988line 1986 didn't jump to line 1988 because the condition on line 1986 was always true
1987 data_append(sense_base, "info_templates", info_data)
1988 if info_exp and isinstance(info_exp, str): 1988 ↛ 1990line 1988 didn't jump to line 1990 because the condition on line 1988 was always true
1989 return info_exp
1990 return ""
1991 if name in ("defdate",):
1992 date = clean_node(wxr, None, ht.get(1, ()))
1993 if part_two := ht.get(2): 1993 ↛ 1995line 1993 didn't jump to line 1995 because the condition on line 1993 was never true
1994 # Unicode mdash, not '-'
1995 date += "–" + clean_node(wxr, None, part_two)
1996 refs: dict[str, ReferenceData] = {}
1997 # ref, refn, ref2, ref2n, ref3, ref3n
1998 # ref1 not valid
1999 for k, v in sorted(
2000 (k, v) for k, v in ht.items() if isinstance(k, str)
2001 ):
2002 if m := re.match(r"ref(\d?)(n?)", k): 2002 ↛ 1999line 2002 didn't jump to line 1999 because the condition on line 2002 was always true
2003 ref_v = clean_node(wxr, None, v)
2004 if m.group(1) not in refs: # empty string or digit
2005 refs[m.group(1)] = ReferenceData()
2006 if m.group(2):
2007 refs[m.group(1)]["refn"] = ref_v
2008 else:
2009 refs[m.group(1)]["text"] = ref_v
2010 data_append(
2011 sense_base,
2012 "attestations",
2013 AttestationData(date=date, references=list(refs.values())),
2014 )
2015 return ""
2016 if name == "senseid":
2017 langid = clean_node(wxr, None, ht.get(1, ()))
2018 arg = clean_node(wxr, sense_base, ht.get(2, ()))
2019 if re.match(r"Q\d+$", arg):
2020 data_append(sense_base, "wikidata", arg)
2021 data_append(sense_base, "senseid", langid + ":" + arg)
2022 if name in sense_linkage_templates:
2023 # print(f"SENSE_TEMPLATE_FN: {name}")
2024 parse_sense_linkage(wxr, sense_base, name, ht, pos)
2025 return ""
2026 if name == "†" or name == "zh-obsolete":
2027 data_append(sense_base, "tags", "obsolete")
2028 return ""
2029 if name in {
2030 "ux",
2031 "uxi",
2032 "usex",
2033 "afex",
2034 "prefixusex",
2035 "ko-usex",
2036 "ko-x",
2037 "hi-x",
2038 "ja-usex-inline",
2039 "ja-x",
2040 "quotei",
2041 "he-x",
2042 "hi-x",
2043 "km-x",
2044 "ne-x",
2045 "shn-x",
2046 "th-x",
2047 "ur-x",
2048 }:
2049 # Usage examples are captured separately below. We don't
2050 # want to expand them into glosses even when unusual coding
2051 # is used in the entry.
2052 # These templates may slip through inside another item, but
2053 # currently we're separating out example entries (..#:)
2054 # well enough that there seems to be very little contamination.
2055 if is_gloss:
2056 wxr.wtp.wiki_notice(
2057 "Example template is used for gloss text",
2058 sortid="extractor.en.page.sense_template_fn/1415",
2059 )
2060 else:
2061 return ""
2062 if name == "w": 2062 ↛ 2063line 2062 didn't jump to line 2063 because the condition on line 2062 was never true
2063 if ht.get(2) == "Wp":
2064 return ""
2065 for v in ht.values():
2066 v = v.strip()
2067 if v and "<" not in v:
2068 gloss_template_args.add(v)
2069 return None
2071 def extract_link_texts(item: GeneralNode) -> None:
2072 """Recursively extracts link texts from the gloss source. This
2073 information is used to select whether to remove final "." from
2074 form_of/alt_of (e.g., ihm/Hunsrik)."""
2075 if isinstance(item, (list, tuple)):
2076 for x in item:
2077 extract_link_texts(x)
2078 return
2079 if isinstance(item, str):
2080 # There seem to be HTML sections that may futher contain
2081 # unparsed links.
2082 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2082 ↛ 2083line 2082 didn't jump to line 2083 because the loop on line 2082 never started
2083 print("ITER:", m.group(0))
2084 v = m.group(1).split("|")[-1].strip()
2085 if v:
2086 gloss_template_args.add(v)
2087 return
2088 if not isinstance(item, WikiNode): 2088 ↛ 2089line 2088 didn't jump to line 2089 because the condition on line 2088 was never true
2089 return
2090 if item.kind == NodeKind.LINK:
2091 v = item.largs[-1]
2092 if ( 2092 ↛ 2098line 2092 didn't jump to line 2098 because the condition on line 2092 was always true
2093 isinstance(v, list)
2094 and len(v) == 1
2095 and isinstance(v[0], str)
2096 ):
2097 gloss_template_args.add(v[0].strip())
2098 for x in item.children:
2099 extract_link_texts(x)
2101 extract_link_texts(contents)
2103 # get the raw text of non-list contents of this node, and other stuff
2104 # like tag and category data added to sense_base
2105 # cast = no-op type-setter for the type-checker
2106 partial_template_fn = cast(
2107 TemplateFnCallable,
2108 partial(sense_template_fn, is_gloss=True),
2109 )
2110 rawgloss = clean_node(
2111 wxr,
2112 sense_base,
2113 contents,
2114 template_fn=partial_template_fn,
2115 collect_links=True,
2116 )
2118 if not rawgloss: 2118 ↛ 2119line 2118 didn't jump to line 2119 because the condition on line 2118 was never true
2119 return False
2121 # remove manually typed ordered list text at the start("1. ")
2122 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2124 # get stuff like synonyms and categories from "others",
2125 # maybe examples and quotations
2126 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2128 # The gloss could contain templates that produce more list items.
2129 # This happens commonly with, e.g., {{inflection of|...}}. Split
2130 # to parts. However, e.g. Interlingua generates multiple glosses
2131 # in HTML directly without Wikitext markup, so we must also split
2132 # by just newlines.
2133 subglosses = rawgloss.splitlines()
2135 if len(subglosses) == 0: 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true
2136 return False
2138 if any(s.startswith("#") for s in subglosses):
2139 subtree = wxr.wtp.parse(rawgloss)
2140 # from wikitextprocessor.parser import print_tree
2141 # print("SUBTREE GENERATED BY TEMPLATE:")
2142 # print_tree(subtree)
2143 new_subentries = [
2144 x
2145 for x in subtree.children
2146 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2147 ]
2149 new_others = [
2150 x
2151 for x in subtree.children
2152 if isinstance(x, WikiNode)
2153 and x.kind == NodeKind.LIST
2154 and not x.sarg.endswith("#")
2155 ]
2157 new_contents = [
2158 clean_node(wxr, [], x)
2159 for x in subtree.children
2160 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2161 ]
2163 subentries = subentries or new_subentries
2164 others = others or new_others
2165 subglosses = new_contents
2166 rawgloss = "".join(subglosses)
2167 # Generate no gloss for translation hub pages, but add the
2168 # "translation-hub" tag for them
2169 if rawgloss == "(This entry is a translation hub.)": 2169 ↛ 2170line 2169 didn't jump to line 2170 because the condition on line 2169 was never true
2170 data_append(sense_data, "tags", "translation-hub")
2171 return push_sense(sorting_ordinal)
2173 # Remove certain substrings specific to outer glosses
2174 strip_ends = [", particularly:"]
2175 for x in strip_ends:
2176 if rawgloss.endswith(x):
2177 rawgloss = rawgloss[: -len(x)].strip()
2178 break
2180 # A single gloss, or possibly an outer gloss.
2181 # Check if the possible outer gloss starts with
2182 # parenthesized tags/topics
2184 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2185 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2186 m = QUALIFIERS_RE.match(rawgloss)
2187 # (...): ... or (...(...)...): ...
2188 if m:
2189 q = m.group(1)
2190 rawgloss = rawgloss[m.end() :].strip()
2191 parse_sense_qualifier(wxr, q, sense_base)
2192 if rawgloss == "A pejorative:": 2192 ↛ 2193line 2192 didn't jump to line 2193 because the condition on line 2192 was never true
2193 data_append(sense_base, "tags", "pejorative")
2194 rawgloss = ""
2195 elif rawgloss == "Short forms.": 2195 ↛ 2196line 2195 didn't jump to line 2196 because the condition on line 2195 was never true
2196 data_append(sense_base, "tags", "abbreviation")
2197 rawgloss = ""
2198 elif rawgloss == "Technical or specialized senses.": 2198 ↛ 2199line 2198 didn't jump to line 2199 because the condition on line 2198 was never true
2199 rawgloss = ""
2200 elif rawgloss.startswith("inflection of "):
2201 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2202 if parsed is not None: 2202 ↛ 2211line 2202 didn't jump to line 2211 because the condition on line 2202 was always true
2203 tags, origins = parsed
2204 if origins is not None: 2204 ↛ 2206line 2204 didn't jump to line 2206 because the condition on line 2204 was always true
2205 data_extend(sense_base, "form_of", origins)
2206 if tags is not None: 2206 ↛ 2209line 2206 didn't jump to line 2209 because the condition on line 2206 was always true
2207 data_extend(sense_base, "tags", tags)
2208 else:
2209 data_append(sense_base, "tags", "form-of")
2210 else:
2211 data_append(sense_base, "tags", "form-of")
2212 if rawgloss: 2212 ↛ 2243line 2212 didn't jump to line 2243 because the condition on line 2212 was always true
2213 # Code duplicating a lot of clean-up operations from later in
2214 # this block. We want to clean up the "supergloss" as much as
2215 # possible, in almost the same way as a normal gloss.
2216 supergloss = rawgloss
2218 if supergloss.startswith("; "): 2218 ↛ 2219line 2218 didn't jump to line 2219 because the condition on line 2218 was never true
2219 supergloss = supergloss[1:].strip()
2221 if supergloss.startswith(("^†", "†")):
2222 data_append(sense_base, "tags", "obsolete")
2223 supergloss = supergloss[2:].strip()
2224 elif supergloss.startswith("^‡"): 2224 ↛ 2225line 2224 didn't jump to line 2225 because the condition on line 2224 was never true
2225 data_extend(sense_base, "tags", ["obsolete", "historical"])
2226 supergloss = supergloss[2:].strip()
2228 # remove [14th century...] style brackets at the end
2229 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2231 if supergloss.startswith((",", ":")):
2232 supergloss = supergloss[1:]
2233 supergloss = supergloss.strip()
2234 if supergloss.startswith("N. of "): 2234 ↛ 2235line 2234 didn't jump to line 2235 because the condition on line 2234 was never true
2235 supergloss = "Name of " + supergloss[6:]
2236 supergloss = supergloss[2:]
2237 data_append(sense_base, "glosses", supergloss)
2238 if supergloss in ("A person:",):
2239 data_append(sense_base, "tags", "g-person")
2241 # The main recursive call (except for the exceptions at the
2242 # start of this function).
2243 for sublist in subentries:
2244 if not ( 2244 ↛ 2247line 2244 didn't jump to line 2247 because the condition on line 2244 was never true
2245 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2246 ):
2247 wxr.wtp.debug(
2248 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2249 f"with items that are not LISTs",
2250 sortid="page/1511/20230119",
2251 )
2252 continue
2253 for item in sublist.children:
2254 if not ( 2254 ↛ 2258line 2254 didn't jump to line 2258 because the condition on line 2254 was never true
2255 isinstance(item, WikiNode)
2256 and item.kind == NodeKind.LIST_ITEM
2257 ):
2258 continue
2259 # copy sense_base to prevent cross-contamination between
2260 # subglosses and other subglosses and superglosses
2261 sense_base2 = copy.deepcopy(sense_base)
2262 if parse_sense_node(item, sense_base2, pos): 2262 ↛ 2253line 2262 didn't jump to line 2253 because the condition on line 2262 was always true
2263 added = True
2265 # Capture examples.
2266 # This is called after the recursive calls above so that
2267 # sense_base is not contaminated with meta-data from
2268 # example entries for *this* gloss.
2269 examples = []
2270 if wxr.config.capture_examples: 2270 ↛ 2274line 2270 didn't jump to line 2274 because the condition on line 2270 was always true
2271 examples = extract_examples(others, sense_base)
2273 # push_sense() succeeded somewhere down-river, so skip this level
2274 if added:
2275 if examples:
2276 # this higher-up gloss has examples that we do not want to skip
2277 wxr.wtp.debug(
2278 "'{}[...]' gloss has examples we want to keep, "
2279 "but there are subglosses.".format(repr(rawgloss[:30])),
2280 sortid="page/1498/20230118",
2281 )
2282 else:
2283 return True
2285 # Some entries, e.g., "iacebam", have weird sentences in quotes
2286 # after the gloss, but these sentences don't seem to be intended
2287 # as glosses. Skip them.
2288 indexed_subglosses = list(
2289 (i, gl)
2290 for i, gl in enumerate(subglosses)
2291 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2292 )
2294 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2294 ↛ 2295line 2294 didn't jump to line 2295 because the condition on line 2294 was never true
2295 gl = indexed_subglosses[0][1].strip()
2296 if gl.endswith(":"):
2297 gl = gl[:-1].strip()
2298 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2299 if parsed is not None:
2300 infl_tags, infl_dts = parsed
2301 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2302 # Interpret others as a particular form under
2303 # "inflection of"
2304 data_extend(sense_base, "tags", infl_tags)
2305 data_extend(sense_base, "form_of", infl_dts)
2306 indexed_subglosses = indexed_subglosses[1:]
2307 elif not infl_dts:
2308 data_extend(sense_base, "tags", infl_tags)
2309 indexed_subglosses = indexed_subglosses[1:]
2311 # Create senses for remaining subglosses
2312 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2313 gloss = gloss.strip()
2314 if not gloss and len(indexed_subglosses) > 1: 2314 ↛ 2315line 2314 didn't jump to line 2315 because the condition on line 2314 was never true
2315 continue
2316 # Push a new sense (if the last one is not empty)
2317 if push_sense(sorting_ordinal): 2317 ↛ 2318line 2317 didn't jump to line 2318 because the condition on line 2317 was never true
2318 added = True
2319 # if gloss not in sense_data.get("raw_glosses", ()):
2320 # data_append(sense_data, "raw_glosses", gloss)
2321 if i == 0 and examples:
2322 # In a multi-line gloss, associate examples
2323 # with only one of them.
2324 # XXX or you could use gloss_i == len(indexed_subglosses)
2325 # to associate examples with the *last* one.
2326 data_extend(sense_data, "examples", examples)
2327 if gloss.startswith("; ") and gloss_i > 0: 2327 ↛ 2328line 2327 didn't jump to line 2328 because the condition on line 2327 was never true
2328 gloss = gloss[1:].strip()
2329 # If the gloss starts with †, mark as obsolete
2330 if gloss.startswith("^†"): 2330 ↛ 2331line 2330 didn't jump to line 2331 because the condition on line 2330 was never true
2331 data_append(sense_data, "tags", "obsolete")
2332 gloss = gloss[2:].strip()
2333 elif gloss.startswith("^‡"): 2333 ↛ 2334line 2333 didn't jump to line 2334 because the condition on line 2333 was never true
2334 data_extend(sense_data, "tags", ["obsolete", "historical"])
2335 gloss = gloss[2:].strip()
2336 # Copy data for all senses to this sense
2337 for k, v in sense_base.items():
2338 if isinstance(v, (list, tuple)):
2339 if k != "tags":
2340 # Tags handled below (countable/uncountable special)
2341 data_extend(sense_data, k, v)
2342 else:
2343 assert k not in ("tags", "categories", "topics")
2344 sense_data[k] = v # type:ignore[literal-required]
2345 # Parse the gloss for this particular sense
2346 m = QUALIFIERS_RE.match(gloss)
2347 # (...): ... or (...(...)...): ...
2348 if m:
2349 parse_sense_qualifier(wxr, m.group(1), sense_data)
2350 gloss = gloss[m.end() :].strip()
2352 # Remove common suffix "[from 14th c.]" and similar
2353 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2355 # Check to make sure we don't have unhandled list items in gloss
2356 ofs = max(gloss.find("#"), gloss.find("* "))
2357 if ofs > 10 and "(#)" not in gloss:
2358 wxr.wtp.debug(
2359 "gloss may contain unhandled list items: {}".format(gloss),
2360 sortid="page/1412",
2361 )
2362 elif "\n" in gloss: 2362 ↛ 2363line 2362 didn't jump to line 2363 because the condition on line 2362 was never true
2363 wxr.wtp.debug(
2364 "gloss contains newline: {}".format(gloss),
2365 sortid="page/1416",
2366 )
2368 # Kludge, some glosses have a comma after initial qualifiers in
2369 # parentheses
2370 if gloss.startswith((",", ":")):
2371 gloss = gloss[1:]
2372 gloss = gloss.strip()
2373 if gloss.endswith(":"):
2374 gloss = gloss[:-1].strip()
2375 if gloss.startswith("N. of "): 2375 ↛ 2376line 2375 didn't jump to line 2376 because the condition on line 2375 was never true
2376 gloss = "Name of " + gloss[6:]
2377 if gloss.startswith("†"): 2377 ↛ 2378line 2377 didn't jump to line 2378 because the condition on line 2377 was never true
2378 data_append(sense_data, "tags", "obsolete")
2379 gloss = gloss[1:]
2380 elif gloss.startswith("^†"): 2380 ↛ 2381line 2380 didn't jump to line 2381 because the condition on line 2380 was never true
2381 data_append(sense_data, "tags", "obsolete")
2382 gloss = gloss[2:]
2384 # Copy tags from sense_base if any. This will not copy
2385 # countable/uncountable if either was specified in the sense,
2386 # as sometimes both are specified in word head but only one
2387 # in individual senses.
2388 countability_tags = []
2389 base_tags = sense_base.get("tags", ())
2390 sense_tags = sense_data.get("tags", ())
2391 for tag in base_tags:
2392 if tag in ("countable", "uncountable"):
2393 if tag not in countability_tags: 2393 ↛ 2395line 2393 didn't jump to line 2395 because the condition on line 2393 was always true
2394 countability_tags.append(tag)
2395 continue
2396 if tag not in sense_tags:
2397 data_append(sense_data, "tags", tag)
2398 if countability_tags:
2399 if ( 2399 ↛ 2408line 2399 didn't jump to line 2408 because the condition on line 2399 was always true
2400 "countable" not in sense_tags
2401 and "uncountable" not in sense_tags
2402 ):
2403 data_extend(sense_data, "tags", countability_tags)
2405 # If outer gloss specifies a form-of ("inflection of", see
2406 # aquamarine/German), try to parse the inner glosses as
2407 # tags for an inflected form.
2408 if "form-of" in sense_base.get("tags", ()):
2409 parsed = parse_alt_or_inflection_of(
2410 wxr, gloss, gloss_template_args
2411 )
2412 if parsed is not None: 2412 ↛ 2418line 2412 didn't jump to line 2418 because the condition on line 2412 was always true
2413 infl_tags, infl_dts = parsed
2414 if not infl_dts and infl_tags: 2414 ↛ 2418line 2414 didn't jump to line 2418 because the condition on line 2414 was always true
2415 # Interpret as a particular form under "inflection of"
2416 data_extend(sense_data, "tags", infl_tags)
2418 if not gloss: 2418 ↛ 2419line 2418 didn't jump to line 2419 because the condition on line 2418 was never true
2419 data_append(sense_data, "tags", "empty-gloss")
2420 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2421 if ( 2421 ↛ 2432line 2421 didn't jump to line 2432 because the condition on line 2421 was always true
2422 gloss_i == 0
2423 and len(sense_data.get("glosses", tuple())) >= 1
2424 ):
2425 # If we added a "high-level gloss" from rawgloss, but this
2426 # is that same gloss_i, add this instead of the raw_gloss
2427 # from before if they're different: the rawgloss was not
2428 # cleaned exactly the same as this later gloss
2429 sense_data["glosses"][-1] = gloss
2430 else:
2431 # Add the gloss for the sense.
2432 data_append(sense_data, "glosses", gloss)
2434 # Kludge: there are cases (e.g., etc./Swedish) where there are
2435 # two abbreviations in the same sense, both generated by the
2436 # {{abbreviation of|...}} template. Handle these with some magic.
2437 position = 0
2438 split_glosses = []
2439 for m in re.finditer(r"Abbreviation of ", gloss):
2440 if m.start() != position: 2440 ↛ 2439line 2440 didn't jump to line 2439 because the condition on line 2440 was always true
2441 split_glosses.append(gloss[position : m.start()])
2442 position = m.start()
2443 split_glosses.append(gloss[position:])
2444 for gloss in split_glosses:
2445 # Check if this gloss describes an alt-of or inflection-of
2446 if (
2447 lang_code != "en"
2448 and " " not in gloss
2449 and distw([word], gloss) < 0.3
2450 ):
2451 # Don't try to parse gloss if it is one word
2452 # that is close to the word itself for non-English words
2453 # (probable translations of a tag/form name)
2454 continue
2455 parsed = parse_alt_or_inflection_of(
2456 wxr, gloss, gloss_template_args
2457 )
2458 if parsed is None:
2459 continue
2460 tags, dts = parsed
2461 if not dts and tags:
2462 data_extend(sense_data, "tags", tags)
2463 continue
2464 for dt in dts: # type:ignore[union-attr]
2465 ftags = list(tag for tag in tags if tag != "form-of")
2466 if "alt-of" in tags:
2467 data_extend(sense_data, "tags", ftags)
2468 data_append(sense_data, "alt_of", dt)
2469 elif "compound-of" in tags: 2469 ↛ 2470line 2469 didn't jump to line 2470 because the condition on line 2469 was never true
2470 data_extend(sense_data, "tags", ftags)
2471 data_append(sense_data, "compound_of", dt)
2472 elif "synonym-of" in tags: 2472 ↛ 2473line 2472 didn't jump to line 2473 because the condition on line 2472 was never true
2473 data_extend(dt, "tags", ftags)
2474 data_append(sense_data, "synonyms", dt)
2475 elif tags and dt.get("word", "").startswith("of "): 2475 ↛ 2476line 2475 didn't jump to line 2476 because the condition on line 2475 was never true
2476 dt["word"] = dt["word"][3:]
2477 data_append(sense_data, "tags", "form-of")
2478 data_extend(sense_data, "tags", ftags)
2479 data_append(sense_data, "form_of", dt)
2480 elif "form-of" in tags: 2480 ↛ 2464line 2480 didn't jump to line 2464 because the condition on line 2480 was always true
2481 data_extend(sense_data, "tags", tags)
2482 data_append(sense_data, "form_of", dt)
2484 if len(sense_data) == 0:
2485 if len(sense_base.get("tags", [])) == 0: 2485 ↛ 2487line 2485 didn't jump to line 2487 because the condition on line 2485 was always true
2486 del sense_base["tags"]
2487 sense_data.update(sense_base)
2488 if push_sense(sorting_ordinal): 2488 ↛ 2492line 2488 didn't jump to line 2492 because the condition on line 2488 was always true
2490 # push_sense succeeded in adding a sense to pos_data
2490 added = True
2491 # print("PARSE_SENSE DONE:", pos_datas[-1])
2492 return added
def parse_inflection(
    node: WikiNode, section: str, pos: Optional[str]
) -> None:
    """Parse inflection data (declension, conjugation) from a section.

    The children of ``node`` are converted back to wikitext and split into
    one chunk per top-level template or table.  Each non-blank chunk is
    expanded and parsed; during expansion, templates whose names look like
    inflection templates are recorded (name and cleaned arguments) under
    "inflection_templates" in ``pos_data``.  When
    ``wxr.config.capture_inflections`` is set, each parsed chunk is also
    passed to ``parse_inflection_section``.

    Args:
        node: Section node whose children hold the inflection content.
        section: Section title; forwarded to the table parser and used in
            error messages.
        pos: Current part of speech.  When None, a debug message is logged
            and nothing is parsed.
    """
    assert isinstance(node, WikiNode)
    assert isinstance(section, str)
    assert pos is None or isinstance(pos, str)
    # print("parse_inflection:", node)

    if pos is None:
        wxr.wtp.debug(
            "inflection table outside part-of-speech", sortid="page/1812"
        )
        return

    def inflection_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        """Record inflection templates while letting them expand."""
        # print("decl_conj_template_fn", name, ht)
        if is_panel_template(wxr, name):
            return ""
        if name in ("is-u-mutation",):
            # These are not to be captured as an exception to the
            # generic code below
            return None
        name_match = re.search(
            r"-(conj|decl|ndecl|adecl|infl|conjugation|"
            r"declension|inflection|mut|mutation)($|-)",
            name,
        )
        if name_match:
            args_ht = clean_template_args(wxr, ht)
            dt = {"name": name, "args": args_ht}
            data_append(pos_data, "inflection_templates", dt)

        return None

    # Convert the subtree back to Wikitext, then expand all and parse,
    # capturing templates in the process
    text = wxr.wtp.node_to_wikitext(node.children)

    # Split text into separate sections for each top-level template
    brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
    # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
    # The (?:...) creates a non-capturing regex group; if it was capturing,
    # like the group around it, it would create elements in brace_matches,
    # including None if it doesn't match.
    # 20250114: Added {| and |} into the regex because tables were being
    # cut into pieces by this code. Issue #973, introduction of two-part
    # book-end templates similar to trans-top and tran-bottom.
    template_sections: list[list[str]] = []
    # Each run of opening braces ("{{", "{{{", ...) raises the template
    # depth by one and each run of closing braces lowers it by one; only
    # depth zero (the outermost templates) matters for splitting.
    template_nesting = 0
    # A stray table ({| ... |}) should always be its own section, and
    # should prevent templates from cutting it into sections.
    table_nesting = 0
    # Set when negative nesting is detected; the splitting result is then
    # discarded and the whole text is parsed as a single chunk.
    split_failed = False

    # print(f"Parse inflection: {text=}")
    # print(f"Brace matches: {repr('///'.join(brace_matches))}")
    if len(brace_matches) > 1:
        tsection: list[str] = []
        # Kludge to keep any text before the first template with the first
        # template; otherwise, text goes with the preceding template.
        after_templates = False
        for piece in brace_matches:
            if piece.startswith("\n; ") and after_templates:
                after_templates = False
                template_sections.append(tsection)
                tsection = []
                tsection.append(piece)
            elif piece.startswith("{{") or piece.endswith("{|"):
                if (
                    template_nesting == 0
                    and after_templates
                    and table_nesting == 0
                ):
                    # start new section
                    template_sections.append(tsection)
                    tsection = []
                after_templates = True
                if piece.startswith("{{"):
                    template_nesting += 1
                else:
                    # piece.endswith("{|")
                    table_nesting += 1
                tsection.append(piece)
            elif piece.startswith("}}") or piece.endswith("|}"):
                if piece.startswith("}}"):
                    template_nesting -= 1
                    if template_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested braces, "
                            "couldn't split inflection templates, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/1871",
                        )
                        split_failed = True
                        break
                else:
                    table_nesting -= 1
                    if table_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested table braces, "
                            "couldn't split inflection section, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/20250114",
                        )
                        split_failed = True
                        break
                tsection.append(piece)
            else:
                tsection.append(piece)
        if split_failed:
            # Use the whole text.  The partial section collected so far
            # must not be kept, or it would replace the whole-text
            # fallback and silently drop the rest of the section.
            template_sections = []
        elif tsection:  # dangling tsection
            template_sections.append(tsection)
            # Why do it this way around? The parser has a preference
            # to associate bits outside of tables with the preceding
            # table (`after`-variable), so a new tsection begins
            # at {{ and everything before it belongs to the previous
            # template.

    if template_sections:
        texts = ["".join(tsection) for tsection in template_sections]
    else:
        texts = [text]
    if template_nesting != 0:
        wxr.wtp.error(
            "Template nesting error: "
            "template_nesting = {} "
            "couldn't split inflection templates, "
            "{}/{} section {}".format(
                template_nesting, word, language, section
            ),
            sortid="page/1896",
        )
        texts = [text]
    for chunk in texts:
        # Blank chunks contain nothing to expand or parse.
        if not chunk.strip():
            continue

        tree = wxr.wtp.parse(
            chunk, expand_all=True, template_fn=inflection_template_fn
        )

        # Parse inflection tables from the section. The data is stored
        # under "forms".
        if wxr.config.capture_inflections:
            tablecontext = None
            # The name of the first template in the chunk, if any, is
            # given to the table parser as context.
            first_template = re.search(r"{{([^}{|]+)\|?", chunk)
            if first_template:
                template_name = first_template.group(1).strip()
                tablecontext = TableContext(template_name)

            parse_inflection_section(
                wxr,
                pos_data,
                word,
                language,
                pos,
                section,
                tree,
                tablecontext=tablecontext,
            )
def get_subpage_section(
    title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
) -> Optional[Union[WikiNode, str]]:
    """Load a subpage of the current word and find a section in it.

    The subpage ``word + "/" + subtitle`` is loaded and parsed.  Each
    entry of ``seqs`` is a path of section titles (compared
    case-insensitively) leading to the wanted section; the entries are
    tried in order and the first one that is found wins.  This is used
    for finding translations and other sections on subpages.

    Args:
        title: Page title, used only in the debug message.
        subtitle: Name of the subpage (the part after "/").
        seqs: Alternative title paths to look for.

    Returns:
        The node reached by the first matching path, or None when the
        subpage does not exist or no path matches.
    """
    assert isinstance(language, str)
    assert isinstance(title, str)
    assert isinstance(subtitle, str)
    assert isinstance(seqs, (list, tuple))
    for seq in seqs:
        for x in seq:
            assert isinstance(x, str)
    subpage_title = word + "/" + subtitle
    subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
    if subpage_content is None:
        wxr.wtp.error(
            "/translations not found despite "
            "{{see translation subpage|...}}",
            sortid="page/1934",
        )
        return None

    def recurse(
        node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
    ) -> Optional[Union[str, WikiNode]]:
        """Depth-first search for the title path ``seq`` below ``node``."""
        # print(f"seq: {seq}")
        if not seq:
            return node
        if not isinstance(node, WikiNode):
            return None
        # print(f"node.kind: {node.kind}")
        if node.kind in LEVEL_KINDS:
            t = clean_node(wxr, None, node.largs[0])
            # print(f"t: {t} == seq[0]: {seq[0]}?")
            if t.lower() == seq[0].lower():
                # This heading matches the next wanted title; keep
                # looking for the rest of the path among its children.
                seq = seq[1:]
                if not seq:
                    return node
        for n in node.children:
            ret = recurse(n, seq)
            if ret is not None:
                return ret
        return None

    tree = wxr.wtp.parse(
        subpage_content,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    assert tree.kind == NodeKind.ROOT
    for seq in seqs:
        ret = recurse(tree, seq)
        if ret is not None:
            return ret
        # This alternative was not found; report it and try the next.
        wxr.wtp.debug(
            "Failed to find subpage section {}/{} seq {}".format(
                title, subtitle, seq
            ),
            sortid="page/1963",
        )
    return None
2741 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2742 """Parses translations for a word. This may also pull in translations
2743 from separate translation subpages."""
2744 assert isinstance(data, dict)
2745 assert isinstance(xlatnode, WikiNode)
2746 # print("===== PARSE_TRANSLATIONS {} {} {}"
2747 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2748 # print("parse_translations xlatnode={}".format(xlatnode))
2749 if not wxr.config.capture_translations: 2749 ↛ 2750line 2749 didn't jump to line 2750 because the condition on line 2749 was never true
2750 return
2751 sense_parts: list[Union[WikiNode, str]] = []
2752 sense: Optional[str] = None
2754 def parse_translation_item(
2755 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2756 ) -> None:
2757 nonlocal sense
2758 assert isinstance(contents, list)
2759 assert lang is None or isinstance(lang, str)
2760 # print("PARSE_TRANSLATION_ITEM:", contents)
2762 langcode: Optional[str] = None
2763 if sense is None:
2764 sense = clean_node(wxr, data, sense_parts).strip()
2765 # print("sense <- clean_node: ", sense)
2766 idx = sense.find("See also translations at")
2767 if idx > 0: 2767 ↛ 2768line 2767 didn't jump to line 2768 because the condition on line 2767 was never true
2768 wxr.wtp.debug(
2769 "Skipping translation see also: {}".format(sense),
2770 sortid="page/2361",
2771 )
2772 sense = sense[:idx].strip()
2773 if sense.endswith(":"): 2773 ↛ 2774line 2773 didn't jump to line 2774 because the condition on line 2773 was never true
2774 sense = sense[:-1].strip()
2775 if sense.endswith("—"): 2775 ↛ 2776line 2775 didn't jump to line 2776 because the condition on line 2775 was never true
2776 sense = sense[:-1].strip()
2777 translations_from_template: list[str] = []
2779 def translation_item_template_fn(
2780 name: str, ht: TemplateArgs
2781 ) -> Optional[str]:
2782 nonlocal langcode
2783 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2784 if is_panel_template(wxr, name):
2785 return ""
2786 if name in ("t+check", "t-check", "t-needed"):
2787 # We ignore these templates. They seem to have outright
2788 # garbage in some entries, and very varying formatting in
2789 # others. These should be transitory and unreliable
2790 # anyway.
2791 return "__IGNORE__"
2792 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2793 code = ht.get(1)
2794 if code: 2794 ↛ 2804line 2794 didn't jump to line 2804 because the condition on line 2794 was always true
2795 if langcode and code != langcode:
2796 wxr.wtp.debug(
2797 "inconsistent language codes {} vs "
2798 "{} in translation item: {!r} {}".format(
2799 langcode, code, name, ht
2800 ),
2801 sortid="page/2386",
2802 )
2803 langcode = code
2804 tr = ht.get(2)
2805 if tr:
2806 tr = clean_node(wxr, None, [tr])
2807 translations_from_template.append(tr)
2808 return None
2809 if name == "t-egy":
2810 langcode = "egy"
2811 return None
2812 if name == "ttbc":
2813 code = ht.get(1)
2814 if code: 2814 ↛ 2816line 2814 didn't jump to line 2816 because the condition on line 2814 was always true
2815 langcode = code
2816 return None
2817 if name == "trans-see": 2817 ↛ 2818line 2817 didn't jump to line 2818 because the condition on line 2817 was never true
2818 wxr.wtp.error(
2819 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2820 )
2821 return ""
2822 if name.endswith("-top"): 2822 ↛ 2823line 2822 didn't jump to line 2823 because the condition on line 2822 was never true
2823 return ""
2824 if name.endswith("-bottom"): 2824 ↛ 2825line 2824 didn't jump to line 2825 because the condition on line 2824 was never true
2825 return ""
2826 if name.endswith("-mid"): 2826 ↛ 2827line 2826 didn't jump to line 2827 because the condition on line 2826 was never true
2827 return ""
2828 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2829 # .format(name),
2830 # sortid="page/2414")
2831 return None
2833 sublists = list(
2834 x
2835 for x in contents
2836 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2837 )
2838 contents = list(
2839 x
2840 for x in contents
2841 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2842 )
2844 item = clean_node(
2845 wxr, data, contents, template_fn=translation_item_template_fn
2846 )
2847 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2849 # Parse the translation item.
2850 if item: 2850 ↛ exitline 2850 didn't return from function 'parse_translation_item' because the condition on line 2850 was always true
2851 lang = parse_translation_item_text(
2852 wxr,
2853 word,
2854 data,
2855 item,
2856 sense,
2857 lang,
2858 langcode,
2859 translations_from_template,
2860 is_reconstruction,
2861 )
2863 # Handle sublists. They are frequently used for different
2864 # scripts for the language and different variants of the
2865 # language. We will include the lower-level header as a
2866 # tag in those cases.
2867 for listnode in sublists:
2868 assert listnode.kind == NodeKind.LIST
2869 for node in listnode.children:
2870 if not isinstance(node, WikiNode): 2870 ↛ 2871line 2870 didn't jump to line 2871 because the condition on line 2870 was never true
2871 continue
2872 if node.kind == NodeKind.LIST_ITEM: 2872 ↛ 2869line 2872 didn't jump to line 2869 because the condition on line 2872 was always true
2873 parse_translation_item(node.children, lang=lang)
2875 def parse_translation_template(node: WikiNode) -> None:
2876 assert isinstance(node, WikiNode)
2878 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2879 nonlocal sense_parts
2880 nonlocal sense
2881 if is_panel_template(wxr, name):
2882 return ""
2883 if name == "see also":
2884 # XXX capture
2885 # XXX for example, "/" has top-level list containing
2886 # see also items. So also should parse those.
2887 return ""
2888 if name == "trans-see":
2889 # XXX capture
2890 return ""
2891 if name == "see translation subpage": 2891 ↛ 2892line 2891 didn't jump to line 2892 because the condition on line 2891 was never true
2892 sense_parts = []
2893 sense = None
2894 sub = ht.get(1, "")
2895 if sub:
2896 m = re.match(
2897 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2898 )
2899 else:
2900 m = None
2901 etym = ""
2902 etym_numbered = ""
2903 pos = ""
2904 if m:
2905 etym_numbered = m.group(1)
2906 etym = m.group(2)
2907 pos = m.group(3)
2908 if not sub:
2909 wxr.wtp.debug(
2910 "no part-of-speech in "
2911 "{{see translation subpage|...}}, "
2912 "defaulting to just wxr.wtp.section "
2913 "(= language)",
2914 sortid="page/2468",
2915 )
2916 # seq sent to get_subpage_section without sub and pos
2917 seq = [
2918 language,
2919 TRANSLATIONS_TITLE,
2920 ]
2921 elif (
2922 m
2923 and etym.lower().strip() in ETYMOLOGY_TITLES
2924 and pos.lower() in POS_TITLES
2925 ):
2926 seq = [
2927 language,
2928 etym_numbered,
2929 pos,
2930 TRANSLATIONS_TITLE,
2931 ]
2932 elif sub.lower() in POS_TITLES:
2933 # seq with sub but not pos
2934 seq = [
2935 language,
2936 sub,
2937 TRANSLATIONS_TITLE,
2938 ]
2939 else:
2940 # seq with sub and pos
2941 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2942 if pos.lower() not in POS_TITLES:
2943 wxr.wtp.debug(
2944 "unhandled see translation subpage: "
2945 "language={} sub={} "
2946 "wxr.wtp.subsection={}".format(
2947 language, sub, wxr.wtp.subsection
2948 ),
2949 sortid="page/2478",
2950 )
2951 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2952 subnode = get_subpage_section(
2953 wxr.wtp.title or "MISSING_TITLE",
2954 TRANSLATIONS_TITLE,
2955 [seq],
2956 )
2957 if subnode is None or not isinstance(subnode, WikiNode):
2958 # Failed to find the normal subpage section
2959 # seq with sub and pos
2960 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2961 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2962 seqs: list[list[str] | tuple[str, ...]] = [
2963 [TRANSLATIONS_TITLE],
2964 [language, pos],
2965 ]
2966 subnode = get_subpage_section(
2967 wxr.wtp.title or "MISSING_TITLE",
2968 TRANSLATIONS_TITLE,
2969 seqs,
2970 )
2971 if subnode is not None and isinstance(subnode, WikiNode):
2972 parse_translations(data, subnode)
2973 return ""
2974 if name in (
2975 "c",
2976 "C",
2977 "categorize",
2978 "cat",
2979 "catlangname",
2980 "topics",
2981 "top",
2982 "qualifier",
2983 "cln",
2984 ):
2985 # These are expanded in the default way
2986 return None
2987 if name in (
2988 "trans-top",
2989 "trans-top-see",
2990 ):
2991 # XXX capture id from trans-top? Capture sense here
2992 # instead of trying to parse it from expanded content?
2993 if ht.get(1):
2994 sense_parts = []
2995 sense = ht.get(1)
2996 else:
2997 sense_parts = []
2998 sense = None
2999 return None
3000 if name in (
3001 "trans-bottom",
3002 "trans-mid",
3003 "checktrans-mid",
3004 "checktrans-bottom",
3005 ):
3006 return None
3007 if name == "checktrans-top":
3008 sense_parts = []
3009 sense = None
3010 return ""
3011 if name == "trans-top-also":
3012 # XXX capture?
3013 sense_parts = []
3014 sense = None
3015 return ""
3016 wxr.wtp.error(
3017 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3018 name, ht
3019 ),
3020 sortid="page/2517",
3021 )
3022 return ""
3024 wxr.wtp.expand(
3025 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3026 )
3028 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3029 nonlocal sense
3030 nonlocal sense_parts
3031 for node in xlatnode.children:
3032 # print(node)
3033 if isinstance(node, str):
3034 if sense:
3035 if not node.isspace():
3036 wxr.wtp.debug(
3037 "skipping string in the middle of "
3038 "translations: {}".format(node),
3039 sortid="page/2530",
3040 )
3041 continue
3042 # Add a part to the sense
3043 sense_parts.append(node)
3044 sense = None
3045 continue
3046 assert isinstance(node, WikiNode)
3047 kind = node.kind
3048 if kind == NodeKind.LIST:
3049 for item in node.children:
3050 if not isinstance(item, WikiNode): 3050 ↛ 3051line 3050 didn't jump to line 3051 because the condition on line 3050 was never true
3051 continue
3052 if item.kind != NodeKind.LIST_ITEM: 3052 ↛ 3053line 3052 didn't jump to line 3053 because the condition on line 3052 was never true
3053 continue
3054 if item.sarg == ":": 3054 ↛ 3055line 3054 didn't jump to line 3055 because the condition on line 3054 was never true
3055 continue
3056 parse_translation_item(item.children)
3057 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3057 ↛ 3061line 3057 didn't jump to line 3061 because the condition on line 3057 was never true
3058 # Silently skip list items that are just indented; these
3059 # are used for text between translations, such as indicating
3060 # translations that need to be checked.
3061 pass
3062 elif kind == NodeKind.TEMPLATE:
3063 parse_translation_template(node)
3064 elif kind in ( 3064 ↛ 3069line 3064 didn't jump to line 3069 because the condition on line 3064 was never true
3065 NodeKind.TABLE,
3066 NodeKind.TABLE_ROW,
3067 NodeKind.TABLE_CELL,
3068 ):
3069 parse_translation_recurse(node)
3070 elif kind == NodeKind.HTML:
3071 if node.attrs.get("class") == "NavFrame": 3071 ↛ 3077line 3071 didn't jump to line 3077 because the condition on line 3071 was never true
3072 # Reset ``sense_parts`` (and force recomputing
3073 # by clearing ``sense``) as each NavFrame specifies
3074 # its own sense. This helps eliminate garbage coming
3075 # from text at the beginning at the translations
3076 # section.
3077 sense_parts = []
3078 sense = None
3079 # for item in node.children:
3080 # if not isinstance(item, WikiNode):
3081 # continue
3082 # parse_translation_recurse(item)
3083 parse_translation_recurse(node)
3084 elif kind in LEVEL_KINDS: 3084 ↛ 3086line 3084 didn't jump to line 3086 because the condition on line 3084 was never true
3085 # Sub-levels will be recursed elsewhere
3086 pass
3087 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3088 parse_translation_recurse(node)
3089 elif kind == NodeKind.PREFORMATTED: 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true
3090 print("parse_translation_recurse: PREFORMATTED:", node)
3091 elif kind == NodeKind.LINK: 3091 ↛ 3145line 3091 didn't jump to line 3145 because the condition on line 3091 was always true
3092 arg0 = node.largs[0]
3093 # Kludge: I've seen occasional normal links to translation
3094 # subpages from main pages (e.g., language/English/Noun
3095 # in July 2021) instead of the normal
3096 # {{see translation subpage|...}} template. This should
3097 # handle them. Note: must be careful not to read other
3098 # links, particularly things like in "human being":
3099 # "a human being -- see [[man/translations]]" (group title)
3100 if ( 3100 ↛ 3108line 3100 didn't jump to line 3108 because the condition on line 3100 was never true
3101 isinstance(arg0, (list, tuple))
3102 and arg0
3103 and isinstance(arg0[0], str)
3104 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3105 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3106 == wxr.wtp.title
3107 ):
3108 wxr.wtp.debug(
3109 "translations subpage link found on main "
3110 "page instead "
3111 "of normal {{see translation subpage|...}}",
3112 sortid="page/2595",
3113 )
3114 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3115 if sub.lower() in POS_TITLES:
3116 seq = [
3117 language,
3118 sub,
3119 TRANSLATIONS_TITLE,
3120 ]
3121 subnode = get_subpage_section(
3122 wxr.wtp.title,
3123 TRANSLATIONS_TITLE,
3124 [seq],
3125 )
3126 if subnode is not None and isinstance(
3127 subnode, WikiNode
3128 ):
3129 parse_translations(data, subnode)
3130 else:
3131 wxr.wtp.error(
3132 "/translations link outside part-of-speech"
3133 )
3135 if (
3136 len(arg0) >= 1
3137 and isinstance(arg0[0], str)
3138 and not arg0[0].lower().startswith("category:")
3139 ):
3140 for x in node.largs[-1]:
3141 if isinstance(x, str): 3141 ↛ 3144line 3141 didn't jump to line 3144 because the condition on line 3141 was always true
3142 sense_parts.append(x)
3143 else:
3144 parse_translation_recurse(x)
3145 elif not sense:
3146 sense_parts.append(node)
3147 else:
3148 wxr.wtp.debug(
3149 "skipping text between translation items/senses: "
3150 "{}".format(node),
3151 sortid="page/2621",
3152 )
3154 # Main code of parse_translation(). We want ``sense`` to be assigned
3155 # regardless of recursion levels, and thus the code is structured
3156 # to define at this level and recurse in parse_translation_recurse().
3157 parse_translation_recurse(xlatnode)
def parse_etymology(data: WordData, node: LevelNode) -> None:
    """Extract the etymology text, templates and examples of a section.

    Writes ``etymology_text`` and ``etymology_templates`` into ``data``
    when they are non-empty, and appends the examples produced from
    ``zh-x`` / ``zh-q`` templates to ``etymology_examples``.  Nested
    sub-sections of ``node`` are not part of the text.
    """
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)

    captured: list[TemplateData] = []

    # Nesting depth of ignored templates; nothing is captured while we
    # are inside a template that we do not want to record.
    ignore_count = 0

    def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        """Pre-expansion hook: drop panels and zh-x/zh-q, and note when
        an ignored template is entered."""
        nonlocal ignore_count
        if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
            return ""
        if re.match(ignored_etymology_templates_re, name):
            ignore_count += 1
        return None

        # CONTINUE_HERE

    def etym_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> None:
        """Post-expansion hook: record the template unless it is a
        wikipedia template, an ignored one, or nested in an ignored one."""
        nonlocal ignore_count
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
        elif re.match(ignored_etymology_templates_re, name):
            ignore_count -= 1
        elif ignore_count == 0:
            captured.append(
                {
                    "name": name,
                    "args": clean_template_args(wxr, ht),
                    "expansion": clean_node(wxr, None, expansion),
                }
            )
        return None

    # Remove any subsections
    top_level = [
        child
        for child in node.children
        if not (isinstance(child, WikiNode) and child.kind in LEVEL_KINDS)
    ]
    # Convert to text, also capturing templates using post_template_fn
    raw_text = clean_node(
        wxr,
        None,
        top_level,
        template_fn=etym_template_fn,
        post_template_fn=etym_post_template_fn,
    )
    # remove ":" indent wikitext before zh-x template
    text = raw_text.strip(": \n")

    # Save the collected information.
    if text:
        data["etymology_text"] = text
    if captured:
        # Some etymology templates, like Template:root do not generate
        # text, so they should be added here.  Elsewhere, we check
        # for Template:root and add some text to the expansion to please
        # the validation.
        data["etymology_templates"] = captured

    # Collect Chinese example templates, stopping at the first level
    # node encountered.
    wanted = LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
    for child_node in node.find_child_recursively(wanted):
        if child_node.kind in LEVEL_KIND_FLAGS:
            break
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in ["zh-x", "zh-q"]:
            data.setdefault("etymology_examples", []).extend(
                extract_template_zh_x(
                    wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
                )
            )
3239 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3240 """This recurses into a subtree in the parse tree for a page."""
3241 nonlocal etym_data
3242 nonlocal pos_data
3243 nonlocal inside_level_four
3245 redirect_list: list[str] = [] # for `zh-see` template
3247 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3248 """This is called for otherwise unprocessed parts of the page.
3249 We still expand them so that e.g. Category links get captured."""
3250 if name in wikipedia_templates:
3251 data = select_data()
3252 parse_wikipedia_template(wxr, data, ht)
3253 return None
3254 if is_panel_template(wxr, name):
3255 return ""
3256 return None
3258 for node in treenode.children:
3259 if not isinstance(node, WikiNode):
3260 # print(" X{}".format(repr(node)[:40]))
3261 continue
3262 if isinstance(node, TemplateNode):
3263 if process_soft_redirect_template(wxr, node, redirect_list):
3264 continue
3265 elif node.template_name == "zh-forms":
3266 extract_zh_forms_template(wxr, node, select_data())
3267 elif (
3268 node.template_name.endswith("-kanjitab")
3269 or node.template_name == "ja-kt"
3270 ):
3271 extract_ja_kanjitab_template(wxr, node, select_data())
3273 if not isinstance(node, LevelNode):
3274 # XXX handle e.g. wikipedia links at the top of a language
3275 # XXX should at least capture "also" at top of page
3276 if node.kind in (
3277 NodeKind.HLINE,
3278 NodeKind.LIST,
3279 NodeKind.LIST_ITEM,
3280 ):
3281 continue
3282 # print(" UNEXPECTED: {}".format(node))
3283 # Clean the node to collect category links
3284 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3285 continue
3286 t = clean_node(
3287 wxr, etym_data, node.sarg if node.sarg else node.largs
3288 )
3289 t = t.lower()
3290 # XXX these counts were never implemented fully, and even this
3291 # gets discarded: Search STATISTICS_IMPLEMENTATION
3292 wxr.config.section_counts[t] += 1
3293 # print("PROCESS_CHILDREN: T:", repr(t))
3294 if t in IGNORED_TITLES:
3295 pass
3296 elif t.startswith(PRONUNCIATION_TITLE):
3297 # Chinese Pronunciation section kludge; we demote these to
3298 # be level 4 instead of 3 so that they're part of a larger
3299 # etymology hierarchy; usually the data here is empty and
3300 # acts as an inbetween between POS and Etymology data
3301 if lang_code in ("zh",):
3302 inside_level_four = True
3303 if t.startswith(PRONUNCIATION_TITLE + " "):
3304 # Pronunciation 1, etc, are used in Chinese Glyphs,
3305 # and each of them may have senses under Definition
3306 push_level_four_section(True)
3307 wxr.wtp.start_subsection(None)
3308 if wxr.config.capture_pronunciation: 3308 ↛ 3416line 3308 didn't jump to line 3416 because the condition on line 3308 was always true
3309 data = select_data()
3310 parse_pronunciation(
3311 wxr,
3312 node,
3313 data,
3314 etym_data,
3315 have_etym,
3316 base_data,
3317 lang_code,
3318 )
3319 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3320 push_etym()
3321 wxr.wtp.start_subsection(None)
3322 if wxr.config.capture_etymologies: 3322 ↛ 3416line 3322 didn't jump to line 3416 because the condition on line 3322 was always true
3323 m = re.search(r"\s(\d+(\.\d+)?)$", t)
3324 if m:
3325 etym_data["etymology_number"] = m.group(1)
3326 parse_etymology(etym_data, node)
3327 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3328 data = select_data()
3329 extract_descendant_section(wxr, data, node, False)
3330 elif (
3331 t in PROTO_ROOT_DERIVED_TITLES
3332 and pos == "root"
3333 and is_reconstruction
3334 and wxr.config.capture_descendants
3335 ):
3336 data = select_data()
3337 extract_descendant_section(wxr, data, node, True)
3338 elif t == TRANSLATIONS_TITLE:
3339 data = select_data()
3340 parse_translations(data, node)
3341 elif t in INFLECTION_TITLES:
3342 parse_inflection(node, t, pos)
3343 elif t == "alternative forms":
3344 extract_alt_form_section(wxr, select_data(), node)
3345 else:
3346 lst = t.split()
3347 while len(lst) > 1 and lst[-1].isdigit(): 3347 ↛ 3348line 3347 didn't jump to line 3348 because the condition on line 3347 was never true
3348 lst = lst[:-1]
3349 t_no_number = " ".join(lst).lower()
3350 if t_no_number in POS_TITLES:
3351 push_pos()
3352 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3353 pos = dt["pos"] or "MISSING_POS"
3354 wxr.wtp.start_subsection(t)
3355 if "debug" in dt:
3356 wxr.wtp.debug(
3357 "{} in section {}".format(dt["debug"], t),
3358 sortid="page/2755",
3359 )
3360 if "warning" in dt: 3360 ↛ 3361line 3360 didn't jump to line 3361 because the condition on line 3360 was never true
3361 wxr.wtp.wiki_notice(
3362 "{} in section {}".format(dt["warning"], t),
3363 sortid="page/2759",
3364 )
3365 if "error" in dt: 3365 ↛ 3366line 3365 didn't jump to line 3366 because the condition on line 3365 was never true
3366 wxr.wtp.error(
3367 "{} in section {}".format(dt["error"], t),
3368 sortid="page/2763",
3369 )
3370 if "note" in dt: 3370 ↛ 3371line 3370 didn't jump to line 3371 because the condition on line 3370 was never true
3371 wxr.wtp.note(
3372 "{} in section {}".format(dt["note"], t),
3373 sortid="page/20251017a",
3374 )
3375 if "wiki_notice" in dt: 3375 ↛ 3376line 3375 didn't jump to line 3376 because the condition on line 3375 was never true
3376 wxr.wtp.wiki_notice(
3377 "{} in section {}".format(dt["wiki_notices"], t),
3378 sortid="page/20251017b",
3379 )
3380 # Parse word senses for the part-of-speech
3381 parse_part_of_speech(node, pos)
3382 if "tags" in dt:
3383 for pdata in sense_datas:
3384 data_extend(pdata, "tags", dt["tags"])
3385 elif t_no_number in LINKAGE_TITLES:
3386 # print(f"LINKAGE_TITLES NODE {node=}")
3387 rel = LINKAGE_TITLES[t_no_number]
3388 data = select_data()
3389 parse_linkage(
3390 wxr,
3391 data,
3392 rel,
3393 node,
3394 word,
3395 sense_datas,
3396 is_reconstruction,
3397 )
3398 elif t_no_number == COMPOUNDS_TITLE:
3399 data = select_data()
3400 if wxr.config.capture_compounds: 3400 ↛ 3416line 3400 didn't jump to line 3416 because the condition on line 3400 was always true
3401 parse_linkage(
3402 wxr,
3403 data,
3404 "derived",
3405 node,
3406 word,
3407 sense_datas,
3408 is_reconstruction,
3409 )
3411 # XXX parse interesting templates also from other sections. E.g.,
3412 # {{Letter|...}} in ===See also===
3413 # Also <gallery>
3415 # Recurse to children of this node, processing subtitles therein
3416 stack.append(t)
3417 process_children(node, pos)
3418 stack.pop()
3420 if len(redirect_list) > 0:
3421 if len(pos_data) > 0:
3422 pos_data["redirects"] = redirect_list
3423 if "pos" not in pos_data: 3423 ↛ 3424line 3423 didn't jump to line 3424 because the condition on line 3423 was never true
3424 pos_data["pos"] = "soft-redirect"
3425 else:
3426 new_page_data = copy.deepcopy(base_data)
3427 new_page_data["redirects"] = redirect_list
3428 if "pos" not in new_page_data: 3428 ↛ 3430line 3428 didn't jump to line 3430 because the condition on line 3428 was always true
3429 new_page_data["pos"] = "soft-redirect"
3430 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3431 page_datas.append(new_page_data)
3433 def extract_examples(
3434 others: list[WikiNode], sense_base: SenseData
3435 ) -> list[ExampleData]:
3436 """Parses through a list of definitions and quotes to find examples.
3437 Returns a list of example dicts to be added to sense data. Adds
3438 meta-data, mostly categories, into sense_base."""
3439 assert isinstance(others, list)
3440 examples: list[ExampleData] = []
3442 for sub in others:
3443 if not sub.sarg.endswith((":", "*")): 3443 ↛ 3444line 3443 didn't jump to line 3444 because the condition on line 3443 was never true
3444 continue
3445 for item in sub.children:
3446 if not isinstance(item, WikiNode): 3446 ↛ 3447line 3446 didn't jump to line 3447 because the condition on line 3446 was never true
3447 continue
3448 if item.kind != NodeKind.LIST_ITEM: 3448 ↛ 3449line 3448 didn't jump to line 3449 because the condition on line 3448 was never true
3449 continue
3450 usex_type = None
3451 example_template_args = []
3452 example_template_names = []
3453 taxons = set()
3455 # Bypass this function when parsing Chinese, Japanese and
3456 # quotation templates.
3457 new_example_lists = extract_example_list_item(
3458 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3459 )
3460 if len(new_example_lists) > 0:
3461 examples.extend(new_example_lists)
3462 continue
3464 def usex_template_fn(
3465 name: str, ht: TemplateArgs
3466 ) -> Optional[str]:
3467 nonlocal usex_type
3468 if is_panel_template(wxr, name):
3469 return ""
3470 if name in usex_templates:
3471 usex_type = "example"
3472 example_template_args.append(ht)
3473 example_template_names.append(name)
3474 elif name in quotation_templates:
3475 usex_type = "quotation"
3476 elif name in taxonomy_templates: 3476 ↛ 3477line 3476 didn't jump to line 3477 because the condition on line 3476 was never true
3477 taxons.update(ht.get(1, "").split())
3478 for prefix in template_linkages_to_ignore_in_examples:
3479 if re.search(
3480 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3481 ):
3482 return ""
3483 return None
3485 # bookmark
3486 ruby: list[tuple[str, str]] = []
3487 contents = item.children
3488 if lang_code == "ja":
3489 # Capture ruby contents if this is a Japanese language
3490 # example.
3491 # print(contents)
3492 if ( 3492 ↛ 3497line 3492 didn't jump to line 3497 because the condition on line 3492 was never true
3493 contents
3494 and isinstance(contents, str)
3495 and re.match(r"\s*$", contents[0])
3496 ):
3497 contents = contents[1:]
3498 exp = wxr.wtp.parse(
3499 wxr.wtp.node_to_wikitext(contents),
3500 # post_template_fn=head_post_template_fn,
3501 expand_all=True,
3502 )
3503 rub, rest = extract_ruby(wxr, exp.children)
3504 if rub:
3505 for rtup in rub:
3506 ruby.append(rtup)
3507 contents = rest
3508 subtext = clean_node(
3509 wxr, sense_base, contents, template_fn=usex_template_fn
3510 )
3512 frozen_taxons = frozenset(taxons)
3513 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3515 # print(f"{subtext=}")
3516 subtext = re.sub(
3517 r"\s*\(please add an English "
3518 r"translation of this "
3519 r"(example|usage example|quote)\)",
3520 "",
3521 subtext,
3522 ).strip()
3523 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3524 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3525 # print("subtext:", repr(subtext))
3527 lines = subtext.splitlines()
3528 # print(lines)
3530 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3531 lines = list(
3532 x
3533 for x in lines
3534 if not re.match(
3535 r"(Synonyms: |Antonyms: |Hyponyms: |"
3536 r"Synonym: |Antonym: |Hyponym: |"
3537 r"Hypernyms: |Derived terms: |"
3538 r"Related terms: |"
3539 r"Hypernym: |Derived term: |"
3540 r"Coordinate terms:|"
3541 r"Related term: |"
3542 r"For more quotations using )",
3543 x,
3544 )
3545 )
3546 tr = ""
3547 ref = ""
3548 roman = ""
3549 # for line in lines:
3550 # print("LINE:", repr(line))
3551 # print(classify_desc(line))
3552 if len(lines) == 1 and lang_code != "en":
3553 parts = example_splitter_re.split(lines[0])
3554 if ( 3554 ↛ 3562line 3554 didn't jump to line 3562 because the condition on line 3554 was never true
3555 len(parts) > 2
3556 and len(example_template_args) == 1
3557 and any(
3558 ("―" in s) or ("—" in s)
3559 for s in example_template_args[0].values()
3560 )
3561 ):
3562 if nparts := synch_splits_with_args(
3563 lines[0], example_template_args[0]
3564 ):
3565 parts = nparts
3566 if ( 3566 ↛ 3571line 3566 didn't jump to line 3571 because the condition on line 3566 was never true
3567 len(example_template_args) == 1
3568 and "lit" in example_template_args[0]
3569 ):
3570 # ugly brute-force kludge in case there's a lit= arg
3571 literally = example_template_args[0].get("lit", "")
3572 if literally:
3573 literally = (
3574 " (literally, “"
3575 + clean_value(wxr, literally)
3576 + "”)"
3577 )
3578 else:
3579 literally = ""
3580 if ( 3580 ↛ 3619line 3580 didn't jump to line 3619 because the condition on line 3580 was never true
3581 len(example_template_args) == 1
3582 and len(parts) == 2
3583 and len(example_template_args[0])
3584 - (
3585 # horrible kludge to ignore these arguments
3586 # when calculating how many there are
3587 sum(
3588 s in example_template_args[0]
3589 for s in (
3590 "lit", # generates text, but we handle it
3591 "inline",
3592 "noenum",
3593 "nocat",
3594 "sort",
3595 )
3596 )
3597 )
3598 == 3
3599 and clean_value(
3600 wxr, example_template_args[0].get(2, "")
3601 )
3602 == parts[0].strip()
3603 and clean_value(
3604 wxr,
3605 (
3606 example_template_args[0].get(3)
3607 or example_template_args[0].get("translation")
3608 or example_template_args[0].get("t", "")
3609 )
3610 + literally, # in case there's a lit= argument
3611 )
3612 == parts[1].strip()
3613 ):
3614 # {{exampletemplate|ex|Foo bar baz|English translation}}
3615 # is a pretty reliable 'heuristic', so we use it here
3616 # before the others. To be extra sure the template
3617 # doesn't do anything weird, we compare the arguments
3618 # and the output to each other.
3619 lines = [parts[0].strip()]
3620 tr = parts[1].strip()
3621 elif (
3622 len(parts) == 2
3623 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3624 ):
3625 # These other branches just do some simple heuristics w/
3626 # the expanded output of the template (if applicable).
3627 lines = [parts[0].strip()]
3628 tr = parts[1].strip()
3629 elif ( 3629 ↛ 3635line 3629 didn't jump to line 3635 because the condition on line 3629 was never true
3630 len(parts) == 3
3631 and classify_desc2(parts[1])
3632 in ("romanization", "english")
3633 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3634 ):
3635 lines = [parts[0].strip()]
3636 roman = parts[1].strip()
3637 tr = parts[2].strip()
3638 else:
3639 parts = re.split(r"\s+-\s+", lines[0])
3640 if ( 3640 ↛ 3644line 3640 didn't jump to line 3644 because the condition on line 3640 was never true
3641 len(parts) == 2
3642 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3643 ):
3644 lines = [parts[0].strip()]
3645 tr = parts[1].strip()
3646 elif len(lines) > 1:
3647 if any(
3648 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3649 ) and not (len(example_template_names) == 1):
3650 refs: list[str] = []
3651 for i in range(len(lines)): 3651 ↛ 3657line 3651 didn't jump to line 3657 because the loop on line 3651 didn't complete
3652 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3652 ↛ 3653line 3652 didn't jump to line 3653 because the condition on line 3652 was never true
3653 break
3654 refs.append(lines[i].strip())
3655 if re.search(r"[]\d:)]\s*$", lines[i]):
3656 break
3657 ref = " ".join(refs)
3658 lines = lines[i + 1 :]
3659 if (
3660 lang_code != "en"
3661 and len(lines) >= 2
3662 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3663 ):
3664 i = len(lines) - 1
3665 while ( 3665 ↛ 3670line 3665 didn't jump to line 3670 because the condition on line 3665 was never true
3666 i > 1
3667 and classify_desc2(lines[i - 1])
3668 in ENGLISH_TEXTS
3669 ):
3670 i -= 1
3671 tr = "\n".join(lines[i:])
3672 lines = lines[:i]
3673 if len(lines) >= 2:
3674 if classify_desc2(lines[-1]) == "romanization":
3675 roman = lines[-1].strip()
3676 lines = lines[:-1]
3678 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3679 ref = lines[0]
3680 lines = lines[1:]
3681 elif lang_code != "en" and len(lines) == 2:
3682 cls1 = classify_desc2(lines[0])
3683 cls2 = classify_desc2(lines[1])
3684 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3685 tr = lines[1]
3686 lines = [lines[0]]
3687 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3687 ↛ 3688line 3687 didn't jump to line 3688 because the condition on line 3687 was never true
3688 tr = lines[0]
3689 lines = [lines[1]]
3690 elif ( 3690 ↛ 3697line 3690 didn't jump to line 3697 because the condition on line 3690 was never true
3691 re.match(r"^[#*]*:+", lines[1])
3692 and classify_desc2(
3693 re.sub(r"^[#*:]+\s*", "", lines[1])
3694 )
3695 in ENGLISH_TEXTS
3696 ):
3697 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3698 lines = [lines[0]]
3699 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3700 # Both were classified as English, but
3701 # presumably one is not. Assume first is
3702 # non-English, as that seems more common.
3703 tr = lines[1]
3704 lines = [lines[0]]
3705 elif (
3706 usex_type != "quotation"
3707 and lang_code != "en"
3708 and len(lines) == 3
3709 ):
3710 cls1 = classify_desc2(lines[0])
3711 cls2 = classify_desc2(lines[1])
3712 cls3 = classify_desc2(lines[2])
3713 if (
3714 cls3 == "english"
3715 and cls2 in ("english", "romanization")
3716 and cls1 != "english"
3717 ):
3718 tr = lines[2].strip()
3719 roman = lines[1].strip()
3720 lines = [lines[0].strip()]
3721 elif ( 3721 ↛ 3729line 3721 didn't jump to line 3729 because the condition on line 3721 was never true
3722 usex_type == "quotation"
3723 and lang_code != "en"
3724 and len(lines) > 2
3725 ):
3726 # for x in lines:
3727 # print(" LINE: {}: {}"
3728 # .format(classify_desc2(x), x))
3729 if re.match(r"^[#*]*:+\s*$", lines[1]):
3730 ref = lines[0]
3731 lines = lines[2:]
3732 cls1 = classify_desc2(lines[-1])
3733 if cls1 == "english":
3734 i = len(lines) - 1
3735 while (
3736 i > 1
3737 and classify_desc2(lines[i - 1])
3738 == ENGLISH_TEXTS
3739 ):
3740 i -= 1
3741 tr = "\n".join(lines[i:])
3742 lines = lines[:i]
3744 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3745 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3746 tr = re.sub(r"^[#*:]+\s*", "", tr)
3747 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3748 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3749 ref = re.sub(r"^[#*:]+\s*", "", ref)
3750 ref = re.sub(
3751 r", (volume |number |page )?“?"
3752 r"\(please specify ([^)]|\(s\))*\)”?|"
3753 ", text here$",
3754 "",
3755 ref,
3756 )
3757 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3758 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3759 subtext = "\n".join(x for x in lines if x)
3760 if not tr and lang_code != "en":
3761 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3762 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3762 ↛ 3763line 3762 didn't jump to line 3763 because the condition on line 3762 was never true
3763 tr = m.group(2)
3764 subtext = subtext[: m.start()] + m.group(1)
3765 elif lines:
3766 parts = re.split(r"\s*[―—]+\s*", lines[0])
3767 if ( 3767 ↛ 3771line 3767 didn't jump to line 3771 because the condition on line 3767 was never true
3768 len(parts) == 2
3769 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3770 ):
3771 subtext = parts[0].strip()
3772 tr = parts[1].strip()
3773 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3774 subtext = re.sub(
3775 r"(please add an English translation of "
3776 r"this (quote|usage example))",
3777 "",
3778 subtext,
3779 )
3780 subtext = re.sub(
3781 r"\s*→New International Version " "translation$",
3782 "",
3783 subtext,
3784 ) # e.g. pis/Tok Pisin (Bible)
3785 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3786 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3787 note = None
3788 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3789 if ( 3789 ↛ 3797line 3789 didn't jump to line 3797 because the condition on line 3789 was never true
3790 m is not None
3791 and lang_code != "en"
3792 and (
3793 m.group(1).startswith("with ")
3794 or classify_desc2(m.group(1)) == "english"
3795 )
3796 ):
3797 note = m.group(1)
3798 subtext = subtext[m.end() :]
3799 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3800 ref = re.sub(r",\s*→ISBN", "", ref)
3801 ref = ref.strip()
3802 if ref.endswith(":") or ref.endswith(","):
3803 ref = ref[:-1].strip()
3804 ref = re.sub(r"\s+,\s+", ", ", ref)
3805 ref = re.sub(r"\s+", " ", ref)
3806 if ref and not subtext: 3806 ↛ 3807line 3806 didn't jump to line 3807 because the condition on line 3806 was never true
3807 subtext = ref
3808 ref = ""
3809 if subtext:
3810 dt: ExampleData = {"text": subtext}
3811 if ref:
3812 dt["ref"] = ref
3813 if tr:
3814 dt["english"] = tr # DEPRECATED for "translation"
3815 dt["translation"] = tr
3816 if usex_type:
3817 dt["type"] = usex_type
3818 if note: 3818 ↛ 3819line 3818 didn't jump to line 3819 because the condition on line 3818 was never true
3819 dt["note"] = note
3820 if roman:
3821 dt["roman"] = roman
3822 if ruby:
3823 dt["ruby"] = ruby
3824 examples.append(dt)
3826 return examples
3828 # Main code of parse_language()
3829 # Process the section
3830 stack.append(language)
3831 process_children(langnode, None)
3832 stack.pop()
3834 # Finalize word entires
3835 push_etym()
3836 ret = []
3837 for data in page_datas:
3838 merge_base(data, base_data)
3839 ret.append(data)
3841 # Copy all tags to word senses
3842 for data in ret:
3843 if "senses" not in data: 3843 ↛ 3844line 3843 didn't jump to line 3844 because the condition on line 3843 was never true
3844 continue
3845 # WordData should not have a 'tags' field, but if it does, it's
3846 # deleted and its contents removed and placed in each sense;
3847 # that's why the type ignores.
3848 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3849 if "tags" in data:
3850 del data["tags"] # type: ignore[typeddict-item]
3851 for sense in data["senses"]:
3852 data_extend(sense, "tags", tags)
3854 return ret
def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Record the page referenced by a {{wikipedia|...}}-style template.

    The page name comes from template argument 1; when that cleans to an
    empty string the current page title is used, and if that is missing
    too, the placeholder "MISSING_PAGE_TITLE".  A non-empty ``lang``
    argument is prefixed to the page name, separated by a colon.  The
    resulting string is handed to data_append() under the "wikipedia" key.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)
    # "lang" is cleaned before argument 1 so that clean_node() is invoked
    # in the same order as it always has been.
    lang_prefix = clean_node(wxr, data, ht.get("lang", ()))
    target = clean_node(wxr, data, ht.get(1, ()))
    if not target:
        target = wxr.wtp.title or "MISSING_PAGE_TITLE"
    entry = f"{lang_prefix}:{target}" if lang_prefix else target
    data_append(data, "wikipedia", entry)
def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Handle a template found at the top of a page, above the first
    language subtitle.

    The node is run through clean_node() with a template callback that
    captures what is currently supported (Wikipedia links and Wikidata
    identifiers, stored into ``data``) and replaces everything else with
    an empty string.  The cleaned text itself is discarded.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Templates that are recognised but whose content is not captured.
    #   "see also", "cardinalbox", "character info": XXX capture
    #   "commonscat": XXX capture link to Wikimedia commons
    #   "wrongtitle": XXX this should be captured to replace page title
    #                 with the correct title. E.g. ⿰亻革家
    silently_dropped = (
        "reconstruction",
        "see also",
        "cardinalbox",
        "character info",
        "commonscat",
        "wrongtitle",
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            # None lets the template be processed normally afterwards.
            return None
        # is_panel_template() is deliberately evaluated first, exactly as
        # in the sequence of separate checks this replaces.
        if is_panel_template(wxr, name) or name in silently_dropped:
            return ""
        lowered = name.lower()
        if lowered == "also" or lowered.startswith("also/"):
            # XXX shows related words that might really have been the
            # intended word, capture them
            return ""
        if name == "wikidata":
            arg = clean_node(wxr, data, ht.get(1, ()))
            if arg.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", arg)
        else:
            wxr.wtp.debug(
                f"UNIMPLEMENTED top-level template: {name} {ht}",
                sortid="page/2870",
            )
        return ""

    clean_node(wxr, None, [node], template_fn=top_template_fn)
def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Pronunciation -> Part-of-Speech -> Translation/Linkage. Also merge
    Etymology sections that are next to each other.

    Every ``==Title==`` heading line of ``text`` is re-emitted with a level
    derived from its title (language 2, etymology 3, pronunciation 4,
    part-of-speech 5, everything else 6).  Wikilink brackets around a
    title are removed.  Returns the rewritten wikitext; text that contains
    no heading is returned unchanged.
    """

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    # Known lowercase PoS names are in part_of_speech_map
    # Known lowercase linkage section names are in linkage_map

    old = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
    )

    npar = 4  # Number of parentheses in above expression
    # re.split() returns [text, g1, g2, g3, g4, text, g1, ...]: each heading
    # contributes its `npar` groups followed by the text after it.
    parts = [old[0]]
    # Index in `parts` of the most recently emitted heading.  The heading
    # is not always at parts[-2]: once a section has been merged into its
    # predecessor, two bodies follow the heading.
    last_title_idx: Optional[int] = None
    level = None
    for i in range(1, len(old), npar + 1):
        left = old[i]
        right = old[i + npar - 1]
        # remove Wikilinks in title
        title = re.sub(r"^\[\[", "", old[i + 1])
        title = re.sub(r"\]\]$", "", title)
        prev_level = level  # normalized level of the previous heading
        level = len(left)
        part = old[i + npar]
        if level != len(right):
            wxr.wtp.debug(
                "subtitle has unbalanced levels: "
                "{!r} has {} on the left and {} on the right".format(
                    title, left, right
                ),
                sortid="page/2904",
            )
        lc = title.lower()
        merge_into_previous = False  # When combining etymology sections
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    "subtitle has language name {} at level {}".format(
                        title, level
                    ),
                    sortid="page/2911",
                )
            level = 2
        elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
            if level > 3:
                wxr.wtp.debug(
                    "etymology section {} at level {}".format(title, level),
                    sortid="page/2917",
                )
            level = 3
            # Two etymology (Glyph Origin + Etymology) sections
            # cheek-to-cheek: only etymology headings are given level 3,
            # so a previous level of 3 means the previous heading was one.
            merge_into_previous = (
                prev_level == 3 and last_title_idx is not None
            )
        elif lc.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lc in POS_TITLES:
            level = 5
        else:
            # Translations, linkages, compounds, inflections, descendants,
            # proto-root derived terms, ignored titles and any unknown
            # title all share the deepest level.
            level = 6

        heading = "{}{}{}".format("=" * level, title, "=" * level)
        if merge_into_previous and last_title_idx is not None:
            # Modify the title of previous ("Glyph Origin") section, in
            # case we have a meaningful title like "Etymology 1".  The
            # heading is written at the normalized level, like all others.
            parts[last_title_idx] = heading
        else:
            last_title_idx = len(parts)
            parts.append(heading)
        parts.append(part)

    return "".join(parts)
def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse the wikitext of one page and return the word entries found.

    ``word`` is the page title and ``text`` its raw wikitext.  Pages whose
    title ends in "/" + TRANSLATIONS_TITLE yield an empty list.  Each
    level-2 section is treated as a language and parsed with
    parse_language(); data gathered from templates above the first
    language section is added to every entry.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages.  They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    for tag_name in ("noinclude", "onlyinclude", "includeonly"):
        text = re.sub(rf"(?si)<(/)?{tag_name}\s*>", "", text)

    # Fix up the subtitle hierarchy.  There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun.  Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )

    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for langnode in tree.children:
        if not isinstance(langnode, WikiNode):
            continue
        node_kind = langnode.kind
        if node_kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, langnode, top_data)
            continue
        if node_kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if node_kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {langnode}", sortid="page/3014"
            )
            continue

        lang = clean_node(wxr, None, langnode.sarg or langnode.largs)
        lang_code = name_to_code(lang, "en")
        if lang_code == "":
            # Only reported; the section is still parsed.
            wxr.wtp.debug(
                f"unrecognized language name: {lang}", sortid="page/3019"
            )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang)

        # Collect all words from the page, and propagate fields resulting
        # from top-level templates to each of them.
        for data in parse_language(wxr, langnode, lang, lang_code):
            if "lang" not in data:
                wxr.wtp.debug(
                    f"internal error -- no lang in data: {data}",
                    sortid="page/3034",
                )
                continue
            for key, values in top_data.items():
                assert isinstance(values, (list, tuple))
                data_extend(data, key, values)
            by_lang[data["lang"]].append(data)

    # XXX this code is clearly out of date.  There is no longer a
    # "conjugation" field.  FIX OR REMOVE.
    # Do some post-processing on the words.  For example, we may distribute
    # conjugation information to all the words.
    ret = [entry for lang_datas in by_lang.values() for entry in lang_datas]

    for entry in ret:
        entry_word = entry["word"]
        if entry_word != word:
            if word.startswith("Unsupported titles/"):
                wxr.wtp.debug(
                    f"UNSUPPORTED TITLE: '{word}' -> '{entry_word}'",
                    sortid="20231101/3578page.py",
                )
            else:
                wxr.wtp.debug(
                    f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{entry_word}'",
                    sortid="20231101/3582page.py",
                )
            entry["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, entry)  # type:ignore[arg-type]
    return ret
def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Move tags that are not in ``valid_tags`` from "tags" to "raw_tags".

    ``data`` is modified in place.  After the call "tags" holds only the
    tags found in ``valid_tags`` (the key is removed when none remain) and
    all other tags have been appended to "raw_tags".  The same treatment is
    applied to the dicts inside every list value whose first item is a
    dict.  A non-dict ``data`` is reported through ``wxr.wtp.error`` and
    otherwise ignored.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return

    accepted: list[str] = []
    # When "raw_tags" already exists this is the very same list object,
    # so rejected tags are appended to the existing entries.
    rejected: list[str] = data.get("raw_tags", [])
    for tag in data.get("tags", ()):
        (accepted if tag in valid_tags else rejected).append(tag)

    # Only the first item decides whether a list is treated as a list of
    # dicts; a stray non-dict item later on triggers the error above.
    for value in data.values():
        if isinstance(value, list) and len(value) > 0:
            if isinstance(value[0], dict):
                for child in value:
                    recursively_separate_raw_tags(wxr, child)

    if "tags" in data:
        if accepted:
            data["tags"] = accepted
        else:
            del data["tags"]
    if rejected:
        data["raw_tags"] = rejected
def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect the target pages of a soft-redirect template.

    Returns True when ``template_node`` is one of the handled templates
    ("zh-see", "ja-see", "ja-see-kango") and False otherwise.  For a
    handled template every non-empty cleaned target title is appended to
    ``redirect_pages``.
    """
    name = template_node.template_name
    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        # Only the first positional argument names a target.
        raw_targets = [template_node.template_parameters.get(1, "")]
    elif name in ("ja-see", "ja-see-kango"):
        # https://en.wiktionary.org/wiki/Template:ja-see
        # Every positional (integer-keyed) argument names a target.
        raw_targets = [
            value
            for key, value in template_node.template_parameters.items()
            if isinstance(key, int)
        ]
    else:
        return False

    for raw_target in raw_targets:
        title = clean_node(wxr, None, raw_target)
        if title != "":
            redirect_pages.append(title)
    return True
# Maps row-header labels of the expanded {{zh-forms}} table to the tag that
# is attached to the forms found in that row.  Labels that are not listed
# here are stored as raw tags by the zh-forms extraction functions.
ZH_FORMS_TAGS = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "alternative forms": "alternative",
    "2nd round simp.": "Second-Round-Simplified-Chinese",
}
def extract_zh_forms_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract data from a {{zh-forms}} template into ``base_data``.

    The ``lit`` argument, when non-empty, is stored as "literal_meaning".
    The template is then expanded and each row of the resulting tables is
    walked: header cells are handled by extract_zh_forms_header_cell() and
    data cells by extract_zh_forms_data_cell().  Data cells are skipped
    once the row's header cell reported that it contains a span.
    """
    # https://en.wiktionary.org/wiki/Template:zh-forms
    literal = clean_node(wxr, None, t_node.template_parameters.get("lit", ""))
    if literal != "":
        base_data["literal_meaning"] = literal

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    rows = (
        row
        for table in expanded.find_child(NodeKind.TABLE)
        for row in table.find_child(NodeKind.TABLE_ROW)
    )
    for row in rows:
        # The header state is reset for every row and updated by each
        # header cell encountered in it.
        header_text = ""
        header_tags: list[str] = []
        span_in_header = False
        for cell in row.find_child(cell_kinds):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                header_text, header_tags, span_in_header = (
                    extract_zh_forms_header_cell(wxr, base_data, cell)
                )
                continue
            if not span_in_header:
                extract_zh_forms_data_cell(
                    wxr, base_data, cell, header_text, header_tags
                )

    # Do not leave an empty "forms" list behind.
    if "forms" in base_data and not base_data["forms"]:
        del base_data["forms"]
def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process one header cell of an expanded {{zh-forms}} table.

    Returns a tuple ``(row_header, row_header_tags, header_has_span)``:
    the cleaned text that precedes the first direct ``<span>`` child, that
    text split on " and " into non-empty labels, and whether the cell has
    any direct ``<span>`` child.  Word forms found in ``zh-Hant`` /
    ``zh-Hans`` spans anywhere inside the cell are added to
    ``base_data`` under "forms".
    """
    span_positions = [
        position
        for position, _ in header_cell.find_html("span", with_index=True)
    ]
    header_has_span = len(span_positions) > 0
    # Without any span the whole cell is the header text.
    first_span_index = min([len(header_cell.children), *span_positions])
    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_header_tags = [
        label
        for label in (piece.strip() for piece in row_header.split(" and "))
        if label != ""
    ]

    for span_tag in header_cell.find_html_recursively("span"):
        form_nodes = []
        sup_title = ""
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                # The superscript is not part of the form; only the title
                # of its (last) inner span is kept.
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(child)
        if span_tag.attrs.get("lang", "") not in ("zh-Hant", "zh-Hans"):
            continue
        for word in clean_node(wxr, None, form_nodes).split("/"):
            # Skip empty pieces and the page's own title.
            if word in (wxr.wtp.title, ""):
                continue
            form = {"form": word}
            for label in row_header_tags:
                if label in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[label])
                else:
                    data_append(form, "raw_tags", label)
            if sup_title != "":
                data_append(form, "raw_tags", sup_title)
            data_append(base_data, "forms", form)
    return row_header, row_header_tags, header_has_span
# The two keys under which tag lists are stored on form and linkage data.
TagLiteral = Literal["tags", "raw_tags"]
# The same keys as a tuple, so that code can iterate over them while the
# loop variable keeps the literal key type.
TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordData,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
) -> None:
    """Process one data cell of an expanded {{zh-forms}} table.

    Direct ``<span>`` children of ``cell`` are handled by kind:
    a "white-space:nowrap;" span is processed recursively as a cell of its
    own, a "font-size:80%" span is a label applied to the forms collected
    so far in this cell, and a span with a Chinese ``lang`` attribute is a
    word form.  The collected forms go to ``base_data["forms"]``, or, when
    ``row_header`` is "anagram", to ``base_data["anagrams"]`` as linkage
    items.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    is_anagram_row = row_header == "anagram"
    forms: list[FormData] = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")

        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
            continue

        if "font-size:80%" in span_style:
            label = clean_node(wxr, None, top_span_tag)
            if label == "":
                continue
            # The label qualifies every form seen so far in this cell.
            for form in forms:
                if label in ZH_PRON_TAGS:
                    tr_tag = ZH_PRON_TAGS[label]
                    if isinstance(tr_tag, list):
                        data_extend(form, "tags", tr_tag)
                    elif isinstance(tr_tag, str):
                        data_append(form, "tags", tr_tag)
                elif label in valid_tags:
                    data_append(form, "tags", label)
                else:
                    data_append(form, "raw_tags", label)
            continue

        if span_lang not in ("zh-Hant", "zh-Hans", "zh"):
            continue
        word = clean_node(wxr, None, top_span_tag)
        # Skip separators, empty spans and the page's own title.
        if word in ("", "/", wxr.wtp.title):
            continue
        form = {"form": word}
        if not is_anagram_row:
            for header_tag in row_header_tags:
                if header_tag in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[header_tag])
                else:
                    data_append(form, "raw_tags", header_tag)
        if span_lang == "zh-Hant":
            data_append(form, "tags", "Traditional-Chinese")
        elif span_lang == "zh-Hans":
            data_append(form, "tags", "Simplified-Chinese")
        forms.append(form)

    if not is_anagram_row:
        data_extend(base_data, "forms", forms)
        return

    # Anagram rows produce linkage items instead of forms.
    for form in forms:
        l_data: LinkageData = {"word": form["form"]}
        for key in TAG_LITERALS_TUPLE:
            if key in form:
                l_data[key] = form[key]
        data_append(base_data, "anagrams", l_data)
def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract alternative spellings from a {{ja-kanjitab}} template.

    The template is expanded and every table that has a header cell
    starting with "Alternative spelling" is scanned: each ``<span>`` in a
    data cell becomes a form tagged "alternative" and "kanji", and a
    following ``<small>`` element adds a tag (or raw tag) to the form just
    before it.  The forms are added to ``base_data["forms"]``.  Top-level
    links of the expansion are finally passed through clean_node() with
    ``base_data`` as the collecting dict.
    """
    # https://en.wiktionary.org/wiki/Template:ja-kanjitab
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    for table in expanded.find_child(NodeKind.TABLE):
        # All header cells are cleaned, even once a match has been found.
        header_texts = [
            clean_node(wxr, None, header_node)
            for row in table.find_child(NodeKind.TABLE_ROW)
            for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if not any(
            text.startswith("Alternative spelling") for text in header_texts
        ):
            continue

        forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                for child in cell_node.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        word = clean_node(wxr, None, child)
                        if word != "":
                            forms.append(
                                {
                                    "form": word,
                                    "tags": ["alternative", "kanji"],
                                }
                            )
                    elif child.tag == "small":
                        # A parenthesised qualifier for the preceding form.
                        qualifier = clean_node(wxr, None, child).strip("()")
                        if qualifier != "" and len(forms) > 0:
                            tag_key = (
                                "tags"
                                if qualifier in valid_tags
                                else "raw_tags"
                            )
                            data_append(forms[-1], tag_key, qualifier)
        data_extend(base_data, "forms", forms)

    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)