Coverage for src/wiktextract/extractor/en/page.py: 76%
1938 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Optional,
15 Set,
16 Union,
17 cast,
18)
20from mediawiki_langcodes import get_all_names, name_to_code
21from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
22from wikitextprocessor.parser import (
23 LEVEL_KIND_FLAGS,
24 GeneralNode,
25 HTMLNode,
26 LevelNode,
27 NodeKind,
28 TemplateNode,
29 WikiNode,
30)
32from ...clean import clean_template_args, clean_value
33from ...datautils import (
34 data_append,
35 data_extend,
36 ns_title_prefix_tuple,
37)
38from ...page import (
39 LEVEL_KINDS,
40 clean_node,
41 is_panel_template,
42 recursively_extract,
43)
44from ...tags import valid_tags
45from ...wxr_context import WiktextractContext
46from ...wxr_logging import logger
47from ..ruby import extract_ruby, parse_ruby
48from ..share import strip_nodes
49from .descendant import extract_descendant_section
50from .example import extract_example_list_item, extract_template_zh_x
51from .form_descriptions import (
52 classify_desc,
53 decode_tags,
54 distw,
55 parse_alt_or_inflection_of,
56 parse_sense_qualifier,
57 parse_word_head,
58)
59from .inflection import TableContext, parse_inflection_section
60from .info_templates import (
61 INFO_TEMPLATE_FUNCS,
62 parse_info_template_arguments,
63 parse_info_template_node,
64)
65from .linkages import (
66 extract_alt_form_section,
67 extract_zh_dial_template,
68 parse_linkage_item_text,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 LinkageData,
88 ReferenceData,
89 SenseData,
90 SoundData,
91 TemplateData,
92 WordData,
93)
94from .unsupported_titles import unsupported_title_map
96# When determining whether a string is 'english', classify_desc
97# might return 'taxonomic' which is English text 99% of the time.
98ENGLISH_TEXTS = ("english", "taxonomic")
100# Matches head tag
101HEAD_TAG_RE = re.compile(
102 r"^(head|Han char|arabic-noun|arabic-noun-form|"
103 r"hangul-symbol|syllable-hangul)$|"
104 + r"^(latin|"
105 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
106 + r")-("
107 + "|".join(
108 [
109 "abbr",
110 "adj",
111 "adjective",
112 "adjective form",
113 "adjective-form",
114 "adv",
115 "adverb",
116 "affix",
117 "animal command",
118 "art",
119 "article",
120 "aux",
121 "bound pronoun",
122 "bound-pronoun",
123 "Buyla",
124 "card num",
125 "card-num",
126 "cardinal",
127 "chunom",
128 "classifier",
129 "clitic",
130 "cls",
131 "cmene",
132 "cmavo",
133 "colloq-verb",
134 "colverbform",
135 "combining form",
136 "combining-form",
137 "comparative",
138 "con",
139 "concord",
140 "conj",
141 "conjunction",
142 "conjug",
143 "cont",
144 "contr",
145 "converb",
146 "daybox",
147 "decl",
148 "decl noun",
149 "def",
150 "dem",
151 "det",
152 "determ",
153 "Deva",
154 "ending",
155 "entry",
156 "form",
157 "fuhivla",
158 "gerund",
159 "gismu",
160 "hanja",
161 "hantu",
162 "hanzi",
163 "head",
164 "ideophone",
165 "idiom",
166 "inf",
167 "indef",
168 "infixed pronoun",
169 "infixed-pronoun",
170 "infl",
171 "inflection",
172 "initialism",
173 "int",
174 "interfix",
175 "interj",
176 "interjection",
177 "jyut",
178 "latin",
179 "letter",
180 "locative",
181 "lujvo",
182 "monthbox",
183 "mutverb",
184 "name",
185 "nisba",
186 "nom",
187 "noun",
188 "noun form",
189 "noun-form",
190 "noun plural",
191 "noun-plural",
192 "nounprefix",
193 "num",
194 "number",
195 "numeral",
196 "ord",
197 "ordinal",
198 "par",
199 "part",
200 "part form",
201 "part-form",
202 "participle",
203 "particle",
204 "past",
205 "past neg",
206 "past-neg",
207 "past participle",
208 "past-participle",
209 "perfect participle",
210 "perfect-participle",
211 "personal pronoun",
212 "personal-pronoun",
213 "pref",
214 "prefix",
215 "phrase",
216 "pinyin",
217 "plural noun",
218 "plural-noun",
219 "pos",
220 "poss-noun",
221 "post",
222 "postp",
223 "postposition",
224 "PP",
225 "pp",
226 "ppron",
227 "pred",
228 "predicative",
229 "prep",
230 "prep phrase",
231 "prep-phrase",
232 "preposition",
233 "present participle",
234 "present-participle",
235 "pron",
236 "prondem",
237 "pronindef",
238 "pronoun",
239 "prop",
240 "proper noun",
241 "proper-noun",
242 "proper noun form",
243 "proper-noun form",
244 "proper noun-form",
245 "proper-noun-form",
246 "prov",
247 "proverb",
248 "prpn",
249 "prpr",
250 "punctuation mark",
251 "punctuation-mark",
252 "regnoun",
253 "rel",
254 "rom",
255 "romanji",
256 "root",
257 "sign",
258 "suff",
259 "suffix",
260 "syllable",
261 "symbol",
262 "verb",
263 "verb form",
264 "verb-form",
265 "verbal noun",
266 "verbal-noun",
267 "verbnec",
268 "vform",
269 ]
270 )
271 + r")(-|/|\+|$)"
272)
274# Head-templates causing problems (like newlines) that can be squashed into
275# an empty string in the template handler while saving their template
276# data for later.
277WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
279FLOATING_TABLE_TEMPLATES: set[str] = {
280 # az-suffix-form creates a style=floatright div that is otherwise
281 # deleted; if it is not pre-expanded, we can intercept the template
282 # so we add this set into do_not_pre_expand, and intercept the
283 # templates in parse_part_of_speech
284 "az-suffix-forms",
285 "az-inf-p",
286 "kk-suffix-forms",
287 "ky-suffix-forms",
288 "tr-inf-p",
289 "tr-suffix-forms",
290 "tt-suffix-forms",
291 "uz-suffix-forms",
292}
293# These two should contain template names that should always be
294# pre-expanded when *first* processing the tree, or not pre-expanded
295# so that the template are left in place with their identifying
296# name intact for later filtering.
298DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
299DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
301# Additional templates to be expanded in the pre-expand phase
302ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
303 "multitrans",
304 "multitrans-nowiki",
305 "trans-top",
306 "trans-top-also",
307 "trans-bottom",
308 "checktrans-top",
309 "checktrans-bottom",
310 "col1",
311 "col2",
312 "col3",
313 "col4",
314 "col5",
315 "col1-u",
316 "col2-u",
317 "col3-u",
318 "col4-u",
319 "col5-u",
320 "check deprecated lang param usage",
321 "deprecated code",
322 "ru-verb-alt-ё",
323 "ru-noun-alt-ё",
324 "ru-adj-alt-ё",
325 "ru-proper noun-alt-ё",
326 "ru-pos-alt-ё",
327 "ru-alt-ё",
328 "inflection of",
329 "no deprecated lang param usage",
330 "transclude", # these produce sense entries (or other lists)
331 "tcl",
332}
334# Inverse linkage for those that have them
335linkage_inverses: dict[str, str] = {
336 # XXX this is not currently used, move to post-processing
337 "synonyms": "synonyms",
338 "hypernyms": "hyponyms",
339 "hyponyms": "hypernyms",
340 "holonyms": "meronyms",
341 "meronyms": "holonyms",
342 "derived": "derived_from",
343 "coordinate_terms": "coordinate_terms",
344 "troponyms": "hypernyms",
345 "antonyms": "antonyms",
346 "instances": "instance_of",
347 "related": "related",
348}
350# Templates that are used to form panels on pages and that
351# should be ignored in various positions
352PANEL_TEMPLATES: set[str] = {
353 "Character info",
354 "CJKV",
355 "French personal pronouns",
356 "French possessive adjectives",
357 "French possessive pronouns",
358 "Han etym",
359 "Japanese demonstratives",
360 "Latn-script",
361 "LDL",
362 "MW1913Abbr",
363 "Number-encoding",
364 "Nuttall",
365 "Spanish possessive adjectives",
366 "Spanish possessive pronouns",
367 "USRegionDisputed",
368 "Webster 1913",
369 "ase-rfr",
370 "attention",
371 "attn",
372 "beer",
373 "broken ref",
374 "ca-compass",
375 "character info",
376 "character info/var",
377 "checksense",
378 "compass-fi",
379 "copyvio suspected",
380 "delete",
381 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
382 "etystub",
383 "examples",
384 "hu-corr",
385 "hu-suff-pron",
386 "interwiktionary",
387 "ja-kanjitab",
388 "ko-hanja-search",
389 "look",
390 "maintenance box",
391 "maintenance line",
392 "mediagenic terms",
393 "merge",
394 "missing template",
395 "morse links",
396 "move",
397 "multiple images",
398 "no inline",
399 "picdic",
400 "picdicimg",
401 "picdiclabel",
402 "polyominoes",
403 "predidential nomics",
404 "punctuation", # This actually gets pre-expanded
405 "reconstructed",
406 "request box",
407 "rf-sound example",
408 "rfaccents",
409 "rfap",
410 "rfaspect",
411 "rfc",
412 "rfc-auto",
413 "rfc-header",
414 "rfc-level",
415 "rfc-pron-n",
416 "rfc-sense",
417 "rfclarify",
418 "rfd",
419 "rfd-redundant",
420 "rfd-sense",
421 "rfdate",
422 "rfdatek",
423 "rfdef",
424 "rfe",
425 "rfe/dowork",
426 "rfex",
427 "rfexp",
428 "rfform",
429 "rfgender",
430 "rfi",
431 "rfinfl",
432 "rfm",
433 "rfm-sense",
434 "rfp",
435 "rfp-old",
436 "rfquote",
437 "rfquote-sense",
438 "rfquotek",
439 "rfref",
440 "rfscript",
441 "rft2",
442 "rftaxon",
443 "rftone",
444 "rftranslit",
445 "rfv",
446 "rfv-etym",
447 "rfv-pron",
448 "rfv-quote",
449 "rfv-sense",
450 "selfref",
451 "split",
452 "stroke order", # XXX consider capturing this?
453 "stub entry",
454 "t-needed",
455 "tbot entry",
456 "tea room",
457 "tea room sense",
458 # "ttbc", - XXX needed in at least on/Preposition/Translation page
459 "unblock",
460 "unsupportedpage",
461 "video frames",
462 "was wotd",
463 "wrongtitle",
464 "zh-forms",
465 "zh-hanzi-box",
466 "no entry",
467}
469# Template name prefixes used for language-specific panel templates (i.e.,
470# templates that create side boxes or notice boxes or that should generally
471# be ignored).
472PANEL_PREFIXES: set[str] = {
473 "list:compass points/",
474 "list:Gregorian calendar months/",
475 "RQ:",
476}
478# Templates used for wikipedia links.
479wikipedia_templates: set[str] = {
480 "wikipedia",
481 "slim-wikipedia",
482 "w",
483 "W",
484 "swp",
485 "wiki",
486 "Wikipedia",
487 "wtorw",
488}
489for x in PANEL_PREFIXES & wikipedia_templates: 489 ↛ 490line 489 didn't jump to line 490 because the loop on line 489 never started
490 print(
491 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
492 x
493 )
494 )
496# Mapping from a template name (without language prefix) for the main word
497# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
498# it could validly occur. This is used as just a sanity check to give
499# warnings about probably incorrect coding in Wiktionary.
500template_allowed_pos_map: dict[str, list[str]] = {
501 "abbr": ["abbrev"],
502 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
503 "plural noun": ["noun", "name"],
504 "plural-noun": ["noun", "name"],
505 "proper noun": ["noun", "name"],
506 "proper-noun": ["name", "noun"],
507 "prop": ["name", "noun"],
508 "verb": ["verb", "phrase"],
509 "gerund": ["verb"],
510 "particle": ["adv", "particle"],
511 "adj": ["adj", "adj_noun"],
512 "pron": ["pron", "noun"],
513 "name": ["name", "noun"],
514 "adv": ["adv", "intj", "conj", "particle"],
515 "phrase": ["phrase", "prep_phrase"],
516 "noun phrase": ["phrase"],
517 "ordinal": ["num"],
518 "number": ["num"],
519 "pos": ["affix", "name", "num"],
520 "suffix": ["suffix", "affix"],
521 "character": ["character"],
522 "letter": ["character"],
523 "kanji": ["character"],
524 "cont": ["abbrev"],
525 "interj": ["intj"],
526 "con": ["conj"],
527 "part": ["particle"],
528 "prep": ["prep", "postp"],
529 "postp": ["postp"],
530 "misspelling": ["noun", "adj", "verb", "adv"],
531 "part-form": ["verb"],
532}
533for k, v in template_allowed_pos_map.items():
534 for x in v:
535 if x not in PARTS_OF_SPEECH: 535 ↛ 536line 535 didn't jump to line 536 because the condition on line 535 was never true
536 print(
537 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
538 "".format(x, k, v)
539 )
540 assert False
543# Templates ignored during etymology extraction, i.e., these will not be listed
544# in the extracted etymology templates.
545ignored_etymology_templates: list[str] = [
546 "...",
547 "IPAchar",
548 "ipachar",
549 "ISBN",
550 "isValidPageName",
551 "redlink category",
552 "deprecated code",
553 "check deprecated lang param usage",
554 "para",
555 "p",
556 "cite",
557 "Cite news",
558 "Cite newsgroup",
559 "cite paper",
560 "cite MLLM 1976",
561 "cite journal",
562 "cite news/documentation",
563 "cite paper/documentation",
564 "cite video game",
565 "cite video game/documentation",
566 "cite newsgroup",
567 "cite newsgroup/documentation",
568 "cite web/documentation",
569 "cite news",
570 "Cite book",
571 "Cite-book",
572 "cite book",
573 "cite web",
574 "cite-usenet",
575 "cite-video/documentation",
576 "Cite-journal",
577 "rfe",
578 "catlangname",
579 "cln",
580 "langname-lite",
581 "no deprecated lang param usage",
582 "mention",
583 "m",
584 "m-self",
585 "link",
586 "l",
587 "ll",
588 "l-self",
589]
590# Regexp for matching ignored etymology template names. This adds certain
591# prefixes to the names listed above.
592ignored_etymology_templates_re = re.compile(
593 r"^((cite-|R:|RQ:).*|"
594 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
595 + r")$"
596)
598# Regexp for matching ignored descendants template names. Right now we just
599# copy the ignored etymology templates
600ignored_descendants_templates_re = ignored_etymology_templates_re
602# Set of template names that are used to define usage examples. If the usage
603# example contains one of these templates, then it its type is set to
604# "example"
605usex_templates: set[str] = {
606 "afex",
607 "affixusex",
608 "co", # {{collocation}} acts like a example template, specifically for
609 # pairs of combinations of words that are more common than you'd
610 # except would be randomly; hlavní#Czech
611 "coi",
612 "collocation",
613 "el-example",
614 "el-x",
615 "example",
616 "examples",
617 "he-usex",
618 "he-x",
619 "hi-usex",
620 "hi-x",
621 "ja-usex-inline",
622 "ja-usex",
623 "ja-x",
624 "jbo-example",
625 "jbo-x",
626 "km-usex",
627 "km-x",
628 "ko-usex",
629 "ko-x",
630 "lo-usex",
631 "lo-x",
632 "ne-x",
633 "ne-usex",
634 "prefixusex",
635 "ryu-usex",
636 "ryu-x",
637 "shn-usex",
638 "shn-x",
639 "suffixusex",
640 "th-usex",
641 "th-x",
642 "ur-usex",
643 "ur-x",
644 "usex",
645 "usex-suffix",
646 "ux",
647 "uxi",
648}
650stop_head_at_these_templates: set[str] = {
651 "category",
652 "cat",
653 "topics",
654 "catlangname",
655 "c",
656 "C",
657 "top",
658 "cln",
659}
661# Set of template names that are used to define quotation examples. If the
662# usage example contains one of these templates, then its type is set to
663# "quotation".
664quotation_templates: set[str] = {
665 "collapse-quote",
666 "quote-av",
667 "quote-book",
668 "quote-GYLD",
669 "quote-hansard",
670 "quotei",
671 "quote-journal",
672 "quotelite",
673 "quote-mailing list",
674 "quote-meta",
675 "quote-newsgroup",
676 "quote-song",
677 "quote-text",
678 "quote",
679 "quote-us-patent",
680 "quote-video game",
681 "quote-web",
682 "quote-wikipedia",
683 "wikiquote",
684 "Wikiquote",
685}
687taxonomy_templates = {
688 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
689 "taxfmt",
690 "taxlink",
691 "taxlink2",
692 "taxlinknew",
693 "taxlook",
694}
696# Template name component to linkage section listing. Integer section means
697# default section, starting at that argument.
698# XXX not used anymore, except for the first elements: moved to
699# template_linkages
700# template_linkage_mappings: list[list[Union[str, int]]] = [
701# ["syn", "synonyms"],
702# ["synonyms", "synonyms"],
703# ["ant", "antonyms"],
704# ["antonyms", "antonyms"],
705# ["hyp", "hyponyms"],
706# ["hyponyms", "hyponyms"],
707# ["der", "derived"],
708# ["derived terms", "derived"],
709# ["coordinate terms", "coordinate_terms"],
710# ["rel", "related"],
711# ["col", 2],
712# ]
714# Template names, this was exctracted from template_linkage_mappings,
715# because the code using template_linkage_mappings was actually not used
716# (but not removed).
717template_linkages_to_ignore_in_examples: set[str] = {
718 "syn",
719 "synonyms",
720 "ant",
721 "antonyms",
722 "hyp",
723 "hyponyms",
724 "der",
725 "derived terms",
726 "coordinate terms",
727 "cot",
728 "rel",
729 "col",
730 "inline alt forms",
731 "alti",
732 "comeronyms",
733 "holonyms",
734 "holo",
735 "hypernyms",
736 "hyper",
737 "meronyms",
738 "mero",
739 "troponyms",
740 "perfectives",
741 "pf",
742 "imperfectives",
743 "impf",
744 "syndiff",
745 "synsee",
746 # not linkage nor example templates
747 "sense",
748 "s",
749 "color panel",
750 "colour panel",
751}
753# Maps template name used in a word sense to a linkage field that it adds.
754sense_linkage_templates: dict[str, str] = {
755 "syn": "synonyms",
756 "synonyms": "synonyms",
757 "synsee": "synonyms",
758 "syndiff": "synonyms",
759 "hyp": "hyponyms",
760 "hyponyms": "hyponyms",
761 "ant": "antonyms",
762 "antonyms": "antonyms",
763 "alti": "related",
764 "inline alt forms": "related",
765 "coordinate terms": "coordinate_terms",
766 "cot": "coordinate_terms",
767 "comeronyms": "related",
768 "holonyms": "holonyms",
769 "holo": "holonyms",
770 "hypernyms": "hypernyms",
771 "hyper": "hypernyms",
772 "meronyms": "meronyms",
773 "mero": "meronyms",
774 "troponyms": "troponyms",
775 "perfectives": "related",
776 "pf": "related",
777 "imperfectives": "related",
778 "impf": "related",
779}
781sense_linkage_templates_tags: dict[str, list[str]] = {
782 "alti": ["alternative"],
783 "inline alt forms": ["alternative"],
784 "comeronyms": ["comeronym"],
785 "perfectives": ["perfective"],
786 "pf": ["perfective"],
787 "imperfectives": ["imperfective"],
788 "impf": ["imperfective"],
789}
792def decode_html_entities(v: Union[str, int]) -> str:
793 """Decodes HTML entities from a value, converting them to the respective
794 Unicode characters/strings."""
795 if isinstance(v, int):
796 # I changed this to return str(v) instead of v = str(v),
797 # but there might have been the intention to have more logic
798 # here. html.unescape would not do anything special with an integer,
799 # it needs html escape symbols (&xx;).
800 return str(v)
801 return html.unescape(v)
804def parse_sense_linkage(
805 wxr: WiktextractContext,
806 data: SenseData,
807 name: str,
808 ht: TemplateArgs,
809 pos: str,
810) -> None:
811 """Parses a linkage (synonym, etc) specified in a word sense."""
812 assert isinstance(wxr, WiktextractContext)
813 assert isinstance(data, dict)
814 assert isinstance(name, str)
815 assert isinstance(ht, dict)
816 field = sense_linkage_templates[name]
817 field_tags = sense_linkage_templates_tags.get(name, [])
818 for i in range(2, 20):
819 w = ht.get(i) or ""
820 w = clean_node(wxr, data, w)
821 is_thesaurus = False
822 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
823 if w.startswith(alias): 823 ↛ 824line 823 didn't jump to line 824 because the condition on line 823 was never true
824 is_thesaurus = True
825 w = w[len(alias) :]
826 if w != wxr.wtp.title:
827 from ...thesaurus import search_thesaurus
829 lang_code = clean_node(wxr, None, ht.get(1, ""))
830 for t_data in search_thesaurus(
831 wxr.thesaurus_db_conn, w, lang_code, pos, field
832 ):
833 l_data = {
834 "word": t_data.term,
835 "source": "Thesaurus:" + w,
836 }
837 if len(t_data.tags) > 0:
838 l_data["tags"] = t_data.tags
839 if len(t_data.raw_tags) > 0:
840 l_data["raw_tags"] = t_data.raw_tags
841 data_append(data, field, l_data)
842 break
843 if not w:
844 break
845 if is_thesaurus: 845 ↛ 846line 845 didn't jump to line 846 because the condition on line 845 was never true
846 continue
847 tags: list[str] = []
848 topics: list[str] = []
849 english: Optional[str] = None
850 # Try to find qualifiers for this synonym
851 q = ht.get("q{}".format(i - 1))
852 if q:
853 cls = classify_desc(q)
854 if cls == "tags":
855 tagsets1, topics1 = decode_tags(q)
856 for ts in tagsets1:
857 tags.extend(ts)
858 topics.extend(topics1)
859 elif cls == "english": 859 ↛ 865line 859 didn't jump to line 865 because the condition on line 859 was always true
860 if english: 860 ↛ 861line 860 didn't jump to line 861 because the condition on line 860 was never true
861 english += "; " + q
862 else:
863 english = q
864 # Try to find English translation for this synonym
865 t = ht.get("t{}".format(i - 1))
866 if t: 866 ↛ 867line 866 didn't jump to line 867 because the condition on line 866 was never true
867 if english:
868 english += "; " + t
869 else:
870 english = t
872 # See if the linkage contains a parenthesized alt
873 alt = None
874 m = re.search(r"\(([^)]+)\)$", w)
875 if m: 875 ↛ 876line 875 didn't jump to line 876 because the condition on line 875 was never true
876 w = w[: m.start()].strip()
877 alt = m.group(1)
879 dt = {"word": w}
880 if field_tags: 880 ↛ 881line 880 didn't jump to line 881 because the condition on line 880 was never true
881 data_extend(dt, "tags", field_tags)
882 if tags:
883 data_extend(dt, "tags", tags)
884 if topics: 884 ↛ 885line 884 didn't jump to line 885 because the condition on line 884 was never true
885 data_extend(dt, "topics", topics)
886 if english:
887 dt["english"] = english # DEPRECATED for "translation"
888 dt["translation"] = english
889 if alt: 889 ↛ 890line 889 didn't jump to line 890 because the condition on line 889 was never true
890 dt["alt"] = alt
891 data_append(data, field, dt)
894EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
895example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
896captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
899def synch_splits_with_args(
900 line: str, targs: TemplateArgs
901) -> Optional[list[str]]:
902 """If it looks like there's something weird with how a line of example
903 text has been split, this function will do the splitting after counting
904 occurences of the splitting regex inside the two main template arguments
905 containing the string data for the original language example and the
906 English translations.
907 """
908 # Previously, we split without capturing groups, but here we want to
909 # keep the original splitting hyphen regex intact.
910 fparts = captured_splitters_re.split(line)
911 new_parts = []
912 # ["First", " – ", "second", " – ", "third..."] from OL argument
913 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
914 new_parts.append("".join(fparts[:first]))
915 # Translation argument
916 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
917 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
918 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
919 new_parts.append("".join(fparts[first + 1 : second]))
921 if all(new_parts): # no empty strings from the above spaghetti
922 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
923 return new_parts
924 else:
925 return None
928QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
929QUALIFIERS_RE = re.compile(QUALIFIERS)
930# (...): ... or (...(...)...): ...
933def parse_language(
934 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
935) -> list[WordData]:
936 """Iterates over the text of the page, returning words (parts-of-speech)
937 defined on the page one at a time. (Individual word senses for the
938 same part-of-speech are typically encoded in the same entry.)"""
939 # imported here to avoid circular import
940 from .pronunciation import parse_pronunciation
942 assert isinstance(wxr, WiktextractContext)
943 assert isinstance(langnode, WikiNode)
944 assert isinstance(language, str)
945 assert isinstance(lang_code, str)
946 # print("parse_language", language)
948 is_reconstruction = False
949 word: str = wxr.wtp.title # type: ignore[assignment]
950 unsupported_prefix = "Unsupported titles/"
951 if word.startswith(unsupported_prefix):
952 w = word[len(unsupported_prefix) :]
953 if w in unsupported_title_map: 953 ↛ 956line 953 didn't jump to line 956 because the condition on line 953 was always true
954 word = unsupported_title_map[w]
955 else:
956 wxr.wtp.error(
957 "Unimplemented unsupported title: {}".format(word),
958 sortid="page/870",
959 )
960 word = w
961 elif word.startswith("Reconstruction:"):
962 word = word[word.find("/") + 1 :]
963 is_reconstruction = True
965 base_data: WordData = {
966 "word": word,
967 "lang": language,
968 "lang_code": lang_code,
969 }
970 if is_reconstruction:
971 data_append(base_data, "tags", "reconstruction")
972 sense_data: SenseData = {}
973 pos_data: WordData = {} # For a current part-of-speech
974 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
975 etym_data: WordData = {} # For one etymology
976 sense_datas: list[SenseData] = []
977 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
978 # Never reset, do not use as data
979 level_four_datas: list[WordData] = []
980 etym_datas: list[WordData] = []
981 page_datas: list[WordData] = []
982 have_etym = False
983 inside_level_four = False # This is for checking if the etymology section
984 # or article has a Pronunciation section, for Chinese mostly; because
985 # Chinese articles can have three level three sections (two etymology
986 # sections and pronunciation sections) one after another, we need a kludge
987 # to better keep track of whether we're in a normal "etym" or inside a
988 # "level four" (which is what we've turned the level three Pron sections
989 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
990 # a step.
991 stack: list[str] = [] # names of items on the "stack"
993 def merge_base(data: WordData, base: WordData) -> None:
994 for k, v in base.items():
995 # Copy the value to ensure that we don't share lists or
996 # dicts between structures (even nested ones).
997 v = copy.deepcopy(v)
998 if k not in data:
999 # The list was copied above, so this will not create shared ref
1000 data[k] = v # type: ignore[literal-required]
1001 continue
1002 if data[k] == v: # type: ignore[literal-required]
1003 continue
1004 if ( 1004 ↛ 1012line 1004 didn't jump to line 1012 because the condition on line 1004 was always true
1005 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1006 or isinstance(
1007 v,
1008 (list, tuple), # Should this be "and"?
1009 )
1010 ):
1011 data[k] = list(data[k]) + list(v) # type: ignore
1012 elif data[k] != v: # type: ignore[literal-required]
1013 wxr.wtp.warning(
1014 "conflicting values for {} in merge_base: "
1015 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1016 sortid="page/904",
1017 )
1019 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1020 """Remove unnecessary keys from dict values
1021 in a list comprehension..."""
1022 if key in pron:
1023 pron.pop(key) # type: ignore
1024 return pron
1026 # If the result has sounds, eliminate sounds that have a prefix that
1027 # does not match "word" or one of "forms"
1028 if "sounds" in data and "word" in data:
1029 accepted = [data["word"]]
1030 accepted.extend(f["form"] for f in data.get("forms", dict()))
1031 data["sounds"] = list(
1032 s
1033 for s in data["sounds"]
1034 if "form" not in s or s["form"] in accepted
1035 )
1036 # If the result has sounds, eliminate sounds that have a pos that
1037 # does not match "pos"
1038 if "sounds" in data and "pos" in data:
1039 data["sounds"] = list(
1040 complementary_pop(s, "pos")
1041 for s in data["sounds"]
1042 # "pos" is not a field of SoundData, correctly, so we're
1043 # removing it here. It's a kludge on a kludge on a kludge.
1044 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1045 )
1047 def push_sense(sorting_ordinal: int | None = None) -> bool:
1048 """Starts collecting data for a new word sense. This returns True
1049 if a sense was added."""
1050 nonlocal sense_data
1051 if sorting_ordinal is None:
1052 sorting_ordinal = sense_ordinal
1053 tags = sense_data.get("tags", ())
1054 if (
1055 not sense_data.get("glosses")
1056 and "translation-hub" not in tags
1057 and "no-gloss" not in tags
1058 ):
1059 return False
1061 if ( 1061 ↛ 1071line 1061 didn't jump to line 1071 because the condition on line 1061 was never true
1062 (
1063 "participle" in sense_data.get("tags", ())
1064 or "infinitive" in sense_data.get("tags", ())
1065 )
1066 and "alt_of" not in sense_data
1067 and "form_of" not in sense_data
1068 and "etymology_text" in etym_data
1069 and etym_data["etymology_text"] != ""
1070 ):
1071 etym = etym_data["etymology_text"]
1072 etym = etym.split(". ")[0]
1073 ret = parse_alt_or_inflection_of(wxr, etym, set())
1074 if ret is not None:
1075 tags, lst = ret
1076 assert isinstance(lst, (list, tuple))
1077 if "form-of" in tags:
1078 data_extend(sense_data, "form_of", lst)
1079 data_extend(sense_data, "tags", tags)
1080 elif "alt-of" in tags:
1081 data_extend(sense_data, "alt_of", lst)
1082 data_extend(sense_data, "tags", tags)
1084 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1084 ↛ 1087line 1084 didn't jump to line 1087 because the condition on line 1084 was never true
1085 "tags", ()
1086 ):
1087 data_append(sense_data, "tags", "no-gloss")
1089 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal
1090 sense_datas.append(sense_data)
1091 sense_data = {}
1092 return True
1094 def push_pos(sorting_ordinal: int | None = None) -> None:
1095 """Starts collecting data for a new part-of-speech."""
1096 nonlocal pos_data
1097 nonlocal sense_datas
1098 push_sense(sorting_ordinal)
1099 if wxr.wtp.subsection:
1100 data: WordData = {"senses": sense_datas}
1101 merge_base(data, pos_data)
1102 level_four_datas.append(data)
1103 pos_data = {}
1104 sense_datas = []
1105 wxr.wtp.start_subsection(None)
1107 def push_level_four_section(clear_sound_data: bool) -> None:
1108 """Starts collecting data for a new level four sections, which
1109 is usually virtual and empty, unless the article has Chinese
1110 'Pronunciation' sections that are etymology-section-like but
1111 under etymology, and at the same level in the source. We modify
1112 the source to demote Pronunciation sections like that to level
1113 4, and other sections one step lower."""
1114 nonlocal level_four_data
1115 nonlocal level_four_datas
1116 nonlocal etym_datas
1117 push_pos()
1118 # print(f"======\n{etym_data=}")
1119 # print(f"======\n{etym_datas=}")
1120 # print(f"======\n{level_four_data=}")
1121 # print(f"======\n{level_four_datas=}")
1122 for data in level_four_datas:
1123 merge_base(data, level_four_data)
1124 etym_datas.append(data)
1125 for data in etym_datas:
1126 merge_base(data, etym_data)
1127 page_datas.append(data)
1128 if clear_sound_data:
1129 level_four_data = {}
1130 level_four_datas = []
1131 etym_datas = []
1133 def push_etym() -> None:
1134 """Starts collecting data for a new etymology."""
1135 nonlocal etym_data
1136 nonlocal etym_datas
1137 nonlocal have_etym
1138 nonlocal inside_level_four
1139 have_etym = True
1140 push_level_four_section(False)
1141 inside_level_four = False
1142 # etymology section could under pronunciation section
1143 etym_data = (
1144 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1145 )
1147 def select_data() -> WordData:
1148 """Selects where to store data (pos or etym) based on whether we
1149 are inside a pos (part-of-speech)."""
1150 # print(f"{wxr.wtp.subsection=}")
1151 # print(f"{stack=}")
1152 if wxr.wtp.subsection is not None:
1153 return pos_data
1154 if inside_level_four:
1155 return level_four_data
1156 if stack[-1] == language:
1157 return base_data
1158 return etym_data
1160 term_label_templates: list[TemplateData] = []
1162 def head_post_template_fn(
1163 name: str, ht: TemplateArgs, expansion: str
1164 ) -> Optional[str]:
1165 """Handles special templates in the head section of a word. Head
1166 section is the text after part-of-speech subtitle and before word
1167 sense list. Typically it generates the bold line for the word, but
1168 may also contain other useful information that often ends in
1169 side boxes. We want to capture some of that additional information."""
1170 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1171 if is_panel_template(wxr, name): 1171 ↛ 1174line 1171 didn't jump to line 1174 because the condition on line 1171 was never true
1172 # Completely ignore these templates (not even recorded in
1173 # head_templates)
1174 return ""
1175 if name == "head":
1176 # XXX are these also captured in forms? Should this special case
1177 # be removed?
1178 t = ht.get(2, "")
1179 if t == "pinyin": 1179 ↛ 1180line 1179 didn't jump to line 1180 because the condition on line 1179 was never true
1180 data_append(pos_data, "tags", "Pinyin")
1181 elif t == "romanization": 1181 ↛ 1182line 1181 didn't jump to line 1182 because the condition on line 1181 was never true
1182 data_append(pos_data, "tags", "romanization")
1183 if (
1184 HEAD_TAG_RE.search(name) is not None
1185 or name in WORD_LEVEL_HEAD_TEMPLATES
1186 ):
1187 args_ht = clean_template_args(wxr, ht)
1188 cleaned_expansion = clean_node(wxr, None, expansion)
1189 dt: TemplateData = {
1190 "name": name,
1191 "args": args_ht,
1192 "expansion": cleaned_expansion,
1193 }
1194 data_append(pos_data, "head_templates", dt)
1195 if name in WORD_LEVEL_HEAD_TEMPLATES:
1196 term_label_templates.append(dt)
1197 # Squash these, their tags are applied to the whole word,
1198 # and some cause problems like "term-label"
1199 return ""
1201 # The following are both captured in head_templates and parsed
1202 # separately
1204 if name in wikipedia_templates:
1205 # Note: various places expect to have content from wikipedia
1206 # templates, so cannot convert this to empty
1207 parse_wikipedia_template(wxr, pos_data, ht)
1208 return None
1210 if name == "number box": 1210 ↛ 1212line 1210 didn't jump to line 1212 because the condition on line 1210 was never true
1211 # XXX extract numeric value?
1212 return ""
1213 if name == "enum":
1214 # XXX extract?
1215 return ""
1216 if name == "cardinalbox": 1216 ↛ 1219line 1216 didn't jump to line 1219 because the condition on line 1216 was never true
1217 # XXX extract similar to enum?
1218 # XXX this can also occur in top-level under language
1219 return ""
1220 if name == "Han simplified forms": 1220 ↛ 1222line 1220 didn't jump to line 1222 because the condition on line 1220 was never true
1221 # XXX extract?
1222 return ""
1223 # if name == "ja-kanji forms":
1224 # # XXX extract?
1225 # return ""
1226 # if name == "vi-readings":
1227 # # XXX extract?
1228 # return ""
1229 # if name == "ja-kanji":
1230 # # XXX extract?
1231 # return ""
1232 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 1232 ↛ 1234line 1232 didn't jump to line 1234 because the condition on line 1232 was never true
1233 # XXX extract?
1234 return ""
1236 return None
1238 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1239 """Parses the subsection for a part-of-speech under a language on
1240 a page."""
1241 assert isinstance(posnode, WikiNode)
1242 assert isinstance(pos, str)
1243 # print("parse_part_of_speech", pos)
1244 pos_data["pos"] = pos
1245 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1246 lists: list[list[WikiNode]] = [[]] # list of lists
1247 first_para = True
1248 first_head_tmplt = True
1249 collecting_head = True
1250 start_of_paragraph = True
1252 # XXX extract templates from posnode with recursively_extract
1253 # that break stuff, like ja-kanji or az-suffix-form.
1254 # Do the extraction with a list of template names, combined from
1255 # different lists, then separate out them into different lists
1256 # that are handled at different points of the POS section.
1257 # First, extract az-suffix-form, put it in `inflection`,
1258 # and parse `inflection`'s content when appropriate later.
1259 # The contents of az-suffix-form (and ja-kanji) that generate
1260 # divs with "floatright" in their style gets deleted by
1261 # clean_value, so templates that slip through from here won't
1262 # break anything.
1263 # XXX bookmark
1264 # print("===================")
1265 # print(posnode.children)
1267 floaters, poschildren = recursively_extract(
1268 posnode.children,
1269 lambda x: (
1270 isinstance(x, WikiNode)
1271 and (
1272 (
1273 x.kind == NodeKind.TEMPLATE
1274 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
1275 )
1276 or (
1277 x.kind == NodeKind.LINK
1278 # Need to check for stringiness because some links are
1279 # broken; for example, if a template is missing an
1280 # argument, a link might look like `[[{{{1}}}...]]`
1281 and isinstance(x.largs[0][0], str)
1282 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1283 )
1284 )
1285 ),
1286 )
1287 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1288 tempnode.largs = [["Inflection"]]
1289 tempnode.children = floaters
1290 parse_inflection(tempnode, "Floating Div", pos)
1291 # print(poschildren)
1292 # XXX new above
1294 if not poschildren: 1294 ↛ 1295line 1294 didn't jump to line 1295 because the condition on line 1294 was never true
1295 if not floaters:
1296 wxr.wtp.debug(
1297 "PoS section without contents",
1298 sortid="en/page/1051/20230612",
1299 )
1300 else:
1301 wxr.wtp.debug(
1302 "PoS section without contents except for a floating table",
1303 sortid="en/page/1056/20230612",
1304 )
1305 return
1307 for node in poschildren:
1308 if isinstance(node, str):
1309 for m in re.finditer(r"\n+|[^\n]+", node):
1310 p = m.group(0)
1311 if p.startswith("\n\n") and pre:
1312 first_para = False
1313 start_of_paragraph = True
1314 break
1315 if p and collecting_head:
1316 pre[-1].append(p)
1317 continue
1318 assert isinstance(node, WikiNode)
1319 kind = node.kind
1320 if kind == NodeKind.LIST:
1321 lists[-1].append(node)
1322 collecting_head = False
1323 start_of_paragraph = True
1324 continue
1325 elif kind in LEVEL_KINDS:
1326 # Stop parsing section if encountering any kind of
1327 # level header (like ===Noun=== or ====Further Reading====).
1328 # At a quick glance, this should be the default behavior,
1329 # but if some kinds of source articles have sub-sub-sections
1330 # that should be parsed XXX it should be handled by changing
1331 # this break.
1332 break
1333 elif collecting_head and kind == NodeKind.LINK:
1334 # We might collect relevant links as they are often pictures
1335 # relating to the word
1336 if len(node.largs[0]) >= 1 and isinstance( 1336 ↛ 1351line 1336 didn't jump to line 1351 because the condition on line 1336 was always true
1337 node.largs[0][0], str
1338 ):
1339 if node.largs[0][0].startswith( 1339 ↛ 1345line 1339 didn't jump to line 1345 because the condition on line 1339 was never true
1340 ns_title_prefix_tuple(wxr, "Category")
1341 ):
1342 # [[Category:...]]
1343 # We're at the end of the file, probably, so stop
1344 # here. Otherwise the head will get garbage.
1345 break
1346 if node.largs[0][0].startswith( 1346 ↛ 1351line 1346 didn't jump to line 1351 because the condition on line 1346 was always true
1347 ns_title_prefix_tuple(wxr, "File")
1348 ):
1349 # Skips file links
1350 continue
1351 start_of_paragraph = False
1352 pre[-1].extend(node.largs[-1])
1353 elif kind == NodeKind.HTML:
1354 if node.sarg == "br":
1355 if pre[-1]: 1355 ↛ 1307line 1355 didn't jump to line 1307 because the condition on line 1355 was always true
1356 pre.append([]) # Switch to next head
1357 lists.append([]) # Lists parallels pre
1358 collecting_head = True
1359 start_of_paragraph = True
1360 elif collecting_head and node.sarg not in ( 1360 ↛ 1366line 1360 didn't jump to line 1366 because the condition on line 1360 was never true
1361 "gallery",
1362 "ref",
1363 "cite",
1364 "caption",
1365 ):
1366 start_of_paragraph = False
1367 pre[-1].append(node)
1368 else:
1369 start_of_paragraph = False
1370 elif isinstance(node, TemplateNode):
1371 # XXX Insert code here that disambiguates between
1372 # templates that generate word heads and templates
1373 # that don't.
1374 # There's head_tag_re that seems like a regex meant
1375 # to identify head templates. Too bad it's None.
1377 # ignore {{category}}, {{cat}}... etc.
1378 if node.template_name in stop_head_at_these_templates:
1379 # we've reached a template that should be at the end,
1380 continue
1382 # skip these templates; panel_templates is already used
1383 # to skip certain templates else, but it also applies to
1384 # head parsing quite well.
1385 # node.largs[0][0] should always be str, but can't type-check
1386 # that.
1387 if is_panel_template(wxr, node.template_name):
1388 continue
1389 # skip these templates
1390 # if node.largs[0][0] in skip_these_templates_in_head:
1391 # first_head_tmplt = False # no first_head_tmplt at all
1392 # start_of_paragraph = False
1393 # continue
1395 if first_head_tmplt and pre[-1]:
1396 first_head_tmplt = False
1397 start_of_paragraph = False
1398 pre[-1].append(node)
1399 elif pre[-1] and start_of_paragraph:
1400 pre.append([]) # Switch to the next head
1401 lists.append([]) # lists parallel pre
1402 collecting_head = True
1403 start_of_paragraph = False
1404 pre[-1].append(node)
1405 else:
1406 pre[-1].append(node)
1407 elif first_para:
1408 start_of_paragraph = False
1409 if collecting_head: 1409 ↛ 1307line 1409 didn't jump to line 1307 because the condition on line 1409 was always true
1410 pre[-1].append(node)
1411 # XXX use template_fn in clean_node to check that the head macro
1412 # is compatible with the current part-of-speech and generate warning
1413 # if not. Use template_allowed_pos_map.
1415 # Clean up empty pairs, and fix messes with extra newlines that
1416 # separate templates that are followed by lists wiktextract issue #314
1418 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1419 cleaned_lists: list[list[WikiNode]] = []
1420 pairless_pre_index = None
1422 for pre1, ls in zip(pre, lists):
1423 if pre1 and not ls:
1424 pairless_pre_index = len(cleaned_pre)
1425 if not pre1 and not ls: 1425 ↛ 1427line 1425 didn't jump to line 1427 because the condition on line 1425 was never true
1426 # skip [] + []
1427 continue
1428 if not ls and all(
1429 (isinstance(x, str) and not x.strip()) for x in pre1
1430 ):
1431 # skip ["\n", " "] + []
1432 continue
1433 if ls and not pre1:
1434 if pairless_pre_index is not None: 1434 ↛ 1435line 1434 didn't jump to line 1435 because the condition on line 1434 was never true
1435 cleaned_lists[pairless_pre_index] = ls
1436 pairless_pre_index = None
1437 continue
1438 cleaned_pre.append(pre1)
1439 cleaned_lists.append(ls)
1441 pre = cleaned_pre
1442 lists = cleaned_lists
1444 there_are_many_heads = len(pre) > 1
1445 header_tags: list[str] = []
1446 header_topics: list[str] = []
1447 previous_head_had_list = False
1449 if not any(g for g in lists):
1450 process_gloss_without_list(
1451 poschildren, pos, pos_data, header_tags, header_topics
1452 )
1453 else:
1454 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1455 # if len(ls) == 0:
1456 # # don't have gloss list
1457 # # XXX add code here to filter out 'garbage', like text
1458 # # that isn't a head template or head.
1459 # continue
1461 if all(not sl for sl in lists[i:]):
1462 if i == 0: 1462 ↛ 1463line 1462 didn't jump to line 1463 because the condition on line 1462 was never true
1463 if isinstance(node, str):
1464 wxr.wtp.debug(
1465 "first head without list of senses,"
1466 "string: '{}[...]', {}/{}".format(
1467 node[:20], word, language
1468 ),
1469 sortid="page/1689/20221215",
1470 )
1471 if isinstance(node, WikiNode):
1472 if node.largs and node.largs[0][0] in [
1473 "Han char",
1474 ]:
1475 # just ignore these templates
1476 pass
1477 else:
1478 wxr.wtp.debug(
1479 "first head without "
1480 "list of senses, "
1481 "template node "
1482 "{}, {}/{}".format(
1483 node.largs, word, language
1484 ),
1485 sortid="page/1694/20221215",
1486 )
1487 else:
1488 wxr.wtp.debug(
1489 "first head without list of senses, "
1490 "{}/{}".format(word, language),
1491 sortid="page/1700/20221215",
1492 )
1493 # no break here so that the first head always
1494 # gets processed.
1495 else:
1496 if isinstance(node, str): 1496 ↛ 1497line 1496 didn't jump to line 1497 because the condition on line 1496 was never true
1497 wxr.wtp.debug(
1498 "later head without list of senses,"
1499 "string: '{}[...]', {}/{}".format(
1500 node[:20], word, language
1501 ),
1502 sortid="page/1708/20221215",
1503 )
1504 if isinstance(node, WikiNode): 1504 ↛ 1516line 1504 didn't jump to line 1516 because the condition on line 1504 was always true
1505 wxr.wtp.debug(
1506 "later head without list of senses,"
1507 "template node "
1508 "{}, {}/{}".format(
1509 node.sarg if node.sarg else node.largs,
1510 word,
1511 language,
1512 ),
1513 sortid="page/1713/20221215",
1514 )
1515 else:
1516 wxr.wtp.debug(
1517 "later head without list of senses, "
1518 "{}/{}".format(word, language),
1519 sortid="page/1719/20221215",
1520 )
1521 break
1522 head_group = i + 1 if there_are_many_heads else None
1523 # print("parse_part_of_speech: {}: {}: pre={}"
1524 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1526 if previous_head_had_list:
1527 # We use a boolean flag here because we want to be able
1528 # let the header_tags data pass through after the loop
1529 # is over without accidentally emptying it, if there are
1530 # no pos_datas and we need a dummy data.
1531 header_tags.clear()
1532 header_topics.clear()
1534 process_gloss_header(
1535 pre1, pos, head_group, pos_data, header_tags, header_topics
1536 )
1537 for ln in ls:
1538 # Parse each list associated with this head.
1539 for node in ln.children:
1540 # Parse nodes in l.children recursively.
1541 # The recursion function uses push_sense() to
1542 # add stuff into sense_datas, and returns True or
1543 # False if something is added, which bubbles upward.
1544 # If the bubble is "True", then higher levels of
1545 # the recursion will not push_sense(), because
1546 # the data is already pushed into a sub-gloss
1547 # downstream, unless the higher level has examples
1548 # that need to be put somewhere.
1549 common_data: SenseData = {
1550 "tags": list(header_tags),
1551 "topics": list(header_topics),
1552 }
1553 if head_group:
1554 common_data["head_nr"] = head_group
1555 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1557 if len(ls) > 0:
1558 previous_head_had_list = True
1559 else:
1560 previous_head_had_list = False
1562 # If there are no senses extracted, add a dummy sense. We want to
1563 # keep tags extracted from the head for the dummy sense.
1564 push_sense() # Make sure unfinished data pushed, and start clean sense
1565 if len(sense_datas) == 0:
1566 data_extend(sense_data, "tags", header_tags)
1567 data_extend(sense_data, "topics", header_topics)
1568 data_append(sense_data, "tags", "no-gloss")
1569 push_sense()
1571 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))
1573 for sd in sense_datas:
1574 if "__temp_sense_sorting_ordinal" in sd: 1574 ↛ 1573line 1574 didn't jump to line 1573 because the condition on line 1574 was always true
1575 del sd["__temp_sense_sorting_ordinal"]
1577 def process_gloss_header(
1578 header_nodes: list[Union[WikiNode, str]],
1579 pos_type: str,
1580 header_group: Optional[int],
1581 pos_data: WordData,
1582 header_tags: list[str],
1583 header_topics: list[str],
1584 ) -> None:
1585 ruby = []
1586 links: list[str] = []
1588 # process template parse nodes here
1589 new_nodes = []
1590 info_template_data = []
1591 for node in header_nodes:
1592 # print(f"{node=}")
1593 info_data, info_out = parse_info_template_node(wxr, node, "head")
1594 if info_data or info_out:
1595 if info_data: 1595 ↛ 1597line 1595 didn't jump to line 1597 because the condition on line 1595 was always true
1596 info_template_data.append(info_data)
1597 if info_out: # including just the original node 1597 ↛ 1598line 1597 didn't jump to line 1598 because the condition on line 1597 was never true
1598 new_nodes.append(info_out)
1599 else:
1600 new_nodes.append(node)
1601 header_nodes = new_nodes
1603 if info_template_data:
1604 if "info_templates" not in pos_data: 1604 ↛ 1607line 1604 didn't jump to line 1607 because the condition on line 1604 was always true
1605 pos_data["info_templates"] = info_template_data
1606 else:
1607 pos_data["info_templates"].extend(info_template_data)
1609 if not word.isalnum():
1610 # `-` is kosher, add more of these if needed.
1611 if word.replace("-", "").isalnum():
1612 pass
1613 else:
1614 # if the word contains non-letter or -number characters, it
1615 # might have something that messes with split-at-semi-comma; we
1616 # collect links so that we can skip splitting them.
1617 exp = wxr.wtp.parse(
1618 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1619 )
1620 link_nodes, _ = recursively_extract(
1621 exp.children,
1622 lambda x: isinstance(x, WikiNode)
1623 and x.kind == NodeKind.LINK,
1624 )
1625 for ln in link_nodes:
1626 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1627 if not ltext.isalnum():
1628 links.append(ltext)
1629 if word not in links: 1629 ↛ 1632line 1629 didn't jump to line 1632 because the condition on line 1629 was always true
1630 links.append(word)
1632 if lang_code == "ja":
1633 exp = wxr.wtp.parse(
1634 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1635 )
1636 rub, _ = recursively_extract(
1637 exp.children,
1638 lambda x: isinstance(x, WikiNode)
1639 and x.kind == NodeKind.HTML
1640 and x.sarg == "ruby",
1641 )
1642 if rub is not None: 1642 ↛ 1687line 1642 didn't jump to line 1687 because the condition on line 1642 was always true
1643 for r in rub:
1644 if TYPE_CHECKING:
1645 # we know the lambda above in recursively_extract
1646 # returns only WikiNodes in rub
1647 assert isinstance(r, WikiNode)
1648 rt = parse_ruby(wxr, r)
1649 if rt is not None:
1650 ruby.append(rt)
1651 elif lang_code == "vi":
1652 # Handle vi-readings templates that have a weird structures for
1653 # Chu Nom vietnamese characters heads
1654 # https://en.wiktionary.org/wiki/Template:vi-readings
1655 new_header_nodes = []
1656 related_readings: list[LinkageData] = []
1657 for node in header_nodes:
1658 if ( 1658 ↛ 1682line 1658 didn't jump to line 1682 because the condition on line 1658 was always true
1659 isinstance(node, TemplateNode)
1660 and node.template_name == "vi-readings"
1661 ):
1662 print(node.template_parameters)
1663 for parameter, tag in (
1664 ("hanviet", "han-viet-reading"),
1665 ("nom", "nom-reading"),
1666 # we ignore the fanqie parameter "phienthiet"
1667 ):
1668 arg = node.template_parameters.get(parameter)
1669 if arg is not None: 1669 ↛ 1663line 1669 didn't jump to line 1663 because the condition on line 1669 was always true
1670 text = clean_node(wxr, None, arg)
1671 for w in text.split(","):
1672 # ignore - separated references
1673 if "-" in w:
1674 w = w[: w.index("-")]
1675 w = w.strip()
1676 related_readings.append(
1677 LinkageData(word=w, tags=[tag])
1678 )
1679 continue
1681 # Skip the vi-reading template for the rest of the head parsing
1682 new_header_nodes.append(node)
1683 if len(related_readings) > 0: 1683 ↛ 1687line 1683 didn't jump to line 1687 because the condition on line 1683 was always true
1684 data_extend(pos_data, "related", related_readings)
1685 header_nodes = new_header_nodes
1687 header_text = clean_node(
1688 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1689 )
1691 if not header_text.strip():
1692 return
1694 term_label_tags: list[str] = []
1695 term_label_topics: list[str] = []
1696 if len(term_label_templates) > 0:
1697 # parse term label templates; if there are other similar kinds
1698 # of templates in headers that you want to squash and apply as
1699 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1700 for templ_data in term_label_templates:
1701 # print(templ_data)
1702 expan = templ_data.get("expansion", "").strip("().,; ")
1703 if not expan: 1703 ↛ 1704line 1703 didn't jump to line 1704 because the condition on line 1703 was never true
1704 continue
1705 tlb_tagsets, tlb_topics = decode_tags(expan)
1706 for tlb_tags in tlb_tagsets:
1707 if len(tlb_tags) > 0 and not any(
1708 t.startswith("error-") for t in tlb_tags
1709 ):
1710 term_label_tags.extend(tlb_tags)
1711 term_label_topics.extend(tlb_topics)
1712 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1714 header_text = re.sub(r"\s+", " ", header_text)
1715 # print(f"{header_text=}")
1716 parse_word_head(
1717 wxr,
1718 pos_type,
1719 header_text,
1720 pos_data,
1721 is_reconstruction,
1722 header_group,
1723 ruby=ruby,
1724 links=links,
1725 )
1726 if "tags" in pos_data:
1727 # pos_data can get "tags" data from some source; type-checkers
1728 # doesn't like it, so let's ignore it.
1729 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1730 del pos_data["tags"] # type: ignore[typeddict-item]
1731 if len(term_label_tags) > 0:
1732 header_tags.extend(term_label_tags)
1733 if len(term_label_topics) > 0:
1734 header_topics.extend(term_label_topics)
1736 def process_gloss_without_list(
1737 nodes: list[Union[WikiNode, str]],
1738 pos_type: str,
1739 pos_data: WordData,
1740 header_tags: list[str],
1741 header_topics: list[str],
1742 ) -> None:
1743 # gloss text might not inside a list
1744 header_nodes: list[Union[str, WikiNode]] = []
1745 gloss_nodes: list[Union[str, WikiNode]] = []
1746 for node in strip_nodes(nodes):
1747 if isinstance(node, WikiNode):
1748 if isinstance(node, TemplateNode):
1749 if node.template_name in (
1750 "zh-see",
1751 "ja-see",
1752 "ja-see-kango",
1753 ):
1754 continue # soft redirect
1755 elif (
1756 node.template_name == "head"
1757 or node.template_name.startswith(f"{lang_code}-")
1758 ):
1759 header_nodes.append(node)
1760 continue
1761 elif node.kind in LEVEL_KINDS: # following nodes are not gloss 1761 ↛ 1763line 1761 didn't jump to line 1763 because the condition on line 1761 was always true
1762 break
1763 gloss_nodes.append(node)
1765 if len(header_nodes) > 0:
1766 process_gloss_header(
1767 header_nodes,
1768 pos_type,
1769 None,
1770 pos_data,
1771 header_tags,
1772 header_topics,
1773 )
1774 if len(gloss_nodes) > 0:
1775 process_gloss_contents(
1776 gloss_nodes,
1777 pos_type,
1778 {"tags": list(header_tags), "topics": list(header_topics)},
1779 )
1781 def parse_sense_node(
1782 node: Union[str, WikiNode], # never receives str
1783 sense_base: SenseData,
1784 pos: str,
1785 ) -> bool:
1786 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1787 Uses push_sense() to attempt adding data to pos_data in the scope
1788 of parse_language() when it reaches deep in the recursion. push_sense()
1789 returns True if it succeeds, and that is bubbled up the stack; if
1790 a sense was added downstream, the higher levels (whose shared data
1791 was already added by a subsense) do not push_sense(), unless it
1792 has examples that need to be put somewhere.
1793 """
1794 assert isinstance(sense_base, dict) # Added to every sense deeper in
1796 nonlocal sense_ordinal
1797 my_ordinal = sense_ordinal # copies, not a reference
1798 sense_ordinal += 1 # only use for sorting
1800 if not isinstance(node, WikiNode): 1800 ↛ 1802line 1800 didn't jump to line 1802 because the condition on line 1800 was never true
1801 # This doesn't seem to ever happen in practice.
1802 wxr.wtp.debug(
1803 "{}: parse_sense_node called with"
1804 "something that isn't a WikiNode".format(pos),
1805 sortid="page/1287/20230119",
1806 )
1807 return False
1809 if node.kind != NodeKind.LIST_ITEM: 1809 ↛ 1810line 1809 didn't jump to line 1810 because the condition on line 1809 was never true
1810 wxr.wtp.debug(
1811 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1812 )
1813 return False
1815 if node.sarg == ":":
1816 # Skip example entries at the highest level, ones without
1817 # a sense ("...#") above them.
1818 # If node.sarg is exactly and only ":", then it's at
1819 # the highest level; lower levels would have more
1820 # "indentation", like "#:" or "##:"
1821 return False
1823 # If a recursion call succeeds in push_sense(), bubble it up with
1824 # `added`.
1825 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1826 added = False
1828 gloss_template_args: set[str] = set()
1830 # For LISTs and LIST_ITEMS, their argument is something like
1831 # "##" or "##:", and using that we can rudimentally determine
1832 # list 'depth' if need be, and also what kind of list or
1833 # entry it is; # is for normal glosses, : for examples (indent)
1834 # and * is used for quotations on wiktionary.
1835 current_depth = node.sarg
1837 children = node.children
1839 # subentries, (presumably) a list
1840 # of subglosses below this. The list's
1841 # argument ends with #, and its depth should
1842 # be bigger than parent node.
1843 subentries = [
1844 x
1845 for x in children
1846 if isinstance(x, WikiNode)
1847 and x.kind == NodeKind.LIST
1848 and x.sarg == current_depth + "#"
1849 ]
1851 # sublists of examples and quotations. .sarg
1852 # does not end with "#".
1853 others = [
1854 x
1855 for x in children
1856 if isinstance(x, WikiNode)
1857 and x.kind == NodeKind.LIST
1858 and x.sarg != current_depth + "#"
1859 ]
1861 # the actual contents of this particular node.
1862 # can be a gloss (or a template that expands into
1863 # many glosses which we can't easily pre-expand)
1864 # or could be an "outer gloss" with more specific
1865 # subglosses, or could be a qualfier for the subglosses.
1866 contents = [
1867 x
1868 for x in children
1869 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1870 ]
1871 # If this entry has sublists of entries, we should combine
1872 # gloss information from both the "outer" and sublist content.
1873 # Sometimes the outer gloss
1874 # is more non-gloss or tags, sometimes it is a coarse sense
1875 # and the inner glosses are more specific. The outer one
1876 # does not seem to have qualifiers.
1878 # If we have one sublist with one element, treat it
1879 # specially as it may be a Wiktionary error; raise
1880 # that nested element to the same level.
1881 # XXX If need be, this block can be easily removed in
1882 # the current recursive logicand the result is one sense entry
1883 # with both glosses in the glosses list, as you would
1884 # expect. If the higher entry has examples, there will
1885 # be a higher entry with some duplicated data.
1886 if len(subentries) == 1:
1887 slc = subentries[0].children
1888 if len(slc) == 1:
1889 # copy current node and modify it so it doesn't
1890 # loop infinitely.
1891 cropped_node = copy.copy(node)
1892 cropped_node.children = [
1893 x
1894 for x in children
1895 if not (
1896 isinstance(x, WikiNode)
1897 and x.kind == NodeKind.LIST
1898 and x.sarg == current_depth + "#"
1899 )
1900 ]
1901 added |= parse_sense_node(cropped_node, sense_base, pos)
1902 nonlocal sense_data # this kludge causes duplicated raw_
1903 # glosses data if this is not done;
1904 # if the top-level (cropped_node)
1905 # does not push_sense() properly or
1906 # parse_sense_node() returns early,
1907 # sense_data is not reset. This happens
1908 # for example when you have a no-gloss
1909 # string like "(intransitive)":
1910 # no gloss, push_sense() returns early
1911 # and sense_data has duplicate data with
1912 # sense_base
1913 sense_data = {}
1914 added |= parse_sense_node(slc[0], sense_base, pos)
1915 return added
1917 return process_gloss_contents(
1918 contents,
1919 pos,
1920 sense_base,
1921 subentries,
1922 others,
1923 gloss_template_args,
1924 added,
1925 my_ordinal,
1926 )
1928 def process_gloss_contents(
1929 contents: list[Union[str, WikiNode]],
1930 pos: str,
1931 sense_base: SenseData,
1932 subentries: list[WikiNode] = [],
1933 others: list[WikiNode] = [],
1934 gloss_template_args: Set[str] = set(),
1935 added: bool = False,
1936 sorting_ordinal: int | None = None,
1937 ) -> bool:
1938 def sense_template_fn(
1939 name: str, ht: TemplateArgs, is_gloss: bool = False
1940 ) -> Optional[str]:
1941 # print(f"sense_template_fn: {name}, {ht}")
1942 if name in wikipedia_templates:
1943 # parse_wikipedia_template(wxr, pos_data, ht)
1944 return None
1945 if is_panel_template(wxr, name):
1946 return ""
1947 if name in INFO_TEMPLATE_FUNCS:
1948 info_data, info_exp = parse_info_template_arguments(
1949 wxr, name, ht, "sense"
1950 )
1951 if info_data or info_exp: 1951 ↛ 1957line 1951 didn't jump to line 1957 because the condition on line 1951 was always true
1952 if info_data: 1952 ↛ 1954line 1952 didn't jump to line 1954 because the condition on line 1952 was always true
1953 data_append(sense_base, "info_templates", info_data)
1954 if info_exp and isinstance(info_exp, str): 1954 ↛ 1956line 1954 didn't jump to line 1956 because the condition on line 1954 was always true
1955 return info_exp
1956 return ""
1957 if name in ("defdate",):
1958 date = clean_node(wxr, None, ht.get(1, ()))
1959 if part_two := ht.get(2): 1959 ↛ 1961line 1959 didn't jump to line 1961 because the condition on line 1959 was never true
1960 # Unicode mdash, not '-'
1961 date += "–" + clean_node(wxr, None, part_two)
1962 refs: dict[str, ReferenceData] = {}
1963 # ref, refn, ref2, ref2n, ref3, ref3n
1964 # ref1 not valid
1965 for k, v in sorted(
1966 (k, v) for k, v in ht.items() if isinstance(k, str)
1967 ):
1968 if m := re.match(r"ref(\d?)(n?)", k): 1968 ↛ 1965line 1968 didn't jump to line 1965 because the condition on line 1968 was always true
1969 ref_v = clean_node(wxr, None, v)
1970 if m.group(1) not in refs: # empty string or digit
1971 refs[m.group(1)] = ReferenceData()
1972 if m.group(2):
1973 refs[m.group(1)]["refn"] = ref_v
1974 else:
1975 refs[m.group(1)]["text"] = ref_v
1976 data_append(
1977 sense_base,
1978 "attestations",
1979 AttestationData(date=date, references=list(refs.values())),
1980 )
1981 return ""
1982 if name == "senseid":
1983 langid = clean_node(wxr, None, ht.get(1, ()))
1984 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1985 if re.match(r"Q\d+$", arg):
1986 data_append(sense_base, "wikidata", arg)
1987 data_append(sense_base, "senseid", langid + ":" + arg)
1988 if name in sense_linkage_templates:
1989 # print(f"SENSE_TEMPLATE_FN: {name}")
1990 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1991 return ""
1992 if name == "†" or name == "zh-obsolete":
1993 data_append(sense_base, "tags", "obsolete")
1994 return ""
1995 if name in {
1996 "ux",
1997 "uxi",
1998 "usex",
1999 "afex",
2000 "prefixusex",
2001 "ko-usex",
2002 "ko-x",
2003 "hi-x",
2004 "ja-usex-inline",
2005 "ja-x",
2006 "quotei",
2007 "he-x",
2008 "hi-x",
2009 "km-x",
2010 "ne-x",
2011 "shn-x",
2012 "th-x",
2013 "ur-x",
2014 }:
2015 # Usage examples are captured separately below. We don't
2016 # want to expand them into glosses even when unusual coding
2017 # is used in the entry.
2018 # These templates may slip through inside another item, but
2019 # currently we're separating out example entries (..#:)
2020 # well enough that there seems to very little contamination.
2021 if is_gloss:
2022 wxr.wtp.warning(
2023 "Example template is used for gloss text",
2024 sortid="extractor.en.page.sense_template_fn/1415",
2025 )
2026 else:
2027 return ""
2028 if name == "w": 2028 ↛ 2029line 2028 didn't jump to line 2029 because the condition on line 2028 was never true
2029 if ht.get(2) == "Wp":
2030 return ""
2031 for k, v in ht.items():
2032 v = v.strip()
2033 if v and "<" not in v:
2034 gloss_template_args.add(v)
2035 return None
2037 def extract_link_texts(item: GeneralNode) -> None:
2038 """Recursively extracts link texts from the gloss source. This
2039 information is used to select whether to remove final "." from
2040 form_of/alt_of (e.g., ihm/Hunsrik)."""
2041 if isinstance(item, (list, tuple)):
2042 for x in item:
2043 extract_link_texts(x)
2044 return
2045 if isinstance(item, str):
2046 # There seem to be HTML sections that may futher contain
2047 # unparsed links.
2048 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the loop on line 2048 never started
2049 print("ITER:", m.group(0))
2050 v = m.group(1).split("|")[-1].strip()
2051 if v:
2052 gloss_template_args.add(v)
2053 return
2054 if not isinstance(item, WikiNode): 2054 ↛ 2055line 2054 didn't jump to line 2055 because the condition on line 2054 was never true
2055 return
2056 if item.kind == NodeKind.LINK:
2057 v = item.largs[-1]
2058 if ( 2058 ↛ 2064line 2058 didn't jump to line 2064 because the condition on line 2058 was always true
2059 isinstance(v, list)
2060 and len(v) == 1
2061 and isinstance(v[0], str)
2062 ):
2063 gloss_template_args.add(v[0].strip())
2064 for x in item.children:
2065 extract_link_texts(x)
2067 extract_link_texts(contents)
2069 # get the raw text of non-list contents of this node, and other stuff
2070 # like tag and category data added to sense_base
2071 # cast = no-op type-setter for the type-checker
2072 partial_template_fn = cast(
2073 TemplateFnCallable,
2074 partial(sense_template_fn, is_gloss=True),
2075 )
2076 rawgloss = clean_node(
2077 wxr,
2078 sense_base,
2079 contents,
2080 template_fn=partial_template_fn,
2081 collect_links=True,
2082 )
2084 if not rawgloss: 2084 ↛ 2085line 2084 didn't jump to line 2085 because the condition on line 2084 was never true
2085 return False
2087 # remove manually typed ordered list text at the start("1. ")
2088 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2090 # get stuff like synonyms and categories from "others",
2091 # maybe examples and quotations
2092 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2094 # The gloss could contain templates that produce more list items.
2095 # This happens commonly with, e.g., {{inflection of|...}}. Split
2096 # to parts. However, e.g. Interlingua generates multiple glosses
2097 # in HTML directly without Wikitext markup, so we must also split
2098 # by just newlines.
2099 subglosses = rawgloss.splitlines()
2101 if len(subglosses) == 0: 2101 ↛ 2102line 2101 didn't jump to line 2102 because the condition on line 2101 was never true
2102 return False
2104 if any(s.startswith("#") for s in subglosses):
2105 subtree = wxr.wtp.parse(rawgloss)
2106 # from wikitextprocessor.parser import print_tree
2107 # print("SUBTREE GENERATED BY TEMPLATE:")
2108 # print_tree(subtree)
2109 new_subentries = [
2110 x
2111 for x in subtree.children
2112 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2113 ]
2115 new_others = [
2116 x
2117 for x in subtree.children
2118 if isinstance(x, WikiNode)
2119 and x.kind == NodeKind.LIST
2120 and not x.sarg.endswith("#")
2121 ]
2123 new_contents = [
2124 clean_node(wxr, [], x)
2125 for x in subtree.children
2126 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2127 ]
2129 subentries = subentries or new_subentries
2130 others = others or new_others
2131 subglosses = new_contents
2132 rawgloss = "".join(subglosses)
2133 # Generate no gloss for translation hub pages, but add the
2134 # "translation-hub" tag for them
2135 if rawgloss == "(This entry is a translation hub.)": 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true
2136 data_append(sense_data, "tags", "translation-hub")
2137 return push_sense(sorting_ordinal)
2139 # Remove certain substrings specific to outer glosses
2140 strip_ends = [", particularly:"]
2141 for x in strip_ends:
2142 if rawgloss.endswith(x):
2143 rawgloss = rawgloss[: -len(x)].strip()
2144 break
2146 # A single gloss, or possibly an outer gloss.
2147 # Check if the possible outer gloss starts with
2148 # parenthesized tags/topics
2150 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2151 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2152 m = QUALIFIERS_RE.match(rawgloss)
2153 # (...): ... or (...(...)...): ...
2154 if m:
2155 q = m.group(1)
2156 rawgloss = rawgloss[m.end() :].strip()
2157 parse_sense_qualifier(wxr, q, sense_base)
2158 if rawgloss == "A pejorative:": 2158 ↛ 2159line 2158 didn't jump to line 2159 because the condition on line 2158 was never true
2159 data_append(sense_base, "tags", "pejorative")
2160 rawgloss = ""
2161 elif rawgloss == "Short forms.": 2161 ↛ 2162line 2161 didn't jump to line 2162 because the condition on line 2161 was never true
2162 data_append(sense_base, "tags", "abbreviation")
2163 rawgloss = ""
2164 elif rawgloss == "Technical or specialized senses.": 2164 ↛ 2165line 2164 didn't jump to line 2165 because the condition on line 2164 was never true
2165 rawgloss = ""
2166 elif rawgloss.startswith("inflection of "):
2167 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2168 if parsed is not None: 2168 ↛ 2177line 2168 didn't jump to line 2177 because the condition on line 2168 was always true
2169 tags, origins = parsed
2170 if origins is not None: 2170 ↛ 2172line 2170 didn't jump to line 2172 because the condition on line 2170 was always true
2171 data_extend(sense_base, "form_of", origins)
2172 if tags is not None: 2172 ↛ 2175line 2172 didn't jump to line 2175 because the condition on line 2172 was always true
2173 data_extend(sense_base, "tags", tags)
2174 else:
2175 data_append(sense_base, "tags", "form-of")
2176 else:
2177 data_append(sense_base, "tags", "form-of")
2178 if rawgloss: 2178 ↛ 2209line 2178 didn't jump to line 2209 because the condition on line 2178 was always true
2179 # Code duplicating a lot of clean-up operations from later in
2180 # this block. We want to clean up the "supergloss" as much as
2181 # possible, in almost the same way as a normal gloss.
2182 supergloss = rawgloss
2184 if supergloss.startswith("; "): 2184 ↛ 2185line 2184 didn't jump to line 2185 because the condition on line 2184 was never true
2185 supergloss = supergloss[1:].strip()
2187 if supergloss.startswith(("^†", "†")):
2188 data_append(sense_base, "tags", "obsolete")
2189 supergloss = supergloss[2:].strip()
2190 elif supergloss.startswith("^‡"): 2190 ↛ 2191line 2190 didn't jump to line 2191 because the condition on line 2190 was never true
2191 data_extend(sense_base, "tags", ["obsolete", "historical"])
2192 supergloss = supergloss[2:].strip()
2194 # remove [14th century...] style brackets at the end
2195 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2197 if supergloss.startswith((",", ":")):
2198 supergloss = supergloss[1:]
2199 supergloss = supergloss.strip()
2200 if supergloss.startswith("N. of "): 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true
2201 supergloss = "Name of " + supergloss[6:]
2202 supergloss = supergloss[2:]
2203 data_append(sense_base, "glosses", supergloss)
2204 if supergloss in ("A person:",):
2205 data_append(sense_base, "tags", "g-person")
2207 # The main recursive call (except for the exceptions at the
2208 # start of this function).
2209 for sublist in subentries:
2210 if not ( 2210 ↛ 2213line 2210 didn't jump to line 2213 because the condition on line 2210 was never true
2211 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2212 ):
2213 wxr.wtp.debug(
2214 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2215 f"with items that are not LISTs",
2216 sortid="page/1511/20230119",
2217 )
2218 continue
2219 for item in sublist.children:
2220 if not ( 2220 ↛ 2224line 2220 didn't jump to line 2224 because the condition on line 2220 was never true
2221 isinstance(item, WikiNode)
2222 and item.kind == NodeKind.LIST_ITEM
2223 ):
2224 continue
2225 # copy sense_base to prevent cross-contamination between
2226 # subglosses and other subglosses and superglosses
2227 sense_base2 = copy.deepcopy(sense_base)
2228 if parse_sense_node(item, sense_base2, pos): 2228 ↛ 2219line 2228 didn't jump to line 2219 because the condition on line 2228 was always true
2229 added = True
2231 # Capture examples.
2232 # This is called after the recursive calls above so that
2233 # sense_base is not contaminated with meta-data from
2234 # example entries for *this* gloss.
2235 examples = []
2236 if wxr.config.capture_examples: 2236 ↛ 2240line 2236 didn't jump to line 2240 because the condition on line 2236 was always true
2237 examples = extract_examples(others, sense_base)
2239 # push_sense() succeeded somewhere down-river, so skip this level
2240 if added:
2241 if examples:
2242 # this higher-up gloss has examples that we do not want to skip
2243 wxr.wtp.debug(
2244 "'{}[...]' gloss has examples we want to keep, "
2245 "but there are subglosses.".format(repr(rawgloss[:30])),
2246 sortid="page/1498/20230118",
2247 )
2248 else:
2249 return True
2251 # Some entries, e.g., "iacebam", have weird sentences in quotes
2252 # after the gloss, but these sentences don't seem to be intended
2253 # as glosses. Skip them.
2254 indexed_subglosses = list(
2255 (i, gl)
2256 for i, gl in enumerate(subglosses)
2257 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2258 )
2260 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2260 ↛ 2261line 2260 didn't jump to line 2261 because the condition on line 2260 was never true
2261 gl = indexed_subglosses[0][1].strip()
2262 if gl.endswith(":"):
2263 gl = gl[:-1].strip()
2264 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2265 if parsed is not None:
2266 infl_tags, infl_dts = parsed
2267 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2268 # Interpret others as a particular form under
2269 # "inflection of"
2270 data_extend(sense_base, "tags", infl_tags)
2271 data_extend(sense_base, "form_of", infl_dts)
2272 indexed_subglosses = indexed_subglosses[1:]
2273 elif not infl_dts:
2274 data_extend(sense_base, "tags", infl_tags)
2275 indexed_subglosses = indexed_subglosses[1:]
2277 # Create senses for remaining subglosses
2278 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2279 gloss = gloss.strip()
2280 if not gloss and len(indexed_subglosses) > 1: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true
2281 continue
2282 # Push a new sense (if the last one is not empty)
2283 if push_sense(sorting_ordinal): 2283 ↛ 2284line 2283 didn't jump to line 2284 because the condition on line 2283 was never true
2284 added = True
2285 # if gloss not in sense_data.get("raw_glosses", ()):
2286 # data_append(sense_data, "raw_glosses", gloss)
2287 if i == 0 and examples:
2288 # In a multi-line gloss, associate examples
2289 # with only one of them.
2290 # XXX or you could use gloss_i == len(indexed_subglosses)
2291 # to associate examples with the *last* one.
2292 data_extend(sense_data, "examples", examples)
2293 if gloss.startswith("; ") and gloss_i > 0: 2293 ↛ 2294line 2293 didn't jump to line 2294 because the condition on line 2293 was never true
2294 gloss = gloss[1:].strip()
2295 # If the gloss starts with †, mark as obsolete
2296 if gloss.startswith("^†"): 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true
2297 data_append(sense_data, "tags", "obsolete")
2298 gloss = gloss[2:].strip()
2299 elif gloss.startswith("^‡"): 2299 ↛ 2300line 2299 didn't jump to line 2300 because the condition on line 2299 was never true
2300 data_extend(sense_data, "tags", ["obsolete", "historical"])
2301 gloss = gloss[2:].strip()
2302 # Copy data for all senses to this sense
2303 for k, v in sense_base.items():
2304 if isinstance(v, (list, tuple)):
2305 if k != "tags":
2306 # Tags handled below (countable/uncountable special)
2307 data_extend(sense_data, k, v)
2308 else:
2309 assert k not in ("tags", "categories", "topics")
2310 sense_data[k] = v # type:ignore[literal-required]
2311 # Parse the gloss for this particular sense
2312 m = QUALIFIERS_RE.match(gloss)
2313 # (...): ... or (...(...)...): ...
2314 if m:
2315 parse_sense_qualifier(wxr, m.group(1), sense_data)
2316 gloss = gloss[m.end() :].strip()
2318 # Remove common suffix "[from 14th c.]" and similar
2319 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2321 # Check to make sure we don't have unhandled list items in gloss
2322 ofs = max(gloss.find("#"), gloss.find("* "))
2323 if ofs > 10 and "(#)" not in gloss:
2324 wxr.wtp.debug(
2325 "gloss may contain unhandled list items: {}".format(gloss),
2326 sortid="page/1412",
2327 )
2328 elif "\n" in gloss: 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true
2329 wxr.wtp.debug(
2330 "gloss contains newline: {}".format(gloss),
2331 sortid="page/1416",
2332 )
2334 # Kludge, some glosses have a comma after initial qualifiers in
2335 # parentheses
2336 if gloss.startswith((",", ":")):
2337 gloss = gloss[1:]
2338 gloss = gloss.strip()
2339 if gloss.endswith(":"):
2340 gloss = gloss[:-1].strip()
2341 if gloss.startswith("N. of "): 2341 ↛ 2342line 2341 didn't jump to line 2342 because the condition on line 2341 was never true
2342 gloss = "Name of " + gloss[6:]
2343 if gloss.startswith("†"): 2343 ↛ 2344line 2343 didn't jump to line 2344 because the condition on line 2343 was never true
2344 data_append(sense_data, "tags", "obsolete")
2345 gloss = gloss[1:]
2346 elif gloss.startswith("^†"): 2346 ↛ 2347line 2346 didn't jump to line 2347 because the condition on line 2346 was never true
2347 data_append(sense_data, "tags", "obsolete")
2348 gloss = gloss[2:]
2350 # Copy tags from sense_base if any. This will not copy
2351 # countable/uncountable if either was specified in the sense,
2352 # as sometimes both are specified in word head but only one
2353 # in individual senses.
2354 countability_tags = []
2355 base_tags = sense_base.get("tags", ())
2356 sense_tags = sense_data.get("tags", ())
2357 for tag in base_tags:
2358 if tag in ("countable", "uncountable"):
2359 if tag not in countability_tags: 2359 ↛ 2361line 2359 didn't jump to line 2361 because the condition on line 2359 was always true
2360 countability_tags.append(tag)
2361 continue
2362 if tag not in sense_tags:
2363 data_append(sense_data, "tags", tag)
2364 if countability_tags:
2365 if ( 2365 ↛ 2374line 2365 didn't jump to line 2374 because the condition on line 2365 was always true
2366 "countable" not in sense_tags
2367 and "uncountable" not in sense_tags
2368 ):
2369 data_extend(sense_data, "tags", countability_tags)
2371 # If outer gloss specifies a form-of ("inflection of", see
2372 # aquamarine/German), try to parse the inner glosses as
2373 # tags for an inflected form.
2374 if "form-of" in sense_base.get("tags", ()):
2375 parsed = parse_alt_or_inflection_of(
2376 wxr, gloss, gloss_template_args
2377 )
2378 if parsed is not None: 2378 ↛ 2384line 2378 didn't jump to line 2384 because the condition on line 2378 was always true
2379 infl_tags, infl_dts = parsed
2380 if not infl_dts and infl_tags: 2380 ↛ 2384line 2380 didn't jump to line 2384 because the condition on line 2380 was always true
2381 # Interpret as a particular form under "inflection of"
2382 data_extend(sense_data, "tags", infl_tags)
2384 if not gloss: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true
2385 data_append(sense_data, "tags", "empty-gloss")
2386 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2387 if ( 2387 ↛ 2398line 2387 didn't jump to line 2398 because the condition on line 2387 was always true
2388 gloss_i == 0
2389 and len(sense_data.get("glosses", tuple())) >= 1
2390 ):
2391 # If we added a "high-level gloss" from rawgloss, but this
2392 # is that same gloss_i, add this instead of the raw_gloss
2393 # from before if they're different: the rawgloss was not
2394 # cleaned exactly the same as this later gloss
2395 sense_data["glosses"][-1] = gloss
2396 else:
2397 # Add the gloss for the sense.
2398 data_append(sense_data, "glosses", gloss)
2400 # Kludge: there are cases (e.g., etc./Swedish) where there are
2401 # two abbreviations in the same sense, both generated by the
2402 # {{abbreviation of|...}} template. Handle these with some magic.
2403 position = 0
2404 split_glosses = []
2405 for m in re.finditer(r"Abbreviation of ", gloss):
2406 if m.start() != position: 2406 ↛ 2405line 2406 didn't jump to line 2405 because the condition on line 2406 was always true
2407 split_glosses.append(gloss[position : m.start()])
2408 position = m.start()
2409 split_glosses.append(gloss[position:])
2410 for gloss in split_glosses:
2411 # Check if this gloss describes an alt-of or inflection-of
2412 if (
2413 lang_code != "en"
2414 and " " not in gloss
2415 and distw([word], gloss) < 0.3
2416 ):
2417 # Don't try to parse gloss if it is one word
2418 # that is close to the word itself for non-English words
2419 # (probable translations of a tag/form name)
2420 continue
2421 parsed = parse_alt_or_inflection_of(
2422 wxr, gloss, gloss_template_args
2423 )
2424 if parsed is None:
2425 continue
2426 tags, dts = parsed
2427 if not dts and tags:
2428 data_extend(sense_data, "tags", tags)
2429 continue
2430 for dt in dts: # type:ignore[union-attr]
2431 ftags = list(tag for tag in tags if tag != "form-of")
2432 if "alt-of" in tags:
2433 data_extend(sense_data, "tags", ftags)
2434 data_append(sense_data, "alt_of", dt)
2435 elif "compound-of" in tags: 2435 ↛ 2436line 2435 didn't jump to line 2436 because the condition on line 2435 was never true
2436 data_extend(sense_data, "tags", ftags)
2437 data_append(sense_data, "compound_of", dt)
2438 elif "synonym-of" in tags: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true
2439 data_extend(dt, "tags", ftags)
2440 data_append(sense_data, "synonyms", dt)
2441 elif tags and dt.get("word", "").startswith("of "): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 dt["word"] = dt["word"][3:]
2443 data_append(sense_data, "tags", "form-of")
2444 data_extend(sense_data, "tags", ftags)
2445 data_append(sense_data, "form_of", dt)
2446 elif "form-of" in tags: 2446 ↛ 2430line 2446 didn't jump to line 2430 because the condition on line 2446 was always true
2447 data_extend(sense_data, "tags", tags)
2448 data_append(sense_data, "form_of", dt)
2450 if len(sense_data) == 0:
2451 if len(sense_base.get("tags", [])) == 0: 2451 ↛ 2453line 2451 didn't jump to line 2453 because the condition on line 2451 was always true
2452 del sense_base["tags"]
2453 sense_data.update(sense_base)
2454 if push_sense(sorting_ordinal): 2454 ↛ 2458line 2454 didn't jump to line 2458 because the condition on line 2454 was always true
2455 # push_sense succeded in adding a sense to pos_data
2456 added = True
2457 # print("PARSE_SENSE DONE:", pos_datas[-1])
2458 return added
2460 def parse_inflection(
2461 node: WikiNode, section: str, pos: Optional[str]
2462 ) -> None:
2463 """Parses inflection data (declension, conjugation) from the given
2464 page. This retrieves the actual inflection template
2465 parameters, which are very useful for applications that need
2466 to learn the inflection classes and generate inflected
2467 forms."""
2468 assert isinstance(node, WikiNode)
2469 assert isinstance(section, str)
2470 assert pos is None or isinstance(pos, str)
2471 # print("parse_inflection:", node)
2473 if pos is None: 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true
2474 wxr.wtp.debug(
2475 "inflection table outside part-of-speech", sortid="page/1812"
2476 )
2477 return
2479 def inflection_template_fn(
2480 name: str, ht: TemplateArgs
2481 ) -> Optional[str]:
2482 # print("decl_conj_template_fn", name, ht)
2483 if is_panel_template(wxr, name): 2483 ↛ 2484line 2483 didn't jump to line 2484 because the condition on line 2483 was never true
2484 return ""
2485 if name in ("is-u-mutation",): 2485 ↛ 2488line 2485 didn't jump to line 2488 because the condition on line 2485 was never true
2486 # These are not to be captured as an exception to the
2487 # generic code below
2488 return None
2489 m = re.search(
2490 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2491 r"declension|inflection|mut|mutation)($|-)",
2492 name,
2493 )
2494 if m:
2495 args_ht = clean_template_args(wxr, ht)
2496 dt = {"name": name, "args": args_ht}
2497 data_append(pos_data, "inflection_templates", dt)
2499 return None
2501 # Convert the subtree back to Wikitext, then expand all and parse,
2502 # capturing templates in the process
2503 text = wxr.wtp.node_to_wikitext(node.children)
2505 # Split text into separate sections for each to-level template
2506 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2507 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2508 # The (?:...) creates a non-capturing regex group; if it was capturing,
2509 # like the group around it, it would create elements in brace_matches,
2510 # including None if it doesn't match.
2511 # 20250114: Added {| and |} into the regex because tables were being
2512 # cut into pieces by this code. Issue #973, introduction of two-part
2513 # book-end templates similar to trans-top and tran-bottom.
2514 template_sections = []
2515 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2516 # Because there is the possibility of triple curly braces
2517 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2518 # count nesting depth using pairs of two brackets, but
2519 # instead use singular braces ("{ }").
2520 # Because template delimiters should be balanced, regardless
2521 # of whether {{ or {{{ is used, and because we only care
2522 # about the outer-most delimiters (the highest level template)
2523 # we can just count the single braces when those single
2524 # braces are part of a group.
2525 table_nesting = 0
2526 # However, if we have a stray table ({| ... |}) that should always
2527 # be its own section, and should prevent templates from cutting it
2528 # into sections.
2530 # print(f"Parse inflection: {text=}")
2531 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2532 if len(brace_matches) > 1:
2533 tsection: list[str] = []
2534 after_templates = False # kludge to keep any text
2535 # before first template
2536 # with the first template;
2537 # otherwise, text
2538 # goes with preceding template
2539 for m in brace_matches:
2540 if m.startswith("\n; ") and after_templates: 2540 ↛ 2541line 2540 didn't jump to line 2541 because the condition on line 2540 was never true
2541 after_templates = False
2542 template_sections.append(tsection)
2543 tsection = []
2544 tsection.append(m)
2545 elif m.startswith("{{") or m.endswith("{|"):
2546 if (
2547 template_nesting == 0
2548 and after_templates
2549 and table_nesting == 0
2550 ):
2551 template_sections.append(tsection)
2552 tsection = []
2553 # start new section
2554 after_templates = True
2555 if m.startswith("{{"):
2556 template_nesting += 1
2557 else:
2558 # m.endswith("{|")
2559 table_nesting += 1
2560 tsection.append(m)
2561 elif m.startswith("}}") or m.endswith("|}"):
2562 if m.startswith("}}"):
2563 template_nesting -= 1
2564 if template_nesting < 0: 2564 ↛ 2565line 2564 didn't jump to line 2565 because the condition on line 2564 was never true
2565 wxr.wtp.error(
2566 "Negatively nested braces, "
2567 "couldn't split inflection templates, "
2568 "{}/{} section {}".format(
2569 word, language, section
2570 ),
2571 sortid="page/1871",
2572 )
2573 template_sections = [] # use whole text
2574 break
2575 else:
2576 table_nesting -= 1
2577 if table_nesting < 0: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true
2578 wxr.wtp.error(
2579 "Negatively nested table braces, "
2580 "couldn't split inflection section, "
2581 "{}/{} section {}".format(
2582 word, language, section
2583 ),
2584 sortid="page/20250114",
2585 )
2586 template_sections = [] # use whole text
2587 break
2588 tsection.append(m)
2589 else:
2590 tsection.append(m)
2591 if tsection: # dangling tsection 2591 ↛ 2599line 2591 didn't jump to line 2599 because the condition on line 2591 was always true
2592 template_sections.append(tsection)
2593 # Why do it this way around? The parser has a preference
2594 # to associate bits outside of tables with the preceding
2595 # table (`after`-variable), so a new tsection begins
2596 # at {{ and everything before it belongs to the previous
2597 # template.
2599 texts = []
2600 if not template_sections:
2601 texts = [text]
2602 else:
2603 for tsection in template_sections:
2604 texts.append("".join(tsection))
2605 if template_nesting != 0: 2605 ↛ 2606line 2605 didn't jump to line 2606 because the condition on line 2605 was never true
2606 wxr.wtp.error(
2607 "Template nesting error: "
2608 "template_nesting = {} "
2609 "couldn't split inflection templates, "
2610 "{}/{} section {}".format(
2611 template_nesting, word, language, section
2612 ),
2613 sortid="page/1896",
2614 )
2615 texts = [text]
2616 for text in texts:
2617 tree = wxr.wtp.parse(
2618 text, expand_all=True, template_fn=inflection_template_fn
2619 )
2621 if not text.strip():
2622 continue
2624 # Parse inflection tables from the section. The data is stored
2625 # under "forms".
2626 if wxr.config.capture_inflections: 2626 ↛ 2616line 2626 didn't jump to line 2616 because the condition on line 2626 was always true
2627 tablecontext = None
2628 m = re.search(r"{{([^}{|]+)\|?", text)
2629 if m:
2630 template_name = m.group(1)
2631 tablecontext = TableContext(template_name)
2633 parse_inflection_section(
2634 wxr,
2635 pos_data,
2636 word,
2637 language,
2638 pos,
2639 section,
2640 tree,
2641 tablecontext=tablecontext,
2642 )
2644 def get_subpage_section(
2645 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2646 ) -> Optional[Union[WikiNode, str]]:
2647 """Loads a subpage of the given page, and finds the section
2648 for the given language, part-of-speech, and section title. This
2649 is used for finding translations and other sections on subpages."""
2650 assert isinstance(language, str)
2651 assert isinstance(title, str)
2652 assert isinstance(subtitle, str)
2653 assert isinstance(seqs, (list, tuple))
2654 for seq in seqs:
2655 for x in seq:
2656 assert isinstance(x, str)
2657 subpage_title = word + "/" + subtitle
2658 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2659 if subpage_content is None:
2660 wxr.wtp.error(
2661 "/translations not found despite "
2662 "{{see translation subpage|...}}",
2663 sortid="page/1934",
2664 )
2665 return None
2667 def recurse(
2668 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2669 ) -> Optional[Union[str, WikiNode]]:
2670 # print(f"seq: {seq}")
2671 if not seq:
2672 return node
2673 if not isinstance(node, WikiNode):
2674 return None
2675 # print(f"node.kind: {node.kind}")
2676 if node.kind in LEVEL_KINDS:
2677 t = clean_node(wxr, None, node.largs[0])
2678 # print(f"t: {t} == seq[0]: {seq[0]}?")
2679 if t.lower() == seq[0].lower():
2680 seq = seq[1:]
2681 if not seq:
2682 return node
2683 for n in node.children:
2684 ret = recurse(n, seq)
2685 if ret is not None:
2686 return ret
2687 return None
2689 tree = wxr.wtp.parse(
2690 subpage_content,
2691 pre_expand=True,
2692 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2693 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2694 )
2695 assert tree.kind == NodeKind.ROOT
2696 for seq in seqs:
2697 ret = recurse(tree, seq)
2698 if ret is None:
2699 wxr.wtp.debug(
2700 "Failed to find subpage section {}/{} seq {}".format(
2701 title, subtitle, seq
2702 ),
2703 sortid="page/1963",
2704 )
2705 return ret
2707 def parse_linkage(
2708 data: WordData, field: str, linkagenode: LevelNode
2709 ) -> None:
2710 assert isinstance(data, dict)
2711 assert isinstance(field, str)
2712 assert isinstance(linkagenode, WikiNode)
2713 # if field == "synonyms":
2714 # print("field", field)
2715 # print("data", data)
2716 # print("children:")
2717 # print(linkagenode.children)
2718 if not wxr.config.capture_linkages: 2718 ↛ 2719line 2718 didn't jump to line 2719 because the condition on line 2718 was never true
2719 return
2720 have_panel_template = False
2721 toplevel_text = []
2722 next_navframe_sense = None # Used for "(sense):" before NavFrame
2724 def parse_linkage_item(
2725 contents: list[Union[str, WikiNode]],
2726 field: str,
2727 sense: Optional[str] = None,
2728 ):
2729 assert isinstance(contents, (list, tuple))
2730 assert isinstance(field, str)
2731 assert sense is None or isinstance(sense, str)
2733 # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
2734 # .format(field, sense, contents))
2736 parts: list[str] = []
2737 ruby: list[tuple[str, str]] = []
2738 urls: list[str] = []
2739 # data about link text; this is used to skip splitting on
2740 # linkage text items that contain stuff like commas; for
2741 # example "Hunde, die bellen, beißen nicht" in article
2742 # beißen is split into "Hunde", "die bellen" etc.
2743 # We take that link text and use it, eventually,
2744 # in split_at_comma_semi to skip splitting on those
2745 # commas.
2746 links_that_should_not_be_split: list[str] = []
2748 def item_recurse(
2749 contents: list[Union[str, WikiNode]], italic=False
2750 ) -> None:
2751 assert isinstance(contents, (list, tuple))
2752 nonlocal sense
2753 nonlocal ruby
2754 nonlocal parts
2755 # print("ITEM_RECURSE:", contents)
2756 for node in contents:
2757 if isinstance(node, str):
2758 parts.append(node)
2759 continue
2760 kind = node.kind
2761 # print("ITEM_RECURSE KIND:", kind,
2762 # node.sarg if node.sarg else node.largs)
2763 if kind == NodeKind.LIST:
2764 if parts: 2764 ↛ 2779line 2764 didn't jump to line 2779 because the condition on line 2764 was always true
2765 sense1: Optional[str]
2766 sense1 = clean_node(wxr, None, parts)
2767 if sense1.endswith(":"):
2768 sense1 = sense1[:-1].strip()
2769 if sense1.startswith("(") and sense1.endswith(")"): 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true
2770 sense1 = sense1[1:-1].strip()
2771 if sense1.lower() == TRANSLATIONS_TITLE: 2771 ↛ 2772line 2771 didn't jump to line 2772 because the condition on line 2771 was never true
2772 sense1 = None
2773 # print("linkage item_recurse LIST sense1:", sense1)
2774 parse_linkage_recurse(
2775 node.children, field, sense=sense1 or sense
2776 )
2777 parts = []
2778 else:
2779 parse_linkage_recurse(node.children, field, sense)
2780 elif kind in ( 2780 ↛ 2785line 2780 didn't jump to line 2785 because the condition on line 2780 was never true
2781 NodeKind.TABLE,
2782 NodeKind.TABLE_ROW,
2783 NodeKind.TABLE_CELL,
2784 ):
2785 parse_linkage_recurse(node.children, field, sense)
2786 elif kind in ( 2786 ↛ 2790line 2786 didn't jump to line 2790 because the condition on line 2786 was never true
2787 NodeKind.TABLE_HEADER_CELL,
2788 NodeKind.TABLE_CAPTION,
2789 ):
2790 continue
2791 elif kind == NodeKind.HTML: 2791 ↛ 2792line 2791 didn't jump to line 2792 because the condition on line 2791 was never true
2792 classes = (node.attrs.get("class") or "").split()
2793 if node.sarg in ("gallery", "ref", "cite", "caption"):
2794 continue
2795 elif node.sarg == "ruby":
2796 rb = parse_ruby(wxr, node)
2797 if rb:
2798 ruby.append(rb)
2799 parts.append(rb[0])
2800 continue
2801 elif node.sarg == "math":
2802 parts.append(clean_node(wxr, None, node))
2803 continue
2804 elif "interProject" in classes:
2805 continue # These do not seem to be displayed
2806 if "NavFrame" in classes:
2807 parse_linkage_recurse(node.children, field, sense)
2808 else:
2809 item_recurse(node.children, italic=italic)
2810 elif kind == NodeKind.ITALIC:
2811 item_recurse(node.children, italic=True)
2812 elif kind == NodeKind.LINK:
2813 ignore = False
2814 if isinstance(node.largs[0][0], str): 2814 ↛ 2756line 2814 didn't jump to line 2756 because the condition on line 2814 was always true
2815 v1 = node.largs[0][0].strip().lower()
2816 if v1.startswith( 2816 ↛ 2820line 2816 didn't jump to line 2820 because the condition on line 2816 was never true
2817 ns_title_prefix_tuple(wxr, "Category", True)
2818 + ns_title_prefix_tuple(wxr, "File", True)
2819 ):
2820 ignore = True
2821 if not ignore: 2821 ↛ 2756line 2821 didn't jump to line 2756 because the condition on line 2821 was always true
2822 v = node.largs[-1]
2823 if (
2824 len(node.largs) == 1
2825 and len(v) > 0
2826 and isinstance(v[0], str)
2827 and v[0][0] == ":"
2828 ):
2829 v = [v[0][1:]] + list(v[1:]) # type:ignore
2830 if isinstance(v[0], str) and not v[0].isalnum():
2831 links_that_should_not_be_split.append(
2832 "".join(v[0])
2833 ) # type: ignore
2834 item_recurse(v, italic=italic)
2835 elif kind == NodeKind.URL:
2836 if len(node.largs) < 2 and node.largs:
2837 # Naked url captured
2838 urls.extend(node.largs[-1]) # type:ignore[arg-type]
2839 continue
2840 if len(node.largs) == 2: 2840 ↛ 2845line 2840 didn't jump to line 2845 because the condition on line 2840 was always true
2841 # Url from link with text
2842 urls.append(node.largs[0][-1]) # type:ignore[arg-type]
2843 # print(f"{node.largs=!r}")
2844 # print("linkage recurse URL {}".format(node))
2845 item_recurse(node.largs[-1], italic=italic)
2846 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 2846 ↛ 2849line 2846 didn't jump to line 2849 because the condition on line 2846 was always true
2847 item_recurse(node.children, italic=italic)
2848 else:
2849 wxr.wtp.debug(
2850 "linkage item_recurse unhandled {}: {}".format(
2851 node.kind, node
2852 ),
2853 sortid="page/2073",
2854 )
2856 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
2857 # .format(contents))
2859 item_recurse(contents)
2860 item = clean_node(wxr, None, parts)
2861 # print("LINKAGE ITEM CONTENTS:", parts)
2862 # print("CLEANED ITEM: {!r}".format(item))
2863 # print(f"URLS {urls=!r}")
2865 return parse_linkage_item_text(
2866 wxr,
2867 word,
2868 data,
2869 field,
2870 item,
2871 sense,
2872 ruby,
2873 sense_datas,
2874 is_reconstruction,
2875 urls or None,
2876 links_that_should_not_be_split or None,
2877 )
2879 def parse_linkage_recurse(
2880 contents: list[Union[WikiNode, str]],
2881 field: str,
2882 sense: Optional[str],
2883 ) -> None:
2884 assert isinstance(contents, (list, tuple))
2885 assert sense is None or isinstance(sense, str)
2886 nonlocal next_navframe_sense
2887 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
2888 for node in contents:
2889 if isinstance(node, str):
2890 # Ignore top-level text, generally comments before the
2891 # linkages list. However, if no linkages are found, then
2892 # use this for linkages (not all words use bullet points
2893 # for linkages).
2894 toplevel_text.append(node)
2895 continue
2896 assert isinstance(node, WikiNode)
2897 kind = node.kind
2898 # print("PARSE_LINKAGE_RECURSE CHILD", kind)
2899 if kind == NodeKind.LIST:
2900 parse_linkage_recurse(node.children, field, sense)
2901 elif kind == NodeKind.LIST_ITEM:
2902 v = parse_linkage_item(node.children, field, sense)
2903 if v: 2903 ↛ 2907line 2903 didn't jump to line 2907 because the condition on line 2903 was never true
2904 # parse_linkage_item() can return a value that should
2905 # be used as the sense for the follow-on linkages,
2906 # which are typically provided in a table (see 滿)
2907 next_navframe_sense = v
2908 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
2909 parse_linkage_recurse(node.children, field, sense)
2910 elif kind == NodeKind.TABLE_CELL:
2911 parse_linkage_item(node.children, field, sense)
2912 elif kind in (
2913 NodeKind.TABLE_CAPTION,
2914 NodeKind.TABLE_HEADER_CELL,
2915 NodeKind.PREFORMATTED,
2916 NodeKind.BOLD,
2917 ):
2918 continue
2919 elif kind == NodeKind.HTML: 2919 ↛ 2921line 2919 didn't jump to line 2921 because the condition on line 2919 was never true
2920 # Recurse to process inside the HTML for most tags
2921 if node.sarg in ("gallery", "ref", "cite", "caption"):
2922 continue
2923 classes = (node.attrs.get("class") or "").split()
2924 if node.sarg == "li":
2925 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
2926 v = parse_linkage_item(node.children, field, sense)
2927 if v:
2928 next_navframe_sense = v
2929 elif "qualifier-content" in classes:
2930 sense1 = clean_node(wxr, None, node.children)
2931 if sense1.endswith(":"):
2932 sense1 = sense1[:-1].strip()
2933 if sense and sense1:
2934 wxr.wtp.debug(
2935 "linkage qualifier-content on multiple "
2936 "levels: {!r} and {!r}".format(sense, sense1),
2937 sortid="page/2170",
2938 )
2939 parse_linkage_recurse(node.children, field, sense1)
2940 elif "NavFrame" in classes:
2941 # NavFrame uses previously assigned next_navframe_sense
2942 # (from a "(sense):" item) and clears it afterwards
2943 parse_linkage_recurse(
2944 node.children, field, sense or next_navframe_sense
2945 )
2946 next_navframe_sense = None
2947 else:
2948 parse_linkage_recurse(node.children, field, sense)
2949 elif kind in LEVEL_KINDS: 2949 ↛ 2951line 2949 didn't jump to line 2951 because the condition on line 2949 was never true
2950 # Just recurse to any possible subsections
2951 parse_linkage_recurse(node.children, field, sense)
2952 elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
2953 # Skip these on top level; at least sometimes bold is
2954 # used for indicating a subtitle
2955 continue
2956 elif kind == NodeKind.LINK: 2956 ↛ 2962line 2956 didn't jump to line 2962 because the condition on line 2956 was always true
2957 # Recurse into the last argument
2958 # Apparently ":/" is used as a link to "/", so strip
2959 # initial value
2960 parse_linkage_recurse(node.largs[-1], field, sense)
2961 else:
2962 wxr.wtp.debug(
2963 "parse_linkage_recurse unhandled {}: {}".format(
2964 kind, node
2965 ),
2966 sortid="page/2196",
2967 )
2969 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
2970 nonlocal have_panel_template
2971 if is_panel_template(wxr, name):
2972 have_panel_template = True
2973 return ""
2974 return None
2976 # Main body of parse_linkage()
2977 l_nodes = []
2978 l_sense = ""
2979 for node in linkagenode.children:
2980 if (
2981 isinstance(node, TemplateNode)
2982 and node.template_name == "zh-dial"
2983 ):
2984 extract_zh_dial_template(wxr, data, node, l_sense)
2985 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
2986 for list_item in node.find_child(NodeKind.LIST_ITEM):
2987 for t_node in list_item.find_child(NodeKind.TEMPLATE):
2988 if t_node.template_name in ["s", "sense"]:
2989 l_sense = clean_node(wxr, None, t_node).strip(
2990 "(): "
2991 )
2992 l_nodes.append(node)
2993 else:
2994 l_nodes.append(node)
2995 text = wxr.wtp.node_to_wikitext(l_nodes)
2996 parsed = wxr.wtp.parse(
2997 text, expand_all=True, template_fn=linkage_template_fn1
2998 )
2999 parse_linkage_recurse(parsed.children, field, None)
3000 if not data.get(field) and not have_panel_template:
3001 text = "".join(toplevel_text).strip()
3002 if "\n" not in text and "," in text and text.count(",") > 3:
3003 if not text.startswith("See "): 3003 ↛ exitline 3003 didn't return from function 'parse_linkage' because the condition on line 3003 was always true
3004 parse_linkage_item([text], field, None)
3006 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
3007 """Parses translations for a word. This may also pull in translations
3008 from separate translation subpages."""
3009 assert isinstance(data, dict)
3010 assert isinstance(xlatnode, WikiNode)
3011 # print("===== PARSE_TRANSLATIONS {} {} {}"
3012 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
3013 # print("parse_translations xlatnode={}".format(xlatnode))
3014 if not wxr.config.capture_translations: 3014 ↛ 3015line 3014 didn't jump to line 3015 because the condition on line 3014 was never true
3015 return
3016 sense_parts: list[Union[WikiNode, str]] = []
3017 sense: Optional[str] = None
3019 def parse_translation_item(
3020 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
3021 ) -> None:
3022 nonlocal sense
3023 assert isinstance(contents, list)
3024 assert lang is None or isinstance(lang, str)
3025 # print("PARSE_TRANSLATION_ITEM:", contents)
3027 langcode: Optional[str] = None
3028 if sense is None:
3029 sense = clean_node(wxr, data, sense_parts).strip()
3030 # print("sense <- clean_node: ", sense)
3031 idx = sense.find("See also translations at")
3032 if idx > 0: 3032 ↛ 3033line 3032 didn't jump to line 3033 because the condition on line 3032 was never true
3033 wxr.wtp.debug(
3034 "Skipping translation see also: {}".format(sense),
3035 sortid="page/2361",
3036 )
3037 sense = sense[:idx].strip()
3038 if sense.endswith(":"): 3038 ↛ 3039line 3038 didn't jump to line 3039 because the condition on line 3038 was never true
3039 sense = sense[:-1].strip()
3040 if sense.endswith("—"): 3040 ↛ 3041line 3040 didn't jump to line 3041 because the condition on line 3040 was never true
3041 sense = sense[:-1].strip()
3042 translations_from_template: list[str] = []
3044 def translation_item_template_fn(
3045 name: str, ht: TemplateArgs
3046 ) -> Optional[str]:
3047 nonlocal langcode
3048 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
3049 if is_panel_template(wxr, name):
3050 return ""
3051 if name in ("t+check", "t-check", "t-needed"):
3052 # We ignore these templates. They seem to have outright
3053 # garbage in some entries, and very varying formatting in
3054 # others. These should be transitory and unreliable
3055 # anyway.
3056 return "__IGNORE__"
3057 if name in ("t", "t+", "t-simple", "tt", "tt+"):
3058 code = ht.get(1)
3059 if code: 3059 ↛ 3069line 3059 didn't jump to line 3069 because the condition on line 3059 was always true
3060 if langcode and code != langcode:
3061 wxr.wtp.debug(
3062 "inconsistent language codes {} vs "
3063 "{} in translation item: {!r} {}".format(
3064 langcode, code, name, ht
3065 ),
3066 sortid="page/2386",
3067 )
3068 langcode = code
3069 tr = ht.get(2)
3070 if tr:
3071 tr = clean_node(wxr, None, [tr])
3072 translations_from_template.append(tr)
3073 return None
3074 if name == "t-egy":
3075 langcode = "egy"
3076 return None
3077 if name == "ttbc":
3078 code = ht.get(1)
3079 if code: 3079 ↛ 3081line 3079 didn't jump to line 3081 because the condition on line 3079 was always true
3080 langcode = code
3081 return None
3082 if name == "trans-see": 3082 ↛ 3083line 3082 didn't jump to line 3083 because the condition on line 3082 was never true
3083 wxr.wtp.error(
3084 "UNIMPLEMENTED trans-see template", sortid="page/2405"
3085 )
3086 return ""
3087 if name.endswith("-top"): 3087 ↛ 3088line 3087 didn't jump to line 3088 because the condition on line 3087 was never true
3088 return ""
3089 if name.endswith("-bottom"): 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true
3090 return ""
3091 if name.endswith("-mid"): 3091 ↛ 3092line 3091 didn't jump to line 3092 because the condition on line 3091 was never true
3092 return ""
3093 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
3094 # .format(name),
3095 # sortid="page/2414")
3096 return None
3098 sublists = list(
3099 x
3100 for x in contents
3101 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
3102 )
3103 contents = list(
3104 x
3105 for x in contents
3106 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
3107 )
3109 item = clean_node(
3110 wxr, data, contents, template_fn=translation_item_template_fn
3111 )
3112 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
3114 # Parse the translation item.
3115 if item: 3115 ↛ exitline 3115 didn't return from function 'parse_translation_item' because the condition on line 3115 was always true
3116 lang = parse_translation_item_text(
3117 wxr,
3118 word,
3119 data,
3120 item,
3121 sense,
3122 lang,
3123 langcode,
3124 translations_from_template,
3125 is_reconstruction,
3126 )
3128 # Handle sublists. They are frequently used for different
3129 # scripts for the language and different variants of the
3130 # language. We will include the lower-level header as a
3131 # tag in those cases.
3132 for listnode in sublists:
3133 assert listnode.kind == NodeKind.LIST
3134 for node in listnode.children:
3135 if not isinstance(node, WikiNode): 3135 ↛ 3136line 3135 didn't jump to line 3136 because the condition on line 3135 was never true
3136 continue
3137 if node.kind == NodeKind.LIST_ITEM: 3137 ↛ 3134line 3137 didn't jump to line 3134 because the condition on line 3137 was always true
3138 parse_translation_item(node.children, lang=lang)
3140 def parse_translation_template(node: WikiNode) -> None:
3141 assert isinstance(node, WikiNode)
3143 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3144 nonlocal sense_parts
3145 nonlocal sense
3146 if is_panel_template(wxr, name):
3147 return ""
3148 if name == "see also":
3149 # XXX capture
3150 # XXX for example, "/" has top-level list containing
3151 # see also items. So also should parse those.
3152 return ""
3153 if name == "trans-see":
3154 # XXX capture
3155 return ""
3156 if name == "see translation subpage": 3156 ↛ 3157line 3156 didn't jump to line 3157 because the condition on line 3156 was never true
3157 sense_parts = []
3158 sense = None
3159 sub = ht.get(1, "")
3160 if sub:
3161 m = re.match(
3162 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
3163 )
3164 else:
3165 m = None
3166 etym = ""
3167 etym_numbered = ""
3168 pos = ""
3169 if m:
3170 etym_numbered = m.group(1)
3171 etym = m.group(2)
3172 pos = m.group(3)
3173 if not sub:
3174 wxr.wtp.debug(
3175 "no part-of-speech in "
3176 "{{see translation subpage|...}}, "
3177 "defaulting to just wxr.wtp.section "
3178 "(= language)",
3179 sortid="page/2468",
3180 )
3181 # seq sent to get_subpage_section without sub and pos
3182 seq = [
3183 language,
3184 TRANSLATIONS_TITLE,
3185 ]
3186 elif (
3187 m
3188 and etym.lower().strip() in ETYMOLOGY_TITLES
3189 and pos.lower() in POS_TITLES
3190 ):
3191 seq = [
3192 language,
3193 etym_numbered,
3194 pos,
3195 TRANSLATIONS_TITLE,
3196 ]
3197 elif sub.lower() in POS_TITLES:
3198 # seq with sub but not pos
3199 seq = [
3200 language,
3201 sub,
3202 TRANSLATIONS_TITLE,
3203 ]
3204 else:
3205 # seq with sub and pos
3206 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3207 if pos.lower() not in POS_TITLES:
3208 wxr.wtp.debug(
3209 "unhandled see translation subpage: "
3210 "language={} sub={} "
3211 "wxr.wtp.subsection={}".format(
3212 language, sub, wxr.wtp.subsection
3213 ),
3214 sortid="page/2478",
3215 )
3216 seq = [language, sub, pos, TRANSLATIONS_TITLE]
3217 subnode = get_subpage_section(
3218 wxr.wtp.title or "MISSING_TITLE",
3219 TRANSLATIONS_TITLE,
3220 [seq],
3221 )
3222 if subnode is None or not isinstance(subnode, WikiNode):
3223 # Failed to find the normal subpage section
3224 # seq with sub and pos
3225 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3226 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
3227 seqs: list[list[str] | tuple[str, ...]] = [
3228 [TRANSLATIONS_TITLE],
3229 [language, pos],
3230 ]
3231 subnode = get_subpage_section(
3232 wxr.wtp.title or "MISSING_TITLE",
3233 TRANSLATIONS_TITLE,
3234 seqs,
3235 )
3236 if subnode is not None and isinstance(subnode, WikiNode):
3237 parse_translations(data, subnode)
3238 return ""
3239 if name in (
3240 "c",
3241 "C",
3242 "categorize",
3243 "cat",
3244 "catlangname",
3245 "topics",
3246 "top",
3247 "qualifier",
3248 "cln",
3249 ):
3250 # These are expanded in the default way
3251 return None
3252 if name in (
3253 "trans-top",
3254 "trans-top-see",
3255 ):
3256 # XXX capture id from trans-top? Capture sense here
3257 # instead of trying to parse it from expanded content?
3258 if ht.get(1):
3259 sense_parts = []
3260 sense = ht.get(1)
3261 else:
3262 sense_parts = []
3263 sense = None
3264 return None
3265 if name in (
3266 "trans-bottom",
3267 "trans-mid",
3268 "checktrans-mid",
3269 "checktrans-bottom",
3270 ):
3271 return None
3272 if name == "checktrans-top":
3273 sense_parts = []
3274 sense = None
3275 return ""
3276 if name == "trans-top-also":
3277 # XXX capture?
3278 sense_parts = []
3279 sense = None
3280 return ""
3281 wxr.wtp.error(
3282 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3283 name, ht
3284 ),
3285 sortid="page/2517",
3286 )
3287 return ""
3289 wxr.wtp.expand(
3290 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3291 )
3293 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3294 nonlocal sense
3295 nonlocal sense_parts
3296 for node in xlatnode.children:
3297 # print(node)
3298 if isinstance(node, str):
3299 if sense:
3300 if not node.isspace():
3301 wxr.wtp.debug(
3302 "skipping string in the middle of "
3303 "translations: {}".format(node),
3304 sortid="page/2530",
3305 )
3306 continue
3307 # Add a part to the sense
3308 sense_parts.append(node)
3309 sense = None
3310 continue
3311 assert isinstance(node, WikiNode)
3312 kind = node.kind
3313 if kind == NodeKind.LIST:
3314 for item in node.children:
3315 if not isinstance(item, WikiNode): 3315 ↛ 3316line 3315 didn't jump to line 3316 because the condition on line 3315 was never true
3316 continue
3317 if item.kind != NodeKind.LIST_ITEM: 3317 ↛ 3318line 3317 didn't jump to line 3318 because the condition on line 3317 was never true
3318 continue
3319 if item.sarg == ":": 3319 ↛ 3320line 3319 didn't jump to line 3320 because the condition on line 3319 was never true
3320 continue
3321 parse_translation_item(item.children)
3322 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3322 ↛ 3326line 3322 didn't jump to line 3326 because the condition on line 3322 was never true
3323 # Silently skip list items that are just indented; these
3324 # are used for text between translations, such as indicating
3325 # translations that need to be checked.
3326 pass
3327 elif kind == NodeKind.TEMPLATE:
3328 parse_translation_template(node)
3329 elif kind in ( 3329 ↛ 3334line 3329 didn't jump to line 3334 because the condition on line 3329 was never true
3330 NodeKind.TABLE,
3331 NodeKind.TABLE_ROW,
3332 NodeKind.TABLE_CELL,
3333 ):
3334 parse_translation_recurse(node)
3335 elif kind == NodeKind.HTML:
3336 if node.attrs.get("class") == "NavFrame": 3336 ↛ 3342line 3336 didn't jump to line 3342 because the condition on line 3336 was never true
3337 # Reset ``sense_parts`` (and force recomputing
3338 # by clearing ``sense``) as each NavFrame specifies
3339 # its own sense. This helps eliminate garbage coming
3340 # from text at the beginning at the translations
3341 # section.
3342 sense_parts = []
3343 sense = None
3344 # for item in node.children:
3345 # if not isinstance(item, WikiNode):
3346 # continue
3347 # parse_translation_recurse(item)
3348 parse_translation_recurse(node)
3349 elif kind in LEVEL_KINDS: 3349 ↛ 3351line 3349 didn't jump to line 3351 because the condition on line 3349 was never true
3350 # Sub-levels will be recursed elsewhere
3351 pass
3352 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3353 parse_translation_recurse(node)
3354 elif kind == NodeKind.PREFORMATTED: 3354 ↛ 3355line 3354 didn't jump to line 3355 because the condition on line 3354 was never true
3355 print("parse_translation_recurse: PREFORMATTED:", node)
3356 elif kind == NodeKind.LINK: 3356 ↛ 3410line 3356 didn't jump to line 3410 because the condition on line 3356 was always true
3357 arg0 = node.largs[0]
3358 # Kludge: I've seen occasional normal links to translation
3359 # subpages from main pages (e.g., language/English/Noun
3360 # in July 2021) instead of the normal
3361 # {{see translation subpage|...}} template. This should
3362 # handle them. Note: must be careful not to read other
3363 # links, particularly things like in "human being":
3364 # "a human being -- see [[man/translations]]" (group title)
3365 if ( 3365 ↛ 3373line 3365 didn't jump to line 3373 because the condition on line 3365 was never true
3366 isinstance(arg0, (list, tuple))
3367 and arg0
3368 and isinstance(arg0[0], str)
3369 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3370 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3371 == wxr.wtp.title
3372 ):
3373 wxr.wtp.debug(
3374 "translations subpage link found on main "
3375 "page instead "
3376 "of normal {{see translation subpage|...}}",
3377 sortid="page/2595",
3378 )
3379 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3380 if sub.lower() in POS_TITLES:
3381 seq = [
3382 language,
3383 sub,
3384 TRANSLATIONS_TITLE,
3385 ]
3386 subnode = get_subpage_section(
3387 wxr.wtp.title,
3388 TRANSLATIONS_TITLE,
3389 [seq],
3390 )
3391 if subnode is not None and isinstance(
3392 subnode, WikiNode
3393 ):
3394 parse_translations(data, subnode)
3395 else:
3396 wxr.wtp.error(
3397 "/translations link outside part-of-speech"
3398 )
3400 if (
3401 len(arg0) >= 1
3402 and isinstance(arg0[0], str)
3403 and not arg0[0].lower().startswith("category:")
3404 ):
3405 for x in node.largs[-1]:
3406 if isinstance(x, str): 3406 ↛ 3409line 3406 didn't jump to line 3409 because the condition on line 3406 was always true
3407 sense_parts.append(x)
3408 else:
3409 parse_translation_recurse(x)
3410 elif not sense:
3411 sense_parts.append(node)
3412 else:
3413 wxr.wtp.debug(
3414 "skipping text between translation items/senses: "
3415 "{}".format(node),
3416 sortid="page/2621",
3417 )
3419 # Main code of parse_translation(). We want ``sense`` to be assigned
3420 # regardless of recursion levels, and thus the code is structured
3421 # to define at this level and recurse in parse_translation_recurse().
3422 parse_translation_recurse(xlatnode)
3424 def parse_etymology(data: WordData, node: WikiNode) -> None:
3425 """Parses an etymology section."""
3426 assert isinstance(data, dict)
3427 assert isinstance(node, WikiNode)
3429 templates: list[TemplateData] = []
3431 # Counter for preventing the capture of etymology templates
3432 # when we are inside templates that we want to ignore (i.e.,
3433 # not capture).
3434 ignore_count = 0
3436 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3437 nonlocal ignore_count
3438 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3439 return ""
3440 if re.match(ignored_etymology_templates_re, name):
3441 ignore_count += 1
3442 return None
3444 # CONTINUE_HERE
3446 def etym_post_template_fn(
3447 name: str, ht: TemplateArgs, expansion: str
3448 ) -> None:
3449 nonlocal ignore_count
3450 if name in wikipedia_templates:
3451 parse_wikipedia_template(wxr, data, ht)
3452 return None
3453 if re.match(ignored_etymology_templates_re, name):
3454 ignore_count -= 1
3455 return None
3456 if ignore_count == 0: 3456 ↛ 3462line 3456 didn't jump to line 3462 because the condition on line 3456 was always true
3457 ht = clean_template_args(wxr, ht)
3458 expansion = clean_node(wxr, None, expansion)
3459 templates.append(
3460 {"name": name, "args": ht, "expansion": expansion}
3461 )
3462 return None
3464 # Remove any subsections
3465 contents = list(
3466 x
3467 for x in node.children
3468 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3469 )
3470 # Convert to text, also capturing templates using post_template_fn
3471 text = clean_node(
3472 wxr,
3473 None,
3474 contents,
3475 template_fn=etym_template_fn,
3476 post_template_fn=etym_post_template_fn,
3477 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3478 # Save the collected information.
3479 if len(text) > 0:
3480 data["etymology_text"] = text
3481 if len(templates) > 0:
3482 # Some etymology templates, like Template:root do not generate
3483 # text, so they should be added here. Elsewhere, we check
3484 # for Template:root and add some text to the expansion to please
3485 # the validation.
3486 data["etymology_templates"] = templates
3488 for child_node in node.find_child_recursively( 3488 ↛ exitline 3488 didn't return from function 'parse_etymology' because the loop on line 3488 didn't complete
3489 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3490 ):
3491 if child_node.kind in LEVEL_KIND_FLAGS:
3492 break
3493 elif isinstance( 3493 ↛ 3496line 3493 didn't jump to line 3496 because the condition on line 3493 was never true
3494 child_node, TemplateNode
3495 ) and child_node.template_name in ["zh-x", "zh-q"]:
3496 if "etymology_examples" not in data:
3497 data["etymology_examples"] = []
3498 data["etymology_examples"].extend(
3499 extract_template_zh_x(
3500 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3501 )
3502 )
3504 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3505 """This recurses into a subtree in the parse tree for a page."""
3506 nonlocal etym_data
3507 nonlocal pos_data
3508 nonlocal inside_level_four
3510 redirect_list: list[str] = [] # for `zh-see` template
3512 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3513 """This is called for otherwise unprocessed parts of the page.
3514 We still expand them so that e.g. Category links get captured."""
3515 if name in wikipedia_templates:
3516 data = select_data()
3517 parse_wikipedia_template(wxr, data, ht)
3518 return None
3519 if is_panel_template(wxr, name):
3520 return ""
3521 return None
3523 for node in treenode.children:
3524 if not isinstance(node, WikiNode):
3525 # print(" X{}".format(repr(node)[:40]))
3526 continue
3527 if isinstance(node, TemplateNode):
3528 if process_soft_redirect_template(wxr, node, redirect_list):
3529 continue
3530 elif node.template_name == "zh-forms":
3531 extract_zh_forms_template(wxr, node, select_data())
3533 if node.kind not in LEVEL_KINDS:
3534 # XXX handle e.g. wikipedia links at the top of a language
3535 # XXX should at least capture "also" at top of page
3536 if node.kind in (
3537 NodeKind.HLINE,
3538 NodeKind.LIST,
3539 NodeKind.LIST_ITEM,
3540 ):
3541 continue
3542 # print(" UNEXPECTED: {}".format(node))
3543 # Clean the node to collect category links
3544 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3545 continue
3546 t = clean_node(
3547 wxr, etym_data, node.sarg if node.sarg else node.largs
3548 )
3549 t = t.lower()
3550 # XXX these counts were never implemented fully, and even this
3551 # gets discarded: Search STATISTICS_IMPLEMENTATION
3552 wxr.config.section_counts[t] += 1
3553 # print("PROCESS_CHILDREN: T:", repr(t))
3554 if t in IGNORED_TITLES:
3555 pass
3556 elif t.startswith(PRONUNCIATION_TITLE):
3557 # Chinese Pronunciation section kludge; we demote these to
3558 # be level 4 instead of 3 so that they're part of a larger
3559 # etymology hierarchy; usually the data here is empty and
3560 # acts as an inbetween between POS and Etymology data
3561 inside_level_four = True
3562 if t.startswith(PRONUNCIATION_TITLE + " "):
3563 # Pronunciation 1, etc, are used in Chinese Glyphs,
3564 # and each of them may have senses under Definition
3565 push_level_four_section(True)
3566 wxr.wtp.start_subsection(None)
3567 if wxr.config.capture_pronunciation: 3567 ↛ 3649line 3567 didn't jump to line 3649 because the condition on line 3567 was always true
3568 data = select_data()
3569 parse_pronunciation(
3570 wxr,
3571 node,
3572 data,
3573 etym_data,
3574 have_etym,
3575 base_data,
3576 lang_code,
3577 )
3578 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3579 push_etym()
3580 wxr.wtp.start_subsection(None)
3581 if wxr.config.capture_etymologies: 3581 ↛ 3649line 3581 didn't jump to line 3649 because the condition on line 3581 was always true
3582 m = re.search(r"\s(\d+)$", t)
3583 if m:
3584 etym_data["etymology_number"] = int(m.group(1))
3585 parse_etymology(etym_data, node)
3586 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3587 data = select_data()
3588 extract_descendant_section(wxr, data, node, False)
3589 elif (
3590 t in PROTO_ROOT_DERIVED_TITLES
3591 and pos == "root"
3592 and is_reconstruction
3593 and wxr.config.capture_descendants
3594 ):
3595 data = select_data()
3596 extract_descendant_section(wxr, data, node, True)
3597 elif t == TRANSLATIONS_TITLE:
3598 data = select_data()
3599 parse_translations(data, node)
3600 elif t in INFLECTION_TITLES:
3601 parse_inflection(node, t, pos)
3602 elif t == "alternative forms":
3603 extract_alt_form_section(wxr, select_data(), node)
3604 else:
3605 lst = t.split()
3606 while len(lst) > 1 and lst[-1].isdigit(): 3606 ↛ 3607line 3606 didn't jump to line 3607 because the condition on line 3606 was never true
3607 lst = lst[:-1]
3608 t_no_number = " ".join(lst).lower()
3609 if t_no_number in POS_TITLES:
3610 push_pos()
3611 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3612 pos = dt["pos"] or "MISSING_POS"
3613 wxr.wtp.start_subsection(t)
3614 if "debug" in dt:
3615 wxr.wtp.debug(
3616 "{} in section {}".format(dt["debug"], t),
3617 sortid="page/2755",
3618 )
3619 if "warning" in dt: 3619 ↛ 3620line 3619 didn't jump to line 3620 because the condition on line 3619 was never true
3620 wxr.wtp.warning(
3621 "{} in section {}".format(dt["warning"], t),
3622 sortid="page/2759",
3623 )
3624 if "error" in dt: 3624 ↛ 3625line 3624 didn't jump to line 3625 because the condition on line 3624 was never true
3625 wxr.wtp.error(
3626 "{} in section {}".format(dt["error"], t),
3627 sortid="page/2763",
3628 )
3629 # Parse word senses for the part-of-speech
3630 parse_part_of_speech(node, pos)
3631 if "tags" in dt:
3632 for pdata in sense_datas:
3633 data_extend(pdata, "tags", dt["tags"])
3634 elif t_no_number in LINKAGE_TITLES:
3635 # print(f"LINKAGE_TITLES NODE {node=}")
3636 rel = LINKAGE_TITLES[t_no_number]
3637 data = select_data()
3638 parse_linkage(data, rel, node)
3639 elif t_no_number == COMPOUNDS_TITLE:
3640 data = select_data()
3641 if wxr.config.capture_compounds: 3641 ↛ 3649line 3641 didn't jump to line 3649 because the condition on line 3641 was always true
3642 parse_linkage(data, "derived", node)
3644 # XXX parse interesting templates also from other sections. E.g.,
3645 # {{Letter|...}} in ===See also===
3646 # Also <gallery>
3648 # Recurse to children of this node, processing subtitles therein
3649 stack.append(t)
3650 process_children(node, pos)
3651 stack.pop()
3653 if len(redirect_list) > 0:
3654 if len(pos_data) > 0:
3655 pos_data["redirects"] = redirect_list
3656 if "pos" not in pos_data: 3656 ↛ 3657line 3656 didn't jump to line 3657 because the condition on line 3656 was never true
3657 pos_data["pos"] = "soft-redirect"
3658 else:
3659 new_page_data = copy.deepcopy(base_data)
3660 new_page_data["redirects"] = redirect_list
3661 if "pos" not in new_page_data: 3661 ↛ 3663line 3661 didn't jump to line 3663 because the condition on line 3661 was always true
3662 new_page_data["pos"] = "soft-redirect"
3663 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3664 page_datas.append(new_page_data)
3666 def extract_examples(
3667 others: list[WikiNode], sense_base: SenseData
3668 ) -> list[ExampleData]:
3669 """Parses through a list of definitions and quotes to find examples.
3670 Returns a list of example dicts to be added to sense data. Adds
3671 meta-data, mostly categories, into sense_base."""
3672 assert isinstance(others, list)
3673 examples: list[ExampleData] = []
3675 for sub in others:
3676 if not sub.sarg.endswith((":", "*")): 3676 ↛ 3677line 3676 didn't jump to line 3677 because the condition on line 3676 was never true
3677 continue
3678 for item in sub.children:
3679 if not isinstance(item, WikiNode): 3679 ↛ 3680line 3679 didn't jump to line 3680 because the condition on line 3679 was never true
3680 continue
3681 if item.kind != NodeKind.LIST_ITEM: 3681 ↛ 3682line 3681 didn't jump to line 3682 because the condition on line 3681 was never true
3682 continue
3683 usex_type = None
3684 example_template_args = []
3685 example_template_names = []
3686 taxons = set()
3688 # Bypass this function when parsing Chinese, Japanese and
3689 # quotation templates.
3690 new_example_lists = extract_example_list_item(
3691 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3692 )
3693 if len(new_example_lists) > 0:
3694 examples.extend(new_example_lists)
3695 continue
3697 def usex_template_fn(
3698 name: str, ht: TemplateArgs
3699 ) -> Optional[str]:
3700 nonlocal usex_type
3701 if is_panel_template(wxr, name):
3702 return ""
3703 if name in usex_templates:
3704 usex_type = "example"
3705 example_template_args.append(ht)
3706 example_template_names.append(name)
3707 elif name in quotation_templates:
3708 usex_type = "quotation"
3709 elif name in taxonomy_templates: 3709 ↛ 3710line 3709 didn't jump to line 3710 because the condition on line 3709 was never true
3710 taxons.update(ht.get(1, "").split())
3711 for prefix in template_linkages_to_ignore_in_examples:
3712 if re.search(
3713 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3714 ):
3715 return ""
3716 return None
3718 # bookmark
3719 ruby: list[tuple[str, str]] = []
3720 contents = item.children
3721 if lang_code == "ja":
3722 # Capture ruby contents if this is a Japanese language
3723 # example.
3724 # print(contents)
3725 if ( 3725 ↛ 3730line 3725 didn't jump to line 3730 because the condition on line 3725 was never true
3726 contents
3727 and isinstance(contents, str)
3728 and re.match(r"\s*$", contents[0])
3729 ):
3730 contents = contents[1:]
3731 exp = wxr.wtp.parse(
3732 wxr.wtp.node_to_wikitext(contents),
3733 # post_template_fn=head_post_template_fn,
3734 expand_all=True,
3735 )
3736 rub, rest = extract_ruby(wxr, exp.children)
3737 if rub:
3738 for rtup in rub:
3739 ruby.append(rtup)
3740 contents = rest
3741 subtext = clean_node(
3742 wxr, sense_base, contents, template_fn=usex_template_fn
3743 )
3745 frozen_taxons = frozenset(taxons)
3746 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3748 # print(f"{subtext=}")
3749 subtext = re.sub(
3750 r"\s*\(please add an English "
3751 r"translation of this "
3752 r"(example|usage example|quote)\)",
3753 "",
3754 subtext,
3755 ).strip()
3756 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3757 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3758 # print("subtext:", repr(subtext))
3760 lines = subtext.splitlines()
3761 # print(lines)
3763 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3764 lines = list(
3765 x
3766 for x in lines
3767 if not re.match(
3768 r"(Synonyms: |Antonyms: |Hyponyms: |"
3769 r"Synonym: |Antonym: |Hyponym: |"
3770 r"Hypernyms: |Derived terms: |"
3771 r"Related terms: |"
3772 r"Hypernym: |Derived term: |"
3773 r"Coordinate terms:|"
3774 r"Related term: |"
3775 r"For more quotations using )",
3776 x,
3777 )
3778 )
3779 tr = ""
3780 ref = ""
3781 roman = ""
3782 # for line in lines:
3783 # print("LINE:", repr(line))
3784 # print(classify_desc(line))
3785 if len(lines) == 1 and lang_code != "en":
3786 parts = example_splitter_re.split(lines[0])
3787 if ( 3787 ↛ 3795line 3787 didn't jump to line 3795 because the condition on line 3787 was never true
3788 len(parts) > 2
3789 and len(example_template_args) == 1
3790 and any(
3791 ("―" in s) or ("—" in s)
3792 for s in example_template_args[0].values()
3793 )
3794 ):
3795 if nparts := synch_splits_with_args(
3796 lines[0], example_template_args[0]
3797 ):
3798 parts = nparts
3799 if ( 3799 ↛ 3804line 3799 didn't jump to line 3804 because the condition on line 3799 was never true
3800 len(example_template_args) == 1
3801 and "lit" in example_template_args[0]
3802 ):
3803 # ugly brute-force kludge in case there's a lit= arg
3804 literally = example_template_args[0].get("lit", "")
3805 if literally:
3806 literally = (
3807 " (literally, “"
3808 + clean_value(wxr, literally)
3809 + "”)"
3810 )
3811 else:
3812 literally = ""
3813 if ( 3813 ↛ 3852line 3813 didn't jump to line 3852 because the condition on line 3813 was never true
3814 len(example_template_args) == 1
3815 and len(parts) == 2
3816 and len(example_template_args[0])
3817 - (
3818 # horrible kludge to ignore these arguments
3819 # when calculating how many there are
3820 sum(
3821 s in example_template_args[0]
3822 for s in (
3823 "lit", # generates text, but we handle it
3824 "inline",
3825 "noenum",
3826 "nocat",
3827 "sort",
3828 )
3829 )
3830 )
3831 == 3
3832 and clean_value(
3833 wxr, example_template_args[0].get(2, "")
3834 )
3835 == parts[0].strip()
3836 and clean_value(
3837 wxr,
3838 (
3839 example_template_args[0].get(3)
3840 or example_template_args[0].get("translation")
3841 or example_template_args[0].get("t", "")
3842 )
3843 + literally, # in case there's a lit= argument
3844 )
3845 == parts[1].strip()
3846 ):
3847 # {{exampletemplate|ex|Foo bar baz|English translation}}
3848 # is a pretty reliable 'heuristic', so we use it here
3849 # before the others. To be extra sure the template
3850 # doesn't do anything weird, we compare the arguments
3851 # and the output to each other.
3852 lines = [parts[0].strip()]
3853 tr = parts[1].strip()
3854 elif (
3855 len(parts) == 2
3856 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3857 ):
3858 # These other branches just do some simple heuristics w/
3859 # the expanded output of the template (if applicable).
3860 lines = [parts[0].strip()]
3861 tr = parts[1].strip()
3862 elif ( 3862 ↛ 3868line 3862 didn't jump to line 3868 because the condition on line 3862 was never true
3863 len(parts) == 3
3864 and classify_desc2(parts[1])
3865 in ("romanization", "english")
3866 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3867 ):
3868 lines = [parts[0].strip()]
3869 roman = parts[1].strip()
3870 tr = parts[2].strip()
3871 else:
3872 parts = re.split(r"\s+-\s+", lines[0])
3873 if ( 3873 ↛ 3877line 3873 didn't jump to line 3877 because the condition on line 3873 was never true
3874 len(parts) == 2
3875 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3876 ):
3877 lines = [parts[0].strip()]
3878 tr = parts[1].strip()
3879 elif len(lines) > 1:
3880 if any(
3881 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3882 ) and not (len(example_template_names) == 1):
3883 refs: list[str] = []
3884 for i in range(len(lines)): 3884 ↛ 3890line 3884 didn't jump to line 3890 because the loop on line 3884 didn't complete
3885 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3885 ↛ 3886line 3885 didn't jump to line 3886 because the condition on line 3885 was never true
3886 break
3887 refs.append(lines[i].strip())
3888 if re.search(r"[]\d:)]\s*$", lines[i]):
3889 break
3890 ref = " ".join(refs)
3891 lines = lines[i + 1 :]
3892 if (
3893 lang_code != "en"
3894 and len(lines) >= 2
3895 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3896 ):
3897 i = len(lines) - 1
3898 while ( 3898 ↛ 3903line 3898 didn't jump to line 3903 because the condition on line 3898 was never true
3899 i > 1
3900 and classify_desc2(lines[i - 1])
3901 in ENGLISH_TEXTS
3902 ):
3903 i -= 1
3904 tr = "\n".join(lines[i:])
3905 lines = lines[:i]
3906 if len(lines) >= 2:
3907 if classify_desc2(lines[-1]) == "romanization":
3908 roman = lines[-1].strip()
3909 lines = lines[:-1]
3911 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3912 ref = lines[0]
3913 lines = lines[1:]
3914 elif lang_code != "en" and len(lines) == 2:
3915 cls1 = classify_desc2(lines[0])
3916 cls2 = classify_desc2(lines[1])
3917 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3918 tr = lines[1]
3919 lines = [lines[0]]
3920 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3920 ↛ 3921line 3920 didn't jump to line 3921 because the condition on line 3920 was never true
3921 tr = lines[0]
3922 lines = [lines[1]]
3923 elif ( 3923 ↛ 3930line 3923 didn't jump to line 3930 because the condition on line 3923 was never true
3924 re.match(r"^[#*]*:+", lines[1])
3925 and classify_desc2(
3926 re.sub(r"^[#*:]+\s*", "", lines[1])
3927 )
3928 in ENGLISH_TEXTS
3929 ):
3930 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3931 lines = [lines[0]]
3932 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3933 # Both were classified as English, but
3934 # presumably one is not. Assume first is
3935 # non-English, as that seems more common.
3936 tr = lines[1]
3937 lines = [lines[0]]
3938 elif (
3939 usex_type != "quotation"
3940 and lang_code != "en"
3941 and len(lines) == 3
3942 ):
3943 cls1 = classify_desc2(lines[0])
3944 cls2 = classify_desc2(lines[1])
3945 cls3 = classify_desc2(lines[2])
3946 if (
3947 cls3 == "english"
3948 and cls2 in ("english", "romanization")
3949 and cls1 != "english"
3950 ):
3951 tr = lines[2].strip()
3952 roman = lines[1].strip()
3953 lines = [lines[0].strip()]
3954 elif ( 3954 ↛ 3962line 3954 didn't jump to line 3962 because the condition on line 3954 was never true
3955 usex_type == "quotation"
3956 and lang_code != "en"
3957 and len(lines) > 2
3958 ):
3959 # for x in lines:
3960 # print(" LINE: {}: {}"
3961 # .format(classify_desc2(x), x))
3962 if re.match(r"^[#*]*:+\s*$", lines[1]):
3963 ref = lines[0]
3964 lines = lines[2:]
3965 cls1 = classify_desc2(lines[-1])
3966 if cls1 == "english":
3967 i = len(lines) - 1
3968 while (
3969 i > 1
3970 and classify_desc2(lines[i - 1])
3971 == ENGLISH_TEXTS
3972 ):
3973 i -= 1
3974 tr = "\n".join(lines[i:])
3975 lines = lines[:i]
3977 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3978 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3979 tr = re.sub(r"^[#*:]+\s*", "", tr)
3980 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3981 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3982 ref = re.sub(r"^[#*:]+\s*", "", ref)
3983 ref = re.sub(
3984 r", (volume |number |page )?“?"
3985 r"\(please specify ([^)]|\(s\))*\)”?|"
3986 ", text here$",
3987 "",
3988 ref,
3989 )
3990 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3991 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3992 subtext = "\n".join(x for x in lines if x)
3993 if not tr and lang_code != "en":
3994 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3995 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3995 ↛ 3996line 3995 didn't jump to line 3996 because the condition on line 3995 was never true
3996 tr = m.group(2)
3997 subtext = subtext[: m.start()] + m.group(1)
3998 elif lines:
3999 parts = re.split(r"\s*[―—]+\s*", lines[0])
4000 if ( 4000 ↛ 4004line 4000 didn't jump to line 4004 because the condition on line 4000 was never true
4001 len(parts) == 2
4002 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4003 ):
4004 subtext = parts[0].strip()
4005 tr = parts[1].strip()
4006 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
4007 subtext = re.sub(
4008 r"(please add an English translation of "
4009 r"this (quote|usage example))",
4010 "",
4011 subtext,
4012 )
4013 subtext = re.sub(
4014 r"\s*→New International Version " "translation$",
4015 "",
4016 subtext,
4017 ) # e.g. pis/Tok Pisin (Bible)
4018 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
4019 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
4020 note = None
4021 m = re.match(r"^\(([^)]*)\):\s+", subtext)
4022 if ( 4022 ↛ 4030line 4022 didn't jump to line 4030 because the condition on line 4022 was never true
4023 m is not None
4024 and lang_code != "en"
4025 and (
4026 m.group(1).startswith("with ")
4027 or classify_desc2(m.group(1)) == "english"
4028 )
4029 ):
4030 note = m.group(1)
4031 subtext = subtext[m.end() :]
4032 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
4033 ref = re.sub(r",\s*→ISBN", "", ref)
4034 ref = ref.strip()
4035 if ref.endswith(":") or ref.endswith(","):
4036 ref = ref[:-1].strip()
4037 ref = re.sub(r"\s+,\s+", ", ", ref)
4038 ref = re.sub(r"\s+", " ", ref)
4039 if ref and not subtext: 4039 ↛ 4040line 4039 didn't jump to line 4040 because the condition on line 4039 was never true
4040 subtext = ref
4041 ref = ""
4042 if subtext:
4043 dt: ExampleData = {"text": subtext}
4044 if ref:
4045 dt["ref"] = ref
4046 if tr:
4047 dt["english"] = tr # DEPRECATED for "translation"
4048 dt["translation"] = tr
4049 if usex_type:
4050 dt["type"] = usex_type
4051 if note: 4051 ↛ 4052line 4051 didn't jump to line 4052 because the condition on line 4051 was never true
4052 dt["note"] = note
4053 if roman:
4054 dt["roman"] = roman
4055 if ruby:
4056 dt["ruby"] = ruby
4057 examples.append(dt)
4059 return examples
4061 # Main code of parse_language()
4062 # Process the section
4063 stack.append(language)
4064 process_children(langnode, None)
4065 stack.pop()
4067 # Finalize word entires
4068 push_etym()
4069 ret = []
4070 for data in page_datas:
4071 merge_base(data, base_data)
4072 ret.append(data)
4074 # Copy all tags to word senses
4075 for data in ret:
4076 if "senses" not in data: 4076 ↛ 4077line 4076 didn't jump to line 4077 because the condition on line 4076 was never true
4077 continue
4078 # WordData should not have a 'tags' field, but if it does, it's
4079 # deleted and its contents removed and placed in each sense;
4080 # that's why the type ignores.
4081 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4082 if "tags" in data:
4083 del data["tags"] # type: ignore[typeddict-item]
4084 for sense in data["senses"]:
4085 data_extend(sense, "tags", tags)
4087 return ret
4090def parse_wikipedia_template(
4091 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
4092) -> None:
4093 """Helper function for parsing {{wikipedia|...}} and related templates."""
4094 assert isinstance(wxr, WiktextractContext)
4095 assert isinstance(data, dict)
4096 assert isinstance(ht, dict)
4097 langid = clean_node(wxr, data, ht.get("lang", ()))
4098 pagename = (
4099 clean_node(wxr, data, ht.get(1, ()))
4100 or wxr.wtp.title
4101 or "MISSING_PAGE_TITLE"
4102 )
4103 if langid:
4104 data_append(data, "wikipedia", langid + ":" + pagename)
4105 else:
4106 data_append(data, "wikipedia", pagename)
4109def parse_top_template(
4110 wxr: WiktextractContext, node: WikiNode, data: WordData
4111) -> None:
4112 """Parses a template that occurs on the top-level in a page, before any
4113 language subtitles."""
4114 assert isinstance(wxr, WiktextractContext)
4115 assert isinstance(node, WikiNode)
4116 assert isinstance(data, dict)
4118 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
4119 if name in wikipedia_templates:
4120 parse_wikipedia_template(wxr, data, ht)
4121 return None
4122 if is_panel_template(wxr, name):
4123 return ""
4124 if name in ("reconstruction",): 4124 ↛ 4125line 4124 didn't jump to line 4125 because the condition on line 4124 was never true
4125 return ""
4126 if name.lower() == "also" or name.lower().startswith("also/"):
4127 # XXX shows related words that might really have been the intended
4128 # word, capture them
4129 return ""
4130 if name == "see also": 4130 ↛ 4132line 4130 didn't jump to line 4132 because the condition on line 4130 was never true
4131 # XXX capture
4132 return ""
4133 if name == "cardinalbox": 4133 ↛ 4135line 4133 didn't jump to line 4135 because the condition on line 4133 was never true
4134 # XXX capture
4135 return ""
4136 if name == "character info": 4136 ↛ 4138line 4136 didn't jump to line 4138 because the condition on line 4136 was never true
4137 # XXX capture
4138 return ""
4139 if name == "commonscat": 4139 ↛ 4141line 4139 didn't jump to line 4141 because the condition on line 4139 was never true
4140 # XXX capture link to Wikimedia commons
4141 return ""
4142 if name == "wrongtitle": 4142 ↛ 4145line 4142 didn't jump to line 4145 because the condition on line 4142 was never true
4143 # XXX this should be captured to replace page title with the
4144 # correct title. E.g. ⿰亻革家
4145 return ""
4146 if name == "wikidata": 4146 ↛ 4147line 4146 didn't jump to line 4147 because the condition on line 4146 was never true
4147 arg = clean_node(wxr, data, ht.get(1, ()))
4148 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
4149 data_append(data, "wikidata", arg)
4150 return ""
4151 wxr.wtp.debug(
4152 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
4153 sortid="page/2870",
4154 )
4155 return ""
4157 clean_node(wxr, None, [node], template_fn=top_template_fn)
4160def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
4161 """Fix subtitle hierarchy to be strict Language -> Etymology ->
4162 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
4163 that are next to each other."""
4165 # Wiktextract issue #620, Chinese Glyph Origin before an etymology
4166 # section get overwritten. In this case, let's just combine the two.
4168 # In Chinese entries, Pronunciation can be preceded on the
4169 # same level 3 by its Etymology *and* Glyph Origin sections:
4170 # ===Glyph Origin===
4171 # ===Etymology===
4172 # ===Pronunciation===
4173 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
4174 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
4175 # are now level 6
4177 # Known lowercase PoS names are in part_of_speech_map
4178 # Known lowercase linkage section names are in linkage_map
4180 old = re.split(
4181 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
4182 )
4184 parts = []
4185 npar = 4 # Number of parentheses in above expression
4186 parts.append(old[0])
4187 prev_level = None
4188 level = None
4189 skip_level_title = False # When combining etymology sections
4190 for i in range(1, len(old), npar + 1):
4191 left = old[i]
4192 right = old[i + npar - 1]
4193 # remove Wikilinks in title
4194 title = re.sub(r"^\[\[", "", old[i + 1])
4195 title = re.sub(r"\]\]$", "", title)
4196 prev_level = level
4197 level = len(left)
4198 part = old[i + npar]
4199 if level != len(right): 4199 ↛ 4200line 4199 didn't jump to line 4200 because the condition on line 4199 was never true
4200 wxr.wtp.debug(
4201 "subtitle has unbalanced levels: "
4202 "{!r} has {} on the left and {} on the right".format(
4203 title, left, right
4204 ),
4205 sortid="page/2904",
4206 )
4207 lc = title.lower()
4208 if name_to_code(title, "en") != "":
4209 if level > 2: 4209 ↛ 4210line 4209 didn't jump to line 4210 because the condition on line 4209 was never true
4210 wxr.wtp.debug(
4211 "subtitle has language name {} at level {}".format(
4212 title, level
4213 ),
4214 sortid="page/2911",
4215 )
4216 level = 2
4217 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
4218 if level > 3: 4218 ↛ 4219line 4218 didn't jump to line 4219 because the condition on line 4218 was never true
4219 wxr.wtp.debug(
4220 "etymology section {} at level {}".format(title, level),
4221 sortid="page/2917",
4222 )
4223 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
4224 # sections cheek-to-cheek
4225 skip_level_title = True
4226 # Modify the title of previous ("Glyph Origin") section, in
4227 # case we have a meaningful title like "Etymology 1"
4228 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
4229 level = 3
4230 elif lc.startswith(PRONUNCIATION_TITLE):
4231 # Pronunciation is now a level between POS and Etymology, so
4232 # we need to shift everything down by one
4233 level = 4
4234 elif lc in POS_TITLES:
4235 level = 5
4236 elif lc == TRANSLATIONS_TITLE:
4237 level = 6
4238 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
4239 level = 6
4240 elif lc in INFLECTION_TITLES:
4241 level = 6
4242 elif lc == DESCENDANTS_TITLE:
4243 level = 6
4244 elif title in PROTO_ROOT_DERIVED_TITLES: 4244 ↛ 4245line 4244 didn't jump to line 4245 because the condition on line 4244 was never true
4245 level = 6
4246 elif lc in IGNORED_TITLES:
4247 level = 6
4248 else:
4249 level = 6
4250 if skip_level_title:
4251 skip_level_title = False
4252 parts.append(part)
4253 else:
4254 parts.append("{}{}{}".format("=" * level, title, "=" * level))
4255 parts.append(part)
4256 # print("=" * level, title)
4257 # if level != len(left):
4258 # print(" FIXED LEVEL OF {} {} -> {}"
4259 # .format(title, len(left), level))
4261 text = "".join(parts)
4262 # print(text)
4263 return text
4266def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4267 # Skip translation pages
4268 if word.endswith("/" + TRANSLATIONS_TITLE): 4268 ↛ 4269line 4268 didn't jump to line 4269 because the condition on line 4268 was never true
4269 return []
4271 if wxr.config.verbose: 4271 ↛ 4272line 4271 didn't jump to line 4272 because the condition on line 4271 was never true
4272 logger.info(f"Parsing page: {word}")
4274 wxr.config.word = word
4275 wxr.wtp.start_page(word)
4277 # Remove <noinclude> and similar tags from main pages. They
4278 # should not appear there, but at least net/Elfdala has one and it
4279 # is probably not the only one.
4280 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4281 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4282 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4284 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4285 # pages that have, for example, Translations section under Linkage, or
4286 # Translations section on the same level as Noun. Enforce a proper
4287 # hierarchy by manipulating the subtitle levels in certain cases.
4288 text = fix_subtitle_hierarchy(wxr, text)
4290 # Parse the page, pre-expanding those templates that are likely to
4291 # influence parsing
4292 tree = wxr.wtp.parse(
4293 text,
4294 pre_expand=True,
4295 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4296 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4297 )
4298 # from wikitextprocessor.parser import print_tree
4299 # print("PAGE PARSE:", print_tree(tree))
4301 top_data: WordData = {}
4303 # Iterate over top-level titles, which should be languages for normal
4304 # pages
4305 by_lang = defaultdict(list)
4306 for langnode in tree.children:
4307 if not isinstance(langnode, WikiNode):
4308 continue
4309 if langnode.kind == NodeKind.TEMPLATE:
4310 parse_top_template(wxr, langnode, top_data)
4311 continue
4312 if langnode.kind == NodeKind.LINK:
4313 # Some pages have links at top level, e.g., "trees" in Wiktionary
4314 continue
4315 if langnode.kind != NodeKind.LEVEL2: 4315 ↛ 4316line 4315 didn't jump to line 4316 because the condition on line 4315 was never true
4316 wxr.wtp.debug(
4317 f"unexpected top-level node: {langnode}", sortid="page/3014"
4318 )
4319 continue
4320 lang = clean_node(
4321 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4322 )
4323 lang_code = name_to_code(lang, "en")
4324 if lang_code == "": 4324 ↛ 4325line 4324 didn't jump to line 4325 because the condition on line 4324 was never true
4325 wxr.wtp.debug(
4326 f"unrecognized language name: {lang}", sortid="page/3019"
4327 )
4328 if (
4329 wxr.config.capture_language_codes
4330 and lang_code not in wxr.config.capture_language_codes
4331 ):
4332 continue
4333 wxr.wtp.start_section(lang)
4335 # Collect all words from the page.
4336 # print(f"{langnode=}")
4337 datas = parse_language(wxr, langnode, lang, lang_code)
4339 # Propagate fields resulting from top-level templates to this
4340 # part-of-speech.
4341 for data in datas:
4342 if "lang" not in data: 4342 ↛ 4343line 4342 didn't jump to line 4343 because the condition on line 4342 was never true
4343 wxr.wtp.debug(
4344 "internal error -- no lang in data: {}".format(data),
4345 sortid="page/3034",
4346 )
4347 continue
4348 for k, v in top_data.items():
4349 assert isinstance(v, (list, tuple))
4350 data_extend(data, k, v)
4351 by_lang[data["lang"]].append(data)
4353 # XXX this code is clearly out of date. There is no longer a "conjugation"
4354 # field. FIX OR REMOVE.
4355 # Do some post-processing on the words. For example, we may distribute
4356 # conjugation information to all the words.
4357 ret = []
4358 for lang, lang_datas in by_lang.items():
4359 ret.extend(lang_datas)
4361 for x in ret:
4362 if x["word"] != word:
4363 if word.startswith("Unsupported titles/"):
4364 wxr.wtp.debug(
4365 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4366 sortid="20231101/3578page.py",
4367 )
4368 else:
4369 wxr.wtp.debug(
4370 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4371 sortid="20231101/3582page.py",
4372 )
4373 x["original_title"] = word
4374 # validate tag data
4375 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4376 return ret
4379def recursively_separate_raw_tags(
4380 wxr: WiktextractContext, data: dict[str, Any]
4381) -> None:
4382 if not isinstance(data, dict): 4382 ↛ 4383line 4382 didn't jump to line 4383 because the condition on line 4382 was never true
4383 wxr.wtp.error(
4384 "'data' is not dict; most probably "
4385 "data has a list that contains at least one dict and "
4386 "at least one non-dict item",
4387 sortid="en/page-4016/20240419",
4388 )
4389 return
4390 new_tags: list[str] = []
4391 raw_tags: list[str] = data.get("raw_tags", [])
4392 for field, val in data.items():
4393 if field == "tags":
4394 for tag in val:
4395 if tag not in valid_tags:
4396 raw_tags.append(tag)
4397 else:
4398 new_tags.append(tag)
4399 if isinstance(val, list):
4400 if len(val) > 0 and isinstance(val[0], dict):
4401 for d in val:
4402 recursively_separate_raw_tags(wxr, d)
4403 if "tags" in data and not new_tags:
4404 del data["tags"]
4405 elif new_tags:
4406 data["tags"] = new_tags
4407 if raw_tags:
4408 data["raw_tags"] = raw_tags
4411def process_soft_redirect_template(
4412 wxr: WiktextractContext,
4413 template_node: TemplateNode,
4414 redirect_pages: list[str],
4415) -> bool:
4416 # return `True` if the template is soft redirect template
4417 if template_node.template_name == "zh-see":
4418 # https://en.wiktionary.org/wiki/Template:zh-see
4419 title = clean_node(
4420 wxr, None, template_node.template_parameters.get(1, "")
4421 )
4422 if title != "": 4422 ↛ 4424line 4422 didn't jump to line 4424 because the condition on line 4422 was always true
4423 redirect_pages.append(title)
4424 return True
4425 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4426 # https://en.wiktionary.org/wiki/Template:ja-see
4427 for key, value in template_node.template_parameters.items():
4428 if isinstance(key, int): 4428 ↛ 4427line 4428 didn't jump to line 4427 because the condition on line 4428 was always true
4429 title = clean_node(wxr, None, value)
4430 if title != "": 4430 ↛ 4427line 4430 didn't jump to line 4427 because the condition on line 4430 was always true
4431 redirect_pages.append(title)
4432 return True
4433 return False
4436ZH_FORMS_TAGS = {
4437 "trad.": "Traditional-Chinese",
4438 "simp.": "Simplified-Chinese",
4439 "alternative forms": "alternative",
4440}
4443def extract_zh_forms_template(
4444 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4445):
4446 # https://en.wiktionary.org/wiki/Template:zh-forms
4447 lit_meaning = clean_node(
4448 wxr, None, t_node.template_parameters.get("lit", "")
4449 )
4450 if lit_meaning != "": 4450 ↛ 4451line 4450 didn't jump to line 4451 because the condition on line 4450 was never true
4451 base_data["literal_meaning"] = lit_meaning
4452 expanded_node = wxr.wtp.parse(
4453 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4454 )
4455 for table in expanded_node.find_child(NodeKind.TABLE):
4456 for row in table.find_child(NodeKind.TABLE_ROW):
4457 row_header = ""
4458 row_header_tags = []
4459 header_has_span = False
4460 for cell in row.find_child(
4461 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4462 ):
4463 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4464 row_header, row_header_tags, header_has_span = (
4465 extract_zh_forms_header_cell(wxr, base_data, cell)
4466 )
4467 elif not header_has_span:
4468 extract_zh_forms_data_cell(
4469 wxr, base_data, cell, row_header, row_header_tags
4470 )
4472 if "forms" in base_data and len(base_data["forms"]) == 0: 4472 ↛ 4473line 4472 didn't jump to line 4473 because the condition on line 4472 was never true
4473 del base_data["forms"]
4476def extract_zh_forms_header_cell(
4477 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4478) -> tuple[str, list[str], bool]:
4479 row_header = ""
4480 row_header_tags = []
4481 header_has_span = False
4482 first_span_index = len(header_cell.children)
4483 for index, span_tag in header_cell.find_html("span", with_index=True):
4484 if index < first_span_index: 4484 ↛ 4486line 4484 didn't jump to line 4486 because the condition on line 4484 was always true
4485 first_span_index = index
4486 header_has_span = True
4487 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4488 for raw_tag in row_header.split(" and "):
4489 raw_tag = raw_tag.strip()
4490 if raw_tag != "": 4490 ↛ 4488line 4490 didn't jump to line 4488 because the condition on line 4490 was always true
4491 row_header_tags.append(raw_tag)
4492 for span_tag in header_cell.find_html_recursively("span"):
4493 span_lang = span_tag.attrs.get("lang", "")
4494 form_nodes = []
4495 sup_title = ""
4496 for node in span_tag.children:
4497 if isinstance(node, HTMLNode) and node.tag == "sup": 4497 ↛ 4498line 4497 didn't jump to line 4498 because the condition on line 4497 was never true
4498 for sup_span in node.find_html("span"):
4499 sup_title = sup_span.attrs.get("title", "")
4500 else:
4501 form_nodes.append(node)
4502 if span_lang in ["zh-Hant", "zh-Hans"]: 4502 ↛ 4503line 4502 didn't jump to line 4503 because the condition on line 4502 was never true
4503 for word in clean_node(wxr, None, form_nodes).split("/"):
4504 if word not in [wxr.wtp.title, ""]:
4505 form = {"form": word}
4506 for raw_tag in row_header_tags:
4507 if raw_tag in ZH_FORMS_TAGS:
4508 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4509 else:
4510 data_append(form, "raw_tags", raw_tag)
4511 if sup_title != "":
4512 data_append(form, "raw_tags", sup_title)
4513 data_append(base_data, "forms", form)
4514 return row_header, row_header_tags, header_has_span
4517def extract_zh_forms_data_cell(
4518 wxr: WiktextractContext,
4519 base_data: WordData,
4520 cell: WikiNode,
4521 row_header: str,
4522 row_header_tags: list[str],
4523):
4524 from .zh_pron_tags import ZH_PRON_TAGS
4526 for top_span_tag in cell.find_html("span"):
4527 forms = []
4528 for span_tag in top_span_tag.find_html("span"):
4529 span_lang = span_tag.attrs.get("lang", "")
4530 if span_lang in ["zh-Hant", "zh-Hans", "zh"]:
4531 word = clean_node(wxr, None, span_tag)
4532 if word not in ["", "/", wxr.wtp.title]:
4533 form = {"form": word}
4534 if row_header != "anagram": 4534 ↛ 4542line 4534 didn't jump to line 4542 because the condition on line 4534 was always true
4535 for raw_tag in row_header_tags:
4536 if raw_tag in ZH_FORMS_TAGS: 4536 ↛ 4541line 4536 didn't jump to line 4541 because the condition on line 4536 was always true
4537 data_append(
4538 form, "tags", ZH_FORMS_TAGS[raw_tag]
4539 )
4540 else:
4541 data_append(form, "raw_tags", raw_tag)
4542 if span_lang == "zh-Hant":
4543 data_append(form, "tags", "Traditional-Chinese")
4544 elif span_lang == "zh-Hans":
4545 data_append(form, "tags", "Simplified-Chinese")
4546 forms.append(form)
4547 elif "font-size:80%" in span_tag.attrs.get("style", ""): 4547 ↛ 4528line 4547 didn't jump to line 4528 because the condition on line 4547 was always true
4548 raw_tag = clean_node(wxr, None, span_tag)
4549 if raw_tag != "": 4549 ↛ 4528line 4549 didn't jump to line 4528 because the condition on line 4549 was always true
4550 for form in forms:
4551 if raw_tag in ZH_PRON_TAGS: 4551 ↛ 4557line 4551 didn't jump to line 4557 because the condition on line 4551 was always true
4552 tr_tag = ZH_PRON_TAGS[raw_tag]
4553 if isinstance(tr_tag, list): 4553 ↛ 4554line 4553 didn't jump to line 4554 because the condition on line 4553 was never true
4554 data_extend(form, "tags", tr_tag)
4555 elif isinstance(tr_tag, str): 4555 ↛ 4550line 4555 didn't jump to line 4550 because the condition on line 4555 was always true
4556 data_append(form, "tags", tr_tag)
4557 elif raw_tag in valid_tags:
4558 data_append(form, "tags", raw_tag)
4559 else:
4560 data_append(form, "raw_tags", raw_tag)
4562 if row_header == "anagram": 4562 ↛ 4563line 4562 didn't jump to line 4563 because the condition on line 4562 was never true
4563 for form in forms:
4564 l_data = {"word": form["form"]}
4565 for key in ["tags", "raw_tags"]:
4566 if key in form:
4567 l_data[key] = form[key]
4568 data_append(base_data, "anagrams", l_data)
4569 else:
4570 data_extend(base_data, "forms", forms)