1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Optional,
15 Set,
16 Union,
17 cast,
18)
20from mediawiki_langcodes import get_all_names, name_to_code
21from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
22from wikitextprocessor.parser import (
23 LEVEL_KIND_FLAGS,
24 GeneralNode,
25 HTMLNode,
26 LevelNode,
27 NodeKind,
28 TemplateNode,
29 WikiNode,
30 is_list,
31 is_list_item,
32)
34from ...clean import clean_template_args, clean_value
35from ...datautils import (
36 data_append,
37 data_extend,
38 ns_title_prefix_tuple,
39)
40from ...page import (
41 LEVEL_KINDS,
42 clean_node,
43 is_panel_template,
44 recursively_extract,
45)
46from ...tags import valid_tags
47from ...wxr_context import WiktextractContext
48from ...wxr_logging import logger
49from ..ruby import extract_ruby, parse_ruby
50from ..share import strip_nodes
51from .descendant import extract_descendant_section
52from .example import extract_example_list_item, extract_template_zh_x
53from .form_descriptions import (
54 classify_desc,
55 decode_tags,
56 distw,
57 parse_alt_or_inflection_of,
58 parse_sense_qualifier,
59 parse_word_head,
60)
61from .inflection import TableContext, parse_inflection_section
62from .info_templates import (
63 INFO_TEMPLATE_FUNCS,
64 parse_info_template_arguments,
65 parse_info_template_node,
66)
67from .linkages import (
68 extract_alt_form_section,
69 extract_zh_dial_template,
70 parse_linkage_item_text,
71)
72from .parts_of_speech import PARTS_OF_SPEECH
73from .section_titles import (
74 COMPOUNDS_TITLE,
75 DESCENDANTS_TITLE,
76 ETYMOLOGY_TITLES,
77 IGNORED_TITLES,
78 INFLECTION_TITLES,
79 LINKAGE_TITLES,
80 POS_TITLES,
81 PRONUNCIATION_TITLE,
82 PROTO_ROOT_DERIVED_TITLES,
83 TRANSLATIONS_TITLE,
84)
85from .translations import parse_translation_item_text
86from .type_utils import (
87 AttestationData,
88 ExampleData,
89 LinkageData,
90 ReferenceData,
91 SenseData,
92 SoundData,
93 TemplateData,
94 WordData,
95)
96from .unsupported_titles import unsupported_title_map
98# When determining whether a string is 'english', classify_desc
99# might return 'taxonomic' which is English text 99% of the time.
100ENGLISH_TEXTS = ("english", "taxonomic")
102# Matches names of head templates
103HEAD_TAG_RE = re.compile(
104 r"^(head|Han char|arabic-noun|arabic-noun-form|"
105 r"hangul-symbol|syllable-hangul)$|"
106 + r"^(latin|"
107 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
108 + r")-("
109 + "|".join(
110 [
111 "abbr",
112 "adj",
113 "adjective",
114 "adjective form",
115 "adjective-form",
116 "adv",
117 "adverb",
118 "affix",
119 "animal command",
120 "art",
121 "article",
122 "aux",
123 "bound pronoun",
124 "bound-pronoun",
125 "Buyla",
126 "card num",
127 "card-num",
128 "cardinal",
129 "chunom",
130 "classifier",
131 "clitic",
132 "cls",
133 "cmene",
134 "cmavo",
135 "colloq-verb",
136 "colverbform",
137 "combining form",
138 "combining-form",
139 "comparative",
140 "con",
141 "concord",
142 "conj",
143 "conjunction",
144 "conjug",
145 "cont",
146 "contr",
147 "converb",
148 "daybox",
149 "decl",
150 "decl noun",
151 "def",
152 "dem",
153 "det",
154 "determ",
155 "Deva",
156 "ending",
157 "entry",
158 "form",
159 "fuhivla",
160 "gerund",
161 "gismu",
162 "hanja",
163 "hantu",
164 "hanzi",
165 "head",
166 "ideophone",
167 "idiom",
168 "inf",
169 "indef",
170 "infixed pronoun",
171 "infixed-pronoun",
172 "infl",
173 "inflection",
174 "initialism",
175 "int",
176 "interfix",
177 "interj",
178 "interjection",
179 "jyut",
180 "latin",
181 "letter",
182 "locative",
183 "lujvo",
184 "monthbox",
185 "mutverb",
186 "name",
187 "nisba",
188 "nom",
189 "noun",
190 "noun form",
191 "noun-form",
192 "noun plural",
193 "noun-plural",
194 "nounprefix",
195 "num",
196 "number",
197 "numeral",
198 "ord",
199 "ordinal",
200 "par",
201 "part",
202 "part form",
203 "part-form",
204 "participle",
205 "particle",
206 "past",
207 "past neg",
208 "past-neg",
209 "past participle",
210 "past-participle",
211 "perfect participle",
212 "perfect-participle",
213 "personal pronoun",
214 "personal-pronoun",
215 "pref",
216 "prefix",
217 "phrase",
218 "pinyin",
219 "plural noun",
220 "plural-noun",
221 "pos",
222 "poss-noun",
223 "post",
224 "postp",
225 "postposition",
226 "PP",
227 "pp",
228 "ppron",
229 "pred",
230 "predicative",
231 "prep",
232 "prep phrase",
233 "prep-phrase",
234 "preposition",
235 "present participle",
236 "present-participle",
237 "pron",
238 "prondem",
239 "pronindef",
240 "pronoun",
241 "prop",
242 "proper noun",
243 "proper-noun",
244 "proper noun form",
245 "proper-noun form",
246 "proper noun-form",
247 "proper-noun-form",
248 "prov",
249 "proverb",
250 "prpn",
251 "prpr",
252 "punctuation mark",
253 "punctuation-mark",
254 "regnoun",
255 "rel",
256 "rom",
257 "romanji",
258 "root",
259 "sign",
260 "suff",
261 "suffix",
262 "syllable",
263 "symbol",
264 "verb",
265 "verb form",
266 "verb-form",
267 "verbal noun",
268 "verbal-noun",
269 "verbnec",
270 "vform",
271 ]
272 )
273 + r")(-|/|\+|$)"
274)
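# A few illustrative names HEAD_TAG_RE above is meant to accept (a sketch
# derived from its structure, not an exhaustive list): "head" and
# "arabic-noun" match the first alternative outright; "latin-noun" and
# "latin-verb-form" match the "<prefix>-<part of speech>" alternative.
# Language-code prefixes such as "en-" or "fi-" are matched only if
# mediawiki_langcodes returns those codes from get_all_names("en").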
276# Head-templates causing problems (like newlines) that can be squashed into
277# an empty string in the template handler while saving their template
278# data for later.
279WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
281FLOATING_TABLE_TEMPLATES: set[str] = {
282 # az-suffix-form creates a style=floatright div that is otherwise
283 # deleted; if it is not pre-expanded, we can intercept the template
284 # so we add this set into do_not_pre_expand, and intercept the
285 # templates in parse_part_of_speech
286 "az-suffix-forms",
287 "az-inf-p",
288 "kk-suffix-forms",
289 "ky-suffix-forms",
290 "tr-inf-p",
291 "tr-suffix-forms",
292 "tt-suffix-forms",
293 "uz-suffix-forms",
294}
295# These two should contain template names that should always be
296# pre-expanded when *first* processing the tree, or not pre-expanded
297# so that the templates are left in place with their identifying
298# name intact for later filtering.
300DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
301DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
303# Additional templates to be expanded in the pre-expand phase
304ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
305 "multitrans",
306 "multitrans-nowiki",
307 "trans-top",
308 "trans-top-also",
309 "trans-bottom",
310 "checktrans-top",
311 "checktrans-bottom",
312 "col",
313 "col1",
314 "col2",
315 "col3",
316 "col4",
317 "col5",
318 "col1-u",
319 "col2-u",
320 "col3-u",
321 "col4-u",
322 "col5-u",
323 "check deprecated lang param usage",
324 "deprecated code",
325 "ru-verb-alt-ё",
326 "ru-noun-alt-ё",
327 "ru-adj-alt-ё",
328 "ru-proper noun-alt-ё",
329 "ru-pos-alt-ё",
330 "ru-alt-ё",
331 "inflection of",
332 "no deprecated lang param usage",
333 "transclude", # these produce sense entries (or other lists)
334 "tcl",
335}
337# Inverse linkage for those that have them
338linkage_inverses: dict[str, str] = {
339 # XXX this is not currently used, move to post-processing
340 "synonyms": "synonyms",
341 "hypernyms": "hyponyms",
342 "hyponyms": "hypernyms",
343 "holonyms": "meronyms",
344 "meronyms": "holonyms",
345 "derived": "derived_from",
346 "coordinate_terms": "coordinate_terms",
347 "troponyms": "hypernyms",
348 "antonyms": "antonyms",
349 "instances": "instance_of",
350 "related": "related",
351}
353# Templates that are used to form panels on pages and that
354# should be ignored in various positions
355PANEL_TEMPLATES: set[str] = {
356 "Character info",
357 "CJKV",
358 "French personal pronouns",
359 "French possessive adjectives",
360 "French possessive pronouns",
361 "Han etym",
362 "Japanese demonstratives",
363 "Latn-script",
364 "LDL",
365 "MW1913Abbr",
366 "Number-encoding",
367 "Nuttall",
368 "Spanish possessive adjectives",
369 "Spanish possessive pronouns",
370 "USRegionDisputed",
371 "Webster 1913",
372 "ase-rfr",
373 "attention",
374 "attn",
375 "beer",
376 "broken ref",
377 "ca-compass",
378 "character info",
379 "character info/var",
380 "checksense",
381 "compass-fi",
382 "copyvio suspected",
383 "delete",
384 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
385 "etystub",
386 "examples",
387 "hu-corr",
388 "hu-suff-pron",
389 "interwiktionary",
390 "ja-kanjitab",
391 "ko-hanja-search",
392 "look",
393 "maintenance box",
394 "maintenance line",
395 "mediagenic terms",
396 "merge",
397 "missing template",
398 "morse links",
399 "move",
400 "multiple images",
401 "no inline",
402 "picdic",
403 "picdicimg",
404 "picdiclabel",
405 "polyominoes",
406 "predidential nomics",
407 "punctuation", # This actually gets pre-expanded
408 "reconstructed",
409 "request box",
410 "rf-sound example",
411 "rfaccents",
412 "rfap",
413 "rfaspect",
414 "rfc",
415 "rfc-auto",
416 "rfc-header",
417 "rfc-level",
418 "rfc-pron-n",
419 "rfc-sense",
420 "rfclarify",
421 "rfd",
422 "rfd-redundant",
423 "rfd-sense",
424 "rfdate",
425 "rfdatek",
426 "rfdef",
427 "rfe",
428 "rfe/dowork",
429 "rfex",
430 "rfexp",
431 "rfform",
432 "rfgender",
433 "rfi",
434 "rfinfl",
435 "rfm",
436 "rfm-sense",
437 "rfp",
438 "rfp-old",
439 "rfquote",
440 "rfquote-sense",
441 "rfquotek",
442 "rfref",
443 "rfscript",
444 "rft2",
445 "rftaxon",
446 "rftone",
447 "rftranslit",
448 "rfv",
449 "rfv-etym",
450 "rfv-pron",
451 "rfv-quote",
452 "rfv-sense",
453 "selfref",
454 "split",
455 "stroke order", # XXX consider capturing this?
456 "stub entry",
457 "t-needed",
458 "tbot entry",
459 "tea room",
460 "tea room sense",
461 # "ttbc", - XXX needed in at least on/Preposition/Translation page
462 "unblock",
463 "unsupportedpage",
464 "video frames",
465 "was wotd",
466 "wrongtitle",
467 "zh-forms",
468 "zh-hanzi-box",
469 "no entry",
470}
472# Template name prefixes used for language-specific panel templates (i.e.,
473# templates that create side boxes or notice boxes or that should generally
474# be ignored).
475PANEL_PREFIXES: set[str] = {
476 "list:compass points/",
477 "list:Gregorian calendar months/",
478 "RQ:",
479}
481# Templates used for wikipedia links.
482wikipedia_templates: set[str] = {
483 "wikipedia",
484 "slim-wikipedia",
485 "w",
486 "W",
487 "swp",
488 "wiki",
489 "Wikipedia",
490 "wtorw",
491}
492for x in PANEL_PREFIXES & wikipedia_templates:
493 print(
494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
495 x
496 )
497 )
499# Mapping from a template name (without language prefix) for the main word
500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
501# it could validly occur. This is used as just a sanity check to give
502# warnings about probably incorrect coding in Wiktionary.
503template_allowed_pos_map: dict[str, list[str]] = {
504 "abbr": ["abbrev"],
505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
506 "plural noun": ["noun", "name"],
507 "plural-noun": ["noun", "name"],
508 "proper noun": ["noun", "name"],
509 "proper-noun": ["name", "noun"],
510 "prop": ["name", "noun"],
511 "verb": ["verb", "phrase"],
512 "gerund": ["verb"],
513 "particle": ["adv", "particle"],
514 "adj": ["adj", "adj_noun"],
515 "pron": ["pron", "noun"],
516 "name": ["name", "noun"],
517 "adv": ["adv", "intj", "conj", "particle"],
518 "phrase": ["phrase", "prep_phrase"],
519 "noun phrase": ["phrase"],
520 "ordinal": ["num"],
521 "number": ["num"],
522 "pos": ["affix", "name", "num"],
523 "suffix": ["suffix", "affix"],
524 "character": ["character"],
525 "letter": ["character"],
526 "kanji": ["character"],
527 "cont": ["abbrev"],
528 "interj": ["intj"],
529 "con": ["conj"],
530 "part": ["particle"],
531 "prep": ["prep", "postp"],
532 "postp": ["postp"],
533 "misspelling": ["noun", "adj", "verb", "adv"],
534 "part-form": ["verb"],
535}
536for k, v in template_allowed_pos_map.items():
537 for x in v:
538        if x not in PARTS_OF_SPEECH:
539 print(
540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
541 "".format(x, k, v)
542 )
543 assert False
546# Templates ignored during etymology extraction, i.e., these will not be listed
547# in the extracted etymology templates.
548ignored_etymology_templates: list[str] = [
549 "...",
550 "IPAchar",
551 "ipachar",
552 "ISBN",
553 "isValidPageName",
554 "redlink category",
555 "deprecated code",
556 "check deprecated lang param usage",
557 "para",
558 "p",
559 "cite",
560 "Cite news",
561 "Cite newsgroup",
562 "cite paper",
563 "cite MLLM 1976",
564 "cite journal",
565 "cite news/documentation",
566 "cite paper/documentation",
567 "cite video game",
568 "cite video game/documentation",
569 "cite newsgroup",
570 "cite newsgroup/documentation",
571 "cite web/documentation",
572 "cite news",
573 "Cite book",
574 "Cite-book",
575 "cite book",
576 "cite web",
577 "cite-usenet",
578 "cite-video/documentation",
579 "Cite-journal",
580 "rfe",
581 "catlangname",
582 "cln",
583 "langname-lite",
584 "no deprecated lang param usage",
585 "mention",
586 "m",
587 "m-self",
588 "link",
589 "l",
590 "ll",
591 "l-self",
592]
593# Regexp for matching ignored etymology template names. This adds certain
594# prefixes to the names listed above.
595ignored_etymology_templates_re = re.compile(
596 r"^((cite-|R:|RQ:).*|"
597 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
598 + r")$"
599)
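# A sketch of what the regex above accepts, derived from its structure: any
# name listed in ignored_etymology_templates ("m", "cite web", ...), plus
# anything starting with "cite-", "R:" or "RQ:", e.g. "cite-journal" or
# "RQ:Don Quixote"; a name such as "der" or "inh" is not matched.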
601# Regexp for matching ignored descendants template names. Right now we just
602# copy the ignored etymology templates
603ignored_descendants_templates_re = ignored_etymology_templates_re
605# Set of template names that are used to define usage examples. If the usage
606# example contains one of these templates, then its type is set to
607# "example"
608usex_templates: set[str] = {
609 "afex",
610 "affixusex",
611    "co", # {{collocation}} acts like an example template, specifically for
612    # pairs of combinations of words that are more common than you'd
613    # expect them to be at random; hlavní#Czech
614 "coi",
615 "collocation",
616 "el-example",
617 "el-x",
618 "example",
619 "examples",
620 "he-usex",
621 "he-x",
622 "hi-usex",
623 "hi-x",
624 "ja-usex-inline",
625 "ja-usex",
626 "ja-x",
627 "jbo-example",
628 "jbo-x",
629 "km-usex",
630 "km-x",
631 "ko-usex",
632 "ko-x",
633 "lo-usex",
634 "lo-x",
635 "ne-x",
636 "ne-usex",
637 "prefixusex",
638 "ryu-usex",
639 "ryu-x",
640 "shn-usex",
641 "shn-x",
642 "suffixusex",
643 "th-usex",
644 "th-x",
645 "ur-usex",
646 "ur-x",
647 "usex",
648 "usex-suffix",
649 "ux",
650 "uxi",
651}
653stop_head_at_these_templates: set[str] = {
654 "category",
655 "cat",
656 "topics",
657 "catlangname",
658 "c",
659 "C",
660 "top",
661 "cln",
662}
664# Set of template names that are used to define quotation examples. If the
665# usage example contains one of these templates, then its type is set to
666# "quotation".
667quotation_templates: set[str] = {
668 "collapse-quote",
669 "quote-av",
670 "quote-book",
671 "quote-GYLD",
672 "quote-hansard",
673 "quotei",
674 "quote-journal",
675 "quotelite",
676 "quote-mailing list",
677 "quote-meta",
678 "quote-newsgroup",
679 "quote-song",
680 "quote-text",
681 "quote",
682 "quote-us-patent",
683 "quote-video game",
684 "quote-web",
685 "quote-wikipedia",
686 "wikiquote",
687 "Wikiquote",
688}
690taxonomy_templates = {
691    # argument 1 should be the taxonomic name, e.g. "Lupus lupus"
692 "taxfmt",
693 "taxlink",
694 "taxlink2",
695 "taxlinknew",
696 "taxlook",
697}
699# Template names; this was extracted from template_linkage_mappings,
700# because the code using template_linkage_mappings was actually not used
701# (but not removed).
702template_linkages_to_ignore_in_examples: set[str] = {
703 "syn",
704 "synonyms",
705 "ant",
706 "antonyms",
707 "hyp",
708 "hyponyms",
709 "der",
710 "derived terms",
711 "coordinate terms",
712 "cot",
713 "rel",
714 "col",
715 "inline alt forms",
716 "alti",
717 "comeronyms",
718 "holonyms",
719 "holo",
720 "hypernyms",
721 "hyper",
722 "meronyms",
723 "mero",
724 "troponyms",
725 "perfectives",
726 "pf",
727 "imperfectives",
728 "impf",
729 "syndiff",
730 "synsee",
731 # not linkage nor example templates
732 "sense",
733 "s",
734 "color panel",
735 "colour panel",
736}
738# Maps template name used in a word sense to a linkage field that it adds.
739sense_linkage_templates: dict[str, str] = {
740 "syn": "synonyms",
741 "synonyms": "synonyms",
742 "synsee": "synonyms",
743 "syndiff": "synonyms",
744 "hyp": "hyponyms",
745 "hyponyms": "hyponyms",
746 "ant": "antonyms",
747 "antonyms": "antonyms",
748 "alti": "related",
749 "inline alt forms": "related",
750 "coordinate terms": "coordinate_terms",
751 "cot": "coordinate_terms",
752 "comeronyms": "related",
753 "holonyms": "holonyms",
754 "holo": "holonyms",
755 "hypernyms": "hypernyms",
756 "hyper": "hypernyms",
757 "meronyms": "meronyms",
758 "mero": "meronyms",
759 "troponyms": "troponyms",
760 "perfectives": "related",
761 "pf": "related",
762 "imperfectives": "related",
763 "impf": "related",
764}
766sense_linkage_templates_tags: dict[str, list[str]] = {
767 "alti": ["alternative"],
768 "inline alt forms": ["alternative"],
769 "comeronyms": ["comeronym"],
770 "perfectives": ["perfective"],
771 "pf": ["perfective"],
772 "imperfectives": ["imperfective"],
773 "impf": ["imperfective"],
774}
777def decode_html_entities(v: Union[str, int]) -> str:
778 """Decodes HTML entities from a value, converting them to the respective
779 Unicode characters/strings."""
780 if isinstance(v, int):
781 # I changed this to return str(v) instead of v = str(v),
782 # but there might have been the intention to have more logic
783 # here. html.unescape would not do anything special with an integer,
784 # it needs html escape symbols (&xx;).
785 return str(v)
786 return html.unescape(v)
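# Illustrative behavior of decode_html_entities above (a doctest-style
# sketch, not executed here):
#   decode_html_entities("&amp;")   -> "&"
#   decode_html_entities("&#243;")  -> "ó"
#   decode_html_entities(42)        -> "42"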
789def parse_sense_linkage(
790 wxr: WiktextractContext,
791 data: SenseData,
792 name: str,
793 ht: TemplateArgs,
794 pos: str,
795) -> None:
796 """Parses a linkage (synonym, etc) specified in a word sense."""
797 assert isinstance(wxr, WiktextractContext)
798 assert isinstance(data, dict)
799 assert isinstance(name, str)
800 assert isinstance(ht, dict)
801 field = sense_linkage_templates[name]
802 field_tags = sense_linkage_templates_tags.get(name, [])
803 for i in range(2, 20):
804 w = ht.get(i) or ""
805 w = clean_node(wxr, data, w)
806 is_thesaurus = False
807 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
808            if w.startswith(alias):
809 is_thesaurus = True
810 w = w[len(alias) :]
811 if w != wxr.wtp.title:
812 from ...thesaurus import search_thesaurus
814 lang_code = clean_node(wxr, None, ht.get(1, ""))
815 for t_data in search_thesaurus(
816 wxr.thesaurus_db_conn, w, lang_code, pos, field
817 ):
818 l_data = {
819 "word": t_data.term,
820 "source": "Thesaurus:" + w,
821 }
822 if len(t_data.tags) > 0:
823 l_data["tags"] = t_data.tags
824 if len(t_data.raw_tags) > 0:
825 l_data["raw_tags"] = t_data.raw_tags
826 data_append(data, field, l_data)
827 break
828 if not w:
829 break
830        if is_thesaurus:
831 continue
832 tags: list[str] = []
833 topics: list[str] = []
834 english: Optional[str] = None
835 # Try to find qualifiers for this synonym
836 q = ht.get("q{}".format(i - 1))
837 if q:
838 cls = classify_desc(q)
839 if cls == "tags":
840 tagsets1, topics1 = decode_tags(q)
841 for ts in tagsets1:
842 tags.extend(ts)
843 topics.extend(topics1)
844            elif cls == "english":
845                if english:
846 english += "; " + q
847 else:
848 english = q
849 # Try to find English translation for this synonym
850 t = ht.get("t{}".format(i - 1))
851        if t:
852 if english:
853 english += "; " + t
854 else:
855 english = t
857 # See if the linkage contains a parenthesized alt
858 alt = None
859 m = re.search(r"\(([^)]+)\)$", w)
860        if m:
861 w = w[: m.start()].strip()
862 alt = m.group(1)
864 dt = {"word": w}
865        if field_tags:
866 data_extend(dt, "tags", field_tags)
867 if tags:
868 data_extend(dt, "tags", tags)
869        if topics:
870 data_extend(dt, "topics", topics)
871 if english:
872 dt["english"] = english # DEPRECATED for "translation"
873 dt["translation"] = english
874        if alt:
875 dt["alt"] = alt
876 data_append(data, field, dt)
879EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
880example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
881captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
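# For instance, a rendered example line like "kissa istuu ― the cat is sitting"
# is split by example_splitter_re above into ["kissa istuu", "the cat is sitting"],
# while captured_splitters_re keeps the separator itself as an element:
# ["kissa istuu", " ― ", "the cat is sitting"].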
884def synch_splits_with_args(
885 line: str, targs: TemplateArgs
886) -> Optional[list[str]]:
887 """If it looks like there's something weird with how a line of example
888 text has been split, this function will do the splitting after counting
889    occurrences of the splitting regex inside the two main template arguments
890 containing the string data for the original language example and the
891 English translations.
892 """
893 # Previously, we split without capturing groups, but here we want to
894 # keep the original splitting hyphen regex intact.
895 fparts = captured_splitters_re.split(line)
896 new_parts = []
897 # ["First", " – ", "second", " – ", "third..."] from OL argument
898 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
899 new_parts.append("".join(fparts[:first]))
900 # Translation argument
901 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
902 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
903 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
904 new_parts.append("".join(fparts[first + 1 : second]))
906 if all(new_parts): # no empty strings from the above spaghetti
907 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
908 return new_parts
909 else:
910 return None
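# A sketch of the intent of synch_splits_with_args above, with hypothetical
# argument values: when the original-language argument (targs[2]) itself
# contains a "―", that inner dash is kept inside the first part instead of
# being treated as the text/translation divider:
#   line  = "a ― b ― c"
#   targs = {2: "a ― b", 3: "c"}         # hypothetical template arguments
#   synch_splits_with_args(line, targs)  ->  ["a ― b", "c"]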
913QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
914QUALIFIERS_RE = re.compile(QUALIFIERS)
915# (...): ... or (...(...)...): ...
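# For example, QUALIFIERS_RE.match("(informal, dated): the rest") captures
# "informal, dated" in group 1; one level of nested parentheses inside the
# qualifier, as in "(lit. (rare)) gloss", is also tolerated by the regex.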
918def parse_language(
919 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
920) -> list[WordData]:
921 """Iterates over the text of the page, returning words (parts-of-speech)
922 defined on the page one at a time. (Individual word senses for the
923 same part-of-speech are typically encoded in the same entry.)"""
924 # imported here to avoid circular import
925 from .pronunciation import parse_pronunciation
927 assert isinstance(wxr, WiktextractContext)
928 assert isinstance(langnode, WikiNode)
929 assert isinstance(language, str)
930 assert isinstance(lang_code, str)
931 # print("parse_language", language)
933 is_reconstruction = False
934 word: str = wxr.wtp.title # type: ignore[assignment]
935 unsupported_prefix = "Unsupported titles/"
936 if word.startswith(unsupported_prefix):
937 w = word[len(unsupported_prefix) :]
938        if w in unsupported_title_map:
939 word = unsupported_title_map[w]
940 else:
941 wxr.wtp.error(
942 "Unimplemented unsupported title: {}".format(word),
943 sortid="page/870",
944 )
945 word = w
946 elif word.startswith("Reconstruction:"):
947 word = word[word.find("/") + 1 :]
948 is_reconstruction = True
950 base_data: WordData = {
951 "word": word,
952 "lang": language,
953 "lang_code": lang_code,
954 }
955 if is_reconstruction:
956 data_append(base_data, "tags", "reconstruction")
957 sense_data: SenseData = {}
958 pos_data: WordData = {} # For a current part-of-speech
959 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
960 etym_data: WordData = {} # For one etymology
961 sense_datas: list[SenseData] = []
962 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
963 # Never reset, do not use as data
964 level_four_datas: list[WordData] = []
965 etym_datas: list[WordData] = []
966 page_datas: list[WordData] = []
967 have_etym = False
968 inside_level_four = False # This is for checking if the etymology section
969 # or article has a Pronunciation section, for Chinese mostly; because
970 # Chinese articles can have three level three sections (two etymology
971 # sections and pronunciation sections) one after another, we need a kludge
972 # to better keep track of whether we're in a normal "etym" or inside a
973 # "level four" (which is what we've turned the level three Pron sections
974    # into in fix_subtitle_hierarchy(); all other sections are demoted by
975    # a step).
976 stack: list[str] = [] # names of items on the "stack"
978 def merge_base(data: WordData, base: WordData) -> None:
979 for k, v in base.items():
980 # Copy the value to ensure that we don't share lists or
981 # dicts between structures (even nested ones).
982 v = copy.deepcopy(v)
983 if k not in data:
984 # The list was copied above, so this will not create shared ref
985 data[k] = v # type: ignore[literal-required]
986 continue
987 if data[k] == v: # type: ignore[literal-required]
988 continue
989            if (
990 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
991 or isinstance(
992 v,
993 (list, tuple), # Should this be "and"?
994 )
995 ):
996 data[k] = list(data[k]) + list(v) # type: ignore
997 elif data[k] != v: # type: ignore[literal-required]
998 wxr.wtp.warning(
999 "conflicting values for {} in merge_base: "
1000 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1001 sortid="page/904",
1002 )
1004 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1005 """Remove unnecessary keys from dict values
1006 in a list comprehension..."""
1007 if key in pron:
1008 pron.pop(key) # type: ignore
1009 return pron
1011 # If the result has sounds, eliminate sounds that have a prefix that
1012 # does not match "word" or one of "forms"
1013 if "sounds" in data and "word" in data:
1014 accepted = [data["word"]]
1015 accepted.extend(f["form"] for f in data.get("forms", dict()))
1016 data["sounds"] = list(
1017 s
1018 for s in data["sounds"]
1019 if "form" not in s or s["form"] in accepted
1020 )
1021 # If the result has sounds, eliminate sounds that have a pos that
1022 # does not match "pos"
1023 if "sounds" in data and "pos" in data:
1024 data["sounds"] = list(
1025 complementary_pop(s, "pos")
1026 for s in data["sounds"]
1027 # "pos" is not a field of SoundData, correctly, so we're
1028 # removing it here. It's a kludge on a kludge on a kludge.
1029 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1030 )
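    # A small sketch of the key-merging above, with hypothetical values:
    #   data = {"word": "talo", "tags": ["a"]}
    #   base = {"lang": "Finnish", "tags": ["b"]}
    # After merge_base(data, base), missing keys are deep-copied over and
    # list-valued keys are concatenated:
    #   {"word": "talo", "tags": ["a", "b"], "lang": "Finnish"}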
1032 def push_sense(sorting_ordinal: int | None = None) -> bool:
1033 """Starts collecting data for a new word sense. This returns True
1034 if a sense was added."""
1035 nonlocal sense_data
1036 if sorting_ordinal is None:
1037 sorting_ordinal = sense_ordinal
1038 tags = sense_data.get("tags", ())
1039 if (
1040 not sense_data.get("glosses")
1041 and "translation-hub" not in tags
1042 and "no-gloss" not in tags
1043 ):
1044 return False
1046        if (
1047 (
1048 "participle" in sense_data.get("tags", ())
1049 or "infinitive" in sense_data.get("tags", ())
1050 )
1051 and "alt_of" not in sense_data
1052 and "form_of" not in sense_data
1053 and "etymology_text" in etym_data
1054 and etym_data["etymology_text"] != ""
1055 ):
1056 etym = etym_data["etymology_text"]
1057 etym = etym.split(". ")[0]
1058 ret = parse_alt_or_inflection_of(wxr, etym, set())
1059 if ret is not None:
1060 tags, lst = ret
1061 assert isinstance(lst, (list, tuple))
1062 if "form-of" in tags:
1063 data_extend(sense_data, "form_of", lst)
1064 data_extend(sense_data, "tags", tags)
1065 elif "alt-of" in tags:
1066 data_extend(sense_data, "alt_of", lst)
1067 data_extend(sense_data, "tags", tags)
1069        if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
1070 "tags", ()
1071 ):
1072 data_append(sense_data, "tags", "no-gloss")
1074 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal
1075 sense_datas.append(sense_data)
1076 sense_data = {}
1077 return True
1079 def push_pos(sorting_ordinal: int | None = None) -> None:
1080 """Starts collecting data for a new part-of-speech."""
1081 nonlocal pos_data
1082 nonlocal sense_datas
1083 push_sense(sorting_ordinal)
1084 if wxr.wtp.subsection:
1085 data: WordData = {"senses": sense_datas}
1086 merge_base(data, pos_data)
1087 level_four_datas.append(data)
1088 pos_data = {}
1089 sense_datas = []
1090 wxr.wtp.start_subsection(None)
1092 def push_level_four_section(clear_sound_data: bool) -> None:
1093        """Starts collecting data for a new level four section, which
1094 is usually virtual and empty, unless the article has Chinese
1095 'Pronunciation' sections that are etymology-section-like but
1096 under etymology, and at the same level in the source. We modify
1097 the source to demote Pronunciation sections like that to level
1098 4, and other sections one step lower."""
1099 nonlocal level_four_data
1100 nonlocal level_four_datas
1101 nonlocal etym_datas
1102 push_pos()
1103 # print(f"======\n{etym_data=}")
1104 # print(f"======\n{etym_datas=}")
1105 # print(f"======\n{level_four_data=}")
1106 # print(f"======\n{level_four_datas=}")
1107 for data in level_four_datas:
1108 merge_base(data, level_four_data)
1109 etym_datas.append(data)
1110 for data in etym_datas:
1111 merge_base(data, etym_data)
1112 page_datas.append(data)
1113 if clear_sound_data:
1114 level_four_data = {}
1115 level_four_datas = []
1116 etym_datas = []
1118 def push_etym() -> None:
1119 """Starts collecting data for a new etymology."""
1120 nonlocal etym_data
1121 nonlocal etym_datas
1122 nonlocal have_etym
1123 nonlocal inside_level_four
1124 have_etym = True
1125 push_level_four_section(False)
1126 inside_level_four = False
1127        # the etymology section could be under a pronunciation section
1128 etym_data = (
1129 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1130 )
1132 def select_data() -> WordData:
1133 """Selects where to store data (pos or etym) based on whether we
1134 are inside a pos (part-of-speech)."""
1135 # print(f"{wxr.wtp.subsection=}")
1136 # print(f"{stack=}")
1137 if wxr.wtp.subsection is not None:
1138 return pos_data
1139 if inside_level_four:
1140 return level_four_data
1141 if stack[-1] == language:
1142 return base_data
1143 return etym_data
1145 term_label_templates: list[TemplateData] = []
1147 def head_post_template_fn(
1148 name: str, ht: TemplateArgs, expansion: str
1149 ) -> Optional[str]:
1150 """Handles special templates in the head section of a word. Head
1151 section is the text after part-of-speech subtitle and before word
1152 sense list. Typically it generates the bold line for the word, but
1153 may also contain other useful information that often ends in
1154 side boxes. We want to capture some of that additional information."""
1155 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1156        if is_panel_template(wxr, name):
1157 # Completely ignore these templates (not even recorded in
1158 # head_templates)
1159 return ""
1160 if name == "head":
1161 # XXX are these also captured in forms? Should this special case
1162 # be removed?
1163 t = ht.get(2, "")
1164            if t == "pinyin":
1165 data_append(pos_data, "tags", "Pinyin")
1166            elif t == "romanization":
1167 data_append(pos_data, "tags", "romanization")
1168 if (
1169 HEAD_TAG_RE.search(name) is not None
1170 or name in WORD_LEVEL_HEAD_TEMPLATES
1171 ):
1172 args_ht = clean_template_args(wxr, ht)
1173 cleaned_expansion = clean_node(wxr, None, expansion)
1174 dt: TemplateData = {
1175 "name": name,
1176 "args": args_ht,
1177 "expansion": cleaned_expansion,
1178 }
1179 data_append(pos_data, "head_templates", dt)
1180 if name in WORD_LEVEL_HEAD_TEMPLATES:
1181 term_label_templates.append(dt)
1182 # Squash these, their tags are applied to the whole word,
1183 # and some cause problems like "term-label"
1184 return ""
1186 # The following are both captured in head_templates and parsed
1187 # separately
1189 if name in wikipedia_templates:
1190 # Note: various places expect to have content from wikipedia
1191 # templates, so cannot convert this to empty
1192 parse_wikipedia_template(wxr, pos_data, ht)
1193 return None
1195        if name == "number box":
1196 # XXX extract numeric value?
1197 return ""
1198 if name == "enum":
1199 # XXX extract?
1200 return ""
1201        if name == "cardinalbox":
1202 # XXX extract similar to enum?
1203 # XXX this can also occur in top-level under language
1204 return ""
1205        if name == "Han simplified forms":
1206 # XXX extract?
1207 return ""
1208 # if name == "ja-kanji forms":
1209 # # XXX extract?
1210 # return ""
1211 # if name == "vi-readings":
1212 # # XXX extract?
1213 # return ""
1214 # if name == "ja-kanji":
1215 # # XXX extract?
1216 # return ""
1217        if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
1218 # XXX extract?
1219 return ""
1221 return None
1223 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1224 """Parses the subsection for a part-of-speech under a language on
1225 a page."""
1226 assert isinstance(posnode, WikiNode)
1227 assert isinstance(pos, str)
1228 # print("parse_part_of_speech", pos)
1229 pos_data["pos"] = pos
1230 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1231 lists: list[list[WikiNode]] = [[]] # list of lists
1232 first_para = True
1233 first_head_tmplt = True
1234 collecting_head = True
1235 start_of_paragraph = True
1237 # XXX extract templates from posnode with recursively_extract
1238 # that break stuff, like ja-kanji or az-suffix-form.
1239 # Do the extraction with a list of template names, combined from
1240 # different lists, then separate out them into different lists
1241 # that are handled at different points of the POS section.
1242 # First, extract az-suffix-form, put it in `inflection`,
1243 # and parse `inflection`'s content when appropriate later.
1244 # The contents of az-suffix-form (and ja-kanji) that generate
1245 # divs with "floatright" in their style gets deleted by
1246 # clean_value, so templates that slip through from here won't
1247 # break anything.
1248 # XXX bookmark
1249 # print("===================")
1250 # print(posnode.children)
1252 floaters, poschildren = recursively_extract(
1253 posnode.children,
1254 lambda x: (
1255 isinstance(x, WikiNode)
1256 and (
1257 (
1258 x.kind == NodeKind.TEMPLATE
1259 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
1260 )
1261 or (
1262 x.kind == NodeKind.LINK
1263 # Need to check for stringiness because some links are
1264 # broken; for example, if a template is missing an
1265 # argument, a link might look like `[[{{{1}}}...]]`
1266 and isinstance(x.largs[0][0], str)
1267 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1268 )
1269 )
1270 ),
1271 )
1272 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1273 tempnode.largs = [["Inflection"]]
1274 tempnode.children = floaters
1275 parse_inflection(tempnode, "Floating Div", pos)
1276 # print(poschildren)
1277 # XXX new above
1279        if not poschildren:
1280 if not floaters:
1281 wxr.wtp.debug(
1282 "PoS section without contents",
1283 sortid="en/page/1051/20230612",
1284 )
1285 else:
1286 wxr.wtp.debug(
1287 "PoS section without contents except for a floating table",
1288 sortid="en/page/1056/20230612",
1289 )
1290 return
1292 for node in poschildren:
1293 if isinstance(node, str):
1294 for m in re.finditer(r"\n+|[^\n]+", node):
1295 p = m.group(0)
1296 if p.startswith("\n\n") and pre:
1297 first_para = False
1298 start_of_paragraph = True
1299 break
1300 if p and collecting_head:
1301 pre[-1].append(p)
1302 continue
1303 assert isinstance(node, WikiNode)
1304 kind = node.kind
1305 if kind == NodeKind.LIST:
1306 lists[-1].append(node)
1307 collecting_head = False
1308 start_of_paragraph = True
1309 continue
1310 elif kind in LEVEL_KINDS:
1311 # Stop parsing section if encountering any kind of
1312 # level header (like ===Noun=== or ====Further Reading====).
1313 # At a quick glance, this should be the default behavior,
1314 # but if some kinds of source articles have sub-sub-sections
1315 # that should be parsed XXX it should be handled by changing
1316 # this break.
1317 break
1318 elif collecting_head and kind == NodeKind.LINK:
1319 # We might collect relevant links as they are often pictures
1320 # relating to the word
1321                if len(node.largs[0]) >= 1 and isinstance(
1322 node.largs[0][0], str
1323 ):
1324                    if node.largs[0][0].startswith(
1325 ns_title_prefix_tuple(wxr, "Category")
1326 ):
1327 # [[Category:...]]
1328 # We're at the end of the file, probably, so stop
1329 # here. Otherwise the head will get garbage.
1330 break
1331                    if node.largs[0][0].startswith(
1332 ns_title_prefix_tuple(wxr, "File")
1333 ):
1334 # Skips file links
1335 continue
1336 start_of_paragraph = False
1337 pre[-1].extend(node.largs[-1])
1338 elif kind == NodeKind.HTML:
1339 if node.sarg == "br":
1340                    if pre[-1]:
1341 pre.append([]) # Switch to next head
1342 lists.append([]) # Lists parallels pre
1343 collecting_head = True
1344 start_of_paragraph = True
1345                elif collecting_head and node.sarg not in (
1346 "gallery",
1347 "ref",
1348 "cite",
1349 "caption",
1350 ):
1351 start_of_paragraph = False
1352 pre[-1].append(node)
1353 else:
1354 start_of_paragraph = False
1355 elif isinstance(node, TemplateNode):
1356 # XXX Insert code here that disambiguates between
1357 # templates that generate word heads and templates
1358 # that don't.
1359 # There's head_tag_re that seems like a regex meant
1360 # to identify head templates. Too bad it's None.
1362 # ignore {{category}}, {{cat}}... etc.
1363 if node.template_name in stop_head_at_these_templates:
1364 # we've reached a template that should be at the end,
1365 continue
1367 # skip these templates; panel_templates is already used
1368 # to skip certain templates else, but it also applies to
1369 # head parsing quite well.
1370 # node.largs[0][0] should always be str, but can't type-check
1371 # that.
1372 if is_panel_template(wxr, node.template_name):
1373 continue
1374 # skip these templates
1375 # if node.largs[0][0] in skip_these_templates_in_head:
1376 # first_head_tmplt = False # no first_head_tmplt at all
1377 # start_of_paragraph = False
1378 # continue
1380 if first_head_tmplt and pre[-1]:
1381 first_head_tmplt = False
1382 start_of_paragraph = False
1383 pre[-1].append(node)
1384 elif pre[-1] and start_of_paragraph:
1385 pre.append([]) # Switch to the next head
1386 lists.append([]) # lists parallel pre
1387 collecting_head = True
1388 start_of_paragraph = False
1389 pre[-1].append(node)
1390 else:
1391 pre[-1].append(node)
1392 elif first_para:
1393 start_of_paragraph = False
1394                if collecting_head:
1395 pre[-1].append(node)
1396 # XXX use template_fn in clean_node to check that the head macro
1397 # is compatible with the current part-of-speech and generate warning
1398 # if not. Use template_allowed_pos_map.
1400 # Clean up empty pairs, and fix messes with extra newlines that
1401 # separate templates that are followed by lists wiktextract issue #314
1403 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1404 cleaned_lists: list[list[WikiNode]] = []
1405 pairless_pre_index = None
1407 for pre1, ls in zip(pre, lists):
1408 if pre1 and not ls:
1409 pairless_pre_index = len(cleaned_pre)
1410            if not pre1 and not ls:
1411 # skip [] + []
1412 continue
1413 if not ls and all(
1414 (isinstance(x, str) and not x.strip()) for x in pre1
1415 ):
1416 # skip ["\n", " "] + []
1417 continue
1418 if ls and not pre1:
1419                if pairless_pre_index is not None:
1420 cleaned_lists[pairless_pre_index] = ls
1421 pairless_pre_index = None
1422 continue
1423 cleaned_pre.append(pre1)
1424 cleaned_lists.append(ls)
1426 pre = cleaned_pre
1427 lists = cleaned_lists
1429 there_are_many_heads = len(pre) > 1
1430 header_tags: list[str] = []
1431 header_topics: list[str] = []
1432 previous_head_had_list = False
1434 if not any(g for g in lists):
1435 process_gloss_without_list(
1436 poschildren, pos, pos_data, header_tags, header_topics
1437 )
1438 else:
1439 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1440 # if len(ls) == 0:
1441 # # don't have gloss list
1442 # # XXX add code here to filter out 'garbage', like text
1443 # # that isn't a head template or head.
1444 # continue
1446 if all(not sl for sl in lists[i:]):
1447                    if i == 0:
1448 if isinstance(node, str):
1449 wxr.wtp.debug(
1450 "first head without list of senses,"
1451 "string: '{}[...]', {}/{}".format(
1452 node[:20], word, language
1453 ),
1454 sortid="page/1689/20221215",
1455 )
1456 if isinstance(node, WikiNode):
1457 if node.largs and node.largs[0][0] in [
1458 "Han char",
1459 ]:
1460 # just ignore these templates
1461 pass
1462 else:
1463 wxr.wtp.debug(
1464 "first head without "
1465 "list of senses, "
1466 "template node "
1467 "{}, {}/{}".format(
1468 node.largs, word, language
1469 ),
1470 sortid="page/1694/20221215",
1471 )
1472 else:
1473 wxr.wtp.debug(
1474 "first head without list of senses, "
1475 "{}/{}".format(word, language),
1476 sortid="page/1700/20221215",
1477 )
1478 # no break here so that the first head always
1479 # gets processed.
1480 else:
1481                        if isinstance(node, str):
1482 wxr.wtp.debug(
1483 "later head without list of senses,"
1484 "string: '{}[...]', {}/{}".format(
1485 node[:20], word, language
1486 ),
1487 sortid="page/1708/20221215",
1488 )
1489                        if isinstance(node, WikiNode):
1490 wxr.wtp.debug(
1491 "later head without list of senses,"
1492 "template node "
1493 "{}, {}/{}".format(
1494 node.sarg if node.sarg else node.largs,
1495 word,
1496 language,
1497 ),
1498 sortid="page/1713/20221215",
1499 )
1500 else:
1501 wxr.wtp.debug(
1502 "later head without list of senses, "
1503 "{}/{}".format(word, language),
1504 sortid="page/1719/20221215",
1505 )
1506 break
1507 head_group = i + 1 if there_are_many_heads else None
1508 # print("parse_part_of_speech: {}: {}: pre={}"
1509 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1511 if previous_head_had_list:
1512 # We use a boolean flag here because we want to be able
1513                    # to let the header_tags data pass through after the loop
1514 # is over without accidentally emptying it, if there are
1515 # no pos_datas and we need a dummy data.
1516 header_tags.clear()
1517 header_topics.clear()
1519 process_gloss_header(
1520 pre1, pos, head_group, pos_data, header_tags, header_topics
1521 )
1522 for ln in ls:
1523 # Parse each list associated with this head.
1524 for node in ln.children:
1525 # Parse nodes in l.children recursively.
1526 # The recursion function uses push_sense() to
1527 # add stuff into sense_datas, and returns True or
1528 # False if something is added, which bubbles upward.
1529 # If the bubble is "True", then higher levels of
1530 # the recursion will not push_sense(), because
1531 # the data is already pushed into a sub-gloss
1532 # downstream, unless the higher level has examples
1533 # that need to be put somewhere.
1534 common_data: SenseData = {
1535 "tags": list(header_tags),
1536 "topics": list(header_topics),
1537 }
1538 if head_group:
1539 common_data["head_nr"] = head_group
1540 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1542 if len(ls) > 0:
1543 previous_head_had_list = True
1544 else:
1545 previous_head_had_list = False
1547 # If there are no senses extracted, add a dummy sense. We want to
1548 # keep tags extracted from the head for the dummy sense.
1549 push_sense() # Make sure unfinished data pushed, and start clean sense
1550 if len(sense_datas) == 0:
1551 data_extend(sense_data, "tags", header_tags)
1552 data_extend(sense_data, "topics", header_topics)
1553 data_append(sense_data, "tags", "no-gloss")
1554 push_sense()
1556 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))
1558 for sd in sense_datas:
1559            if "__temp_sense_sorting_ordinal" in sd:
1560 del sd["__temp_sense_sorting_ordinal"]
1562 def process_gloss_header(
1563 header_nodes: list[Union[WikiNode, str]],
1564 pos_type: str,
1565 header_group: Optional[int],
1566 pos_data: WordData,
1567 header_tags: list[str],
1568 header_topics: list[str],
1569 ) -> None:
1570 ruby = []
1571 links: list[str] = []
1573 # process template parse nodes here
1574 new_nodes = []
1575 info_template_data = []
1576 for node in header_nodes:
1577 # print(f"{node=}")
1578 info_data, info_out = parse_info_template_node(wxr, node, "head")
1579 if info_data or info_out:
1580                if info_data:
1581 info_template_data.append(info_data)
1582                if info_out: # including just the original node
1583 new_nodes.append(info_out)
1584 else:
1585 new_nodes.append(node)
1586 header_nodes = new_nodes
1588 if info_template_data:
1589            if "info_templates" not in pos_data:
1590 pos_data["info_templates"] = info_template_data
1591 else:
1592 pos_data["info_templates"].extend(info_template_data)
1594 if not word.isalnum():
1595 # `-` is kosher, add more of these if needed.
1596 if word.replace("-", "").isalnum():
1597 pass
1598 else:
1599 # if the word contains non-letter or -number characters, it
1600 # might have something that messes with split-at-semi-comma; we
1601 # collect links so that we can skip splitting them.
1602 exp = wxr.wtp.parse(
1603 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1604 )
1605 link_nodes, _ = recursively_extract(
1606 exp.children,
1607 lambda x: isinstance(x, WikiNode)
1608 and x.kind == NodeKind.LINK,
1609 )
1610 for ln in link_nodes:
1611 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1612 if not ltext.isalnum():
1613 links.append(ltext)
1614                if word not in links:
1615 links.append(word)
1617 if lang_code == "ja":
1618 exp = wxr.wtp.parse(
1619 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1620 )
1621 rub, _ = recursively_extract(
1622 exp.children,
1623 lambda x: isinstance(x, WikiNode)
1624 and x.kind == NodeKind.HTML
1625 and x.sarg == "ruby",
1626 )
1627            if rub is not None:
1628 for r in rub:
1629 if TYPE_CHECKING:
1630 # we know the lambda above in recursively_extract
1631 # returns only WikiNodes in rub
1632 assert isinstance(r, WikiNode)
1633 rt = parse_ruby(wxr, r)
1634 if rt is not None:
1635 ruby.append(rt)
1636 elif lang_code == "vi":
1637            # Handle vi-readings templates that have a weird structure for
1638            # Chu Nom Vietnamese character heads
1639 # https://en.wiktionary.org/wiki/Template:vi-readings
1640 new_header_nodes = []
1641 related_readings: list[LinkageData] = []
1642 for node in header_nodes:
1643                if (
1644 isinstance(node, TemplateNode)
1645 and node.template_name == "vi-readings"
1646 ):
1647                    # print(node.template_parameters)  # disabled debug output
1648 for parameter, tag in (
1649 ("hanviet", "han-viet-reading"),
1650 ("nom", "nom-reading"),
1651 # we ignore the fanqie parameter "phienthiet"
1652 ):
1653 arg = node.template_parameters.get(parameter)
1654                        if arg is not None:
1655 text = clean_node(wxr, None, arg)
1656 for w in text.split(","):
1657 # ignore - separated references
1658 if "-" in w:
1659 w = w[: w.index("-")]
1660 w = w.strip()
1661 related_readings.append(
1662 LinkageData(word=w, tags=[tag])
1663 )
1664 continue
1666 # Skip the vi-reading template for the rest of the head parsing
1667 new_header_nodes.append(node)
1668            if len(related_readings) > 0:
1669 data_extend(pos_data, "related", related_readings)
1670 header_nodes = new_header_nodes
1672 header_text = clean_node(
1673 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1674 )
1676 if not header_text.strip():
1677 return
1679 term_label_tags: list[str] = []
1680 term_label_topics: list[str] = []
1681 if len(term_label_templates) > 0:
1682 # parse term label templates; if there are other similar kinds
1683 # of templates in headers that you want to squash and apply as
1684 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1685 for templ_data in term_label_templates:
1686 # print(templ_data)
1687 expan = templ_data.get("expansion", "").strip("().,; ")
1688                if not expan:
1689 continue
1690 tlb_tagsets, tlb_topics = decode_tags(expan)
1691 for tlb_tags in tlb_tagsets:
1692 if len(tlb_tags) > 0 and not any(
1693 t.startswith("error-") for t in tlb_tags
1694 ):
1695 term_label_tags.extend(tlb_tags)
1696 term_label_topics.extend(tlb_topics)
1697 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1699 header_text = re.sub(r"\s+", " ", header_text)
1700 # print(f"{header_text=}")
1701 parse_word_head(
1702 wxr,
1703 pos_type,
1704 header_text,
1705 pos_data,
1706 is_reconstruction,
1707 header_group,
1708 ruby=ruby,
1709 links=links,
1710 )
1711 if "tags" in pos_data:
1712 # pos_data can get "tags" data from some source; type-checkers
1713 # doesn't like it, so let's ignore it.
1714 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1715 del pos_data["tags"] # type: ignore[typeddict-item]
1716 if len(term_label_tags) > 0:
1717 header_tags.extend(term_label_tags)
1718 if len(term_label_topics) > 0:
1719 header_topics.extend(term_label_topics)
1721 def process_gloss_without_list(
1722 nodes: list[Union[WikiNode, str]],
1723 pos_type: str,
1724 pos_data: WordData,
1725 header_tags: list[str],
1726 header_topics: list[str],
1727 ) -> None:
1728        # gloss text might not be inside a list
1729 header_nodes: list[Union[str, WikiNode]] = []
1730 gloss_nodes: list[Union[str, WikiNode]] = []
1731 for node in strip_nodes(nodes):
1732 if isinstance(node, WikiNode):
1733 if isinstance(node, TemplateNode):
1734 if node.template_name in (
1735 "zh-see",
1736 "ja-see",
1737 "ja-see-kango",
1738 ):
1739 continue # soft redirect
1740 elif (
1741 node.template_name == "head"
1742 or node.template_name.startswith(f"{lang_code}-")
1743 ):
1744 header_nodes.append(node)
1745 continue
1746                elif node.kind in LEVEL_KINDS: # following nodes are not gloss
1747 break
1748 gloss_nodes.append(node)
1750 if len(header_nodes) > 0:
1751 process_gloss_header(
1752 header_nodes,
1753 pos_type,
1754 None,
1755 pos_data,
1756 header_tags,
1757 header_topics,
1758 )
1759 if len(gloss_nodes) > 0:
1760 process_gloss_contents(
1761 gloss_nodes,
1762 pos_type,
1763 {"tags": list(header_tags), "topics": list(header_topics)},
1764 )
1766 def parse_sense_node(
1767 node: Union[str, WikiNode], # never receives str
1768 sense_base: SenseData,
1769 pos: str,
1770 ) -> bool:
1771 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1772 Uses push_sense() to attempt adding data to pos_data in the scope
1773 of parse_language() when it reaches deep in the recursion. push_sense()
1774 returns True if it succeeds, and that is bubbled up the stack; if
1775 a sense was added downstream, the higher levels (whose shared data
1776 was already added by a subsense) do not push_sense(), unless it
1777 has examples that need to be put somewhere.
1778 """
1779 assert isinstance(sense_base, dict) # Added to every sense deeper in
1781 nonlocal sense_ordinal
1782 my_ordinal = sense_ordinal # copies, not a reference
1783 sense_ordinal += 1 # only use for sorting
1785        if not isinstance(node, WikiNode):
1786 # This doesn't seem to ever happen in practice.
1787 wxr.wtp.debug(
1788 "{}: parse_sense_node called with"
1789 "something that isn't a WikiNode".format(pos),
1790 sortid="page/1287/20230119",
1791 )
1792 return False
1794        if node.kind != NodeKind.LIST_ITEM:
1795 wxr.wtp.debug(
1796 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1797 )
1798 return False
1800 if node.sarg == ":":
1801 # Skip example entries at the highest level, ones without
1802 # a sense ("...#") above them.
1803 # If node.sarg is exactly and only ":", then it's at
1804 # the highest level; lower levels would have more
1805 # "indentation", like "#:" or "##:"
1806 return False
1808 # If a recursion call succeeds in push_sense(), bubble it up with
1809 # `added`.
1810 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1811 added = False
1813 gloss_template_args: set[str] = set()
1815 # For LISTs and LIST_ITEMS, their argument is something like
1816 # "##" or "##:", and using that we can rudimentally determine
1817 # list 'depth' if need be, and also what kind of list or
1818 # entry it is; # is for normal glosses, : for examples (indent)
1819 # and * is used for quotations on wiktionary.
1820 current_depth = node.sarg
1822 children = node.children
1824 # subentries, (presumably) a list
1825 # of subglosses below this. The list's
1826 # argument ends with #, and its depth should
1827 # be bigger than parent node.
1828 subentries = [
1829 x
1830 for x in children
1831 if isinstance(x, WikiNode)
1832 and x.kind == NodeKind.LIST
1833 and x.sarg == current_depth + "#"
1834 ]
1836 # sublists of examples and quotations. .sarg
1837 # does not end with "#".
1838 others = [
1839 x
1840 for x in children
1841 if isinstance(x, WikiNode)
1842 and x.kind == NodeKind.LIST
1843 and x.sarg != current_depth + "#"
1844 ]
1846 # the actual contents of this particular node.
1847 # can be a gloss (or a template that expands into
1848 # many glosses which we can't easily pre-expand)
1849 # or could be an "outer gloss" with more specific
1850 # subglosses, or could be a qualifier for the subglosses.
1851 contents = [
1852 x
1853 for x in children
1854 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1855 ]
1856 # If this entry has sublists of entries, we should combine
1857 # gloss information from both the "outer" and sublist content.
1858 # Sometimes the outer gloss
1859 # is more of a non-gloss note or tags, sometimes it is a coarse sense
1860 # and the inner glosses are more specific. The outer one
1861 # does not seem to have qualifiers.
1863 # If we have one sublist with one element, treat it
1864 # specially as it may be a Wiktionary error; raise
1865 # that nested element to the same level.
1866 # XXX If need be, this block can be easily removed in
1867 # the current recursive logic, and the result is one sense entry
1868 # with both glosses in the glosses list, as you would
1869 # expect. If the higher entry has examples, there will
1870 # be a higher entry with some duplicated data.
1871 if len(subentries) == 1:
1872 slc = subentries[0].children
1873 if len(slc) == 1:
1874 # copy current node and modify it so it doesn't
1875 # loop infinitely.
1876 cropped_node = copy.copy(node)
1877 cropped_node.children = [
1878 x
1879 for x in children
1880 if not (
1881 isinstance(x, WikiNode)
1882 and x.kind == NodeKind.LIST
1883 and x.sarg == current_depth + "#"
1884 )
1885 ]
1886 added |= parse_sense_node(cropped_node, sense_base, pos)
1887 nonlocal sense_data # without this reset, raw_glosses
1888 # data would be duplicated;
1889 # if the top-level (cropped_node)
1890 # does not push_sense() properly or
1891 # parse_sense_node() returns early,
1892 # sense_data is not reset. This happens
1893 # for example when you have a no-gloss
1894 # string like "(intransitive)":
1895 # no gloss, push_sense() returns early
1896 # and sense_data has duplicate data with
1897 # sense_base
1898 sense_data = {}
1899 added |= parse_sense_node(slc[0], sense_base, pos)
1900 return added
1902 return process_gloss_contents(
1903 contents,
1904 pos,
1905 sense_base,
1906 subentries,
1907 others,
1908 gloss_template_args,
1909 added,
1910 my_ordinal,
1911 )
1913 def process_gloss_contents(
1914 contents: list[Union[str, WikiNode]],
1915 pos: str,
1916 sense_base: SenseData,
1917 subentries: list[WikiNode] = [],
1918 others: list[WikiNode] = [],
1919 gloss_template_args: Set[str] = set(),
1920 added: bool = False,
1921 sorting_ordinal: int | None = None,
1922 ) -> bool:
1923 def sense_template_fn(
1924 name: str, ht: TemplateArgs, is_gloss: bool = False
1925 ) -> Optional[str]:
1926 # print(f"sense_template_fn: {name}, {ht}")
1927 if name in wikipedia_templates:
1928 # parse_wikipedia_template(wxr, pos_data, ht)
1929 return None
1930 if is_panel_template(wxr, name):
1931 return ""
1932 if name in INFO_TEMPLATE_FUNCS:
1933 info_data, info_exp = parse_info_template_arguments(
1934 wxr, name, ht, "sense"
1935 )
1936 if info_data or info_exp: 1936 ↛ 1942line 1936 didn't jump to line 1942 because the condition on line 1936 was always true
1937 if info_data: 1937 ↛ 1939line 1937 didn't jump to line 1939 because the condition on line 1937 was always true
1938 data_append(sense_base, "info_templates", info_data)
1939 if info_exp and isinstance(info_exp, str): 1939 ↛ 1941line 1939 didn't jump to line 1941 because the condition on line 1939 was always true
1940 return info_exp
1941 return ""
1942 if name in ("defdate",):
1943 date = clean_node(wxr, None, ht.get(1, ()))
1944 if part_two := ht.get(2): 1944 ↛ 1946line 1944 didn't jump to line 1946 because the condition on line 1944 was never true
1945 # Unicode mdash, not '-'
1946 date += "–" + clean_node(wxr, None, part_two)
1947 refs: dict[str, ReferenceData] = {}
1948 # ref, refn, ref2, ref2n, ref3, ref3n
1949 # ref1 not valid
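# e.g. |ref=...|refn=...|ref2=...|ref2n=... yield two ReferenceData
# entries, one pairing ref with refn and one pairing ref2 with ref2n.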
1950 for k, v in sorted(
1951 (k, v) for k, v in ht.items() if isinstance(k, str)
1952 ):
1953 if m := re.match(r"ref(\d?)(n?)", k): 1953 ↛ 1950line 1953 didn't jump to line 1950 because the condition on line 1953 was always true
1954 ref_v = clean_node(wxr, None, v)
1955 if m.group(1) not in refs: # empty string or digit
1956 refs[m.group(1)] = ReferenceData()
1957 if m.group(2):
1958 refs[m.group(1)]["refn"] = ref_v
1959 else:
1960 refs[m.group(1)]["text"] = ref_v
1961 data_append(
1962 sense_base,
1963 "attestations",
1964 AttestationData(date=date, references=list(refs.values())),
1965 )
1966 return ""
1967 if name == "senseid":
1968 langid = clean_node(wxr, None, ht.get(1, ()))
1969 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1970 if re.match(r"Q\d+$", arg):
1971 data_append(sense_base, "wikidata", arg)
1972 data_append(sense_base, "senseid", langid + ":" + arg)
1973 if name in sense_linkage_templates:
1974 # print(f"SENSE_TEMPLATE_FN: {name}")
1975 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1976 return ""
1977 if name == "†" or name == "zh-obsolete":
1978 data_append(sense_base, "tags", "obsolete")
1979 return ""
1980 if name in {
1981 "ux",
1982 "uxi",
1983 "usex",
1984 "afex",
1985 "prefixusex",
1986 "ko-usex",
1987 "ko-x",
1988 "hi-x",
1989 "ja-usex-inline",
1990 "ja-x",
1991 "quotei",
1992 "he-x",
1993 "hi-x",
1994 "km-x",
1995 "ne-x",
1996 "shn-x",
1997 "th-x",
1998 "ur-x",
1999 }:
2000 # Usage examples are captured separately below. We don't
2001 # want to expand them into glosses even when unusual coding
2002 # is used in the entry.
2003 # These templates may slip through inside another item, but
2004 # currently we're separating out example entries (..#:)
2005 # well enough that there seems to be very little contamination.
2006 if is_gloss:
2007 wxr.wtp.wiki_notice(
2008 "Example template is used for gloss text",
2009 sortid="extractor.en.page.sense_template_fn/1415",
2010 )
2011 else:
2012 return ""
2013 if name == "w": 2013 ↛ 2014line 2013 didn't jump to line 2014 because the condition on line 2013 was never true
2014 if ht.get(2) == "Wp":
2015 return ""
2016 for k, v in ht.items():
2017 v = v.strip()
2018 if v and "<" not in v:
2019 gloss_template_args.add(v)
2020 return None
2022 def extract_link_texts(item: GeneralNode) -> None:
2023 """Recursively extracts link texts from the gloss source. This
2024 information is used to select whether to remove final "." from
2025 form_of/alt_of (e.g., ihm/Hunsrik)."""
2026 if isinstance(item, (list, tuple)):
2027 for x in item:
2028 extract_link_texts(x)
2029 return
2030 if isinstance(item, str):
2031 # There seem to be HTML sections that may further contain
2032 # unparsed links.
2033 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2033 ↛ 2034line 2033 didn't jump to line 2034 because the loop on line 2033 never started
2034 print("ITER:", m.group(0))
2035 v = m.group(1).split("|")[-1].strip()
2036 if v:
2037 gloss_template_args.add(v)
2038 return
2039 if not isinstance(item, WikiNode): 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true
2040 return
2041 if item.kind == NodeKind.LINK:
2042 v = item.largs[-1]
2043 if ( 2043 ↛ 2049line 2043 didn't jump to line 2049 because the condition on line 2043 was always true
2044 isinstance(v, list)
2045 and len(v) == 1
2046 and isinstance(v[0], str)
2047 ):
2048 gloss_template_args.add(v[0].strip())
2049 for x in item.children:
2050 extract_link_texts(x)
2052 extract_link_texts(contents)
2054 # get the raw text of non-list contents of this node, and other stuff
2055 # like tag and category data added to sense_base
2056 # cast() is a runtime no-op; it only sets the type for the type-checker
2057 partial_template_fn = cast(
2058 TemplateFnCallable,
2059 partial(sense_template_fn, is_gloss=True),
2060 )
2061 rawgloss = clean_node(
2062 wxr,
2063 sense_base,
2064 contents,
2065 template_fn=partial_template_fn,
2066 collect_links=True,
2067 )
2069 if not rawgloss: 2069 ↛ 2070line 2069 didn't jump to line 2070 because the condition on line 2069 was never true
2070 return False
2072 # remove manually typed ordered list text at the start ("1. ")
2073 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2075 # get stuff like synonyms and categories from "others",
2076 # maybe examples and quotations
2077 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2079 # The gloss could contain templates that produce more list items.
2080 # This happens commonly with, e.g., {{inflection of|...}}. Split
2081 # to parts. However, e.g. Interlingua generates multiple glosses
2082 # in HTML directly without Wikitext markup, so we must also split
2083 # by just newlines.
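# e.g. a single {{inflection of|...}} call may expand to several lines
# (possibly "#"-prefixed); splitting here gives one candidate subgloss
# per line, and "#"-prefixed output is re-parsed as a list below.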
2084 subglosses = rawgloss.splitlines()
2086 if len(subglosses) == 0: 2086 ↛ 2087line 2086 didn't jump to line 2087 because the condition on line 2086 was never true
2087 return False
2089 if any(s.startswith("#") for s in subglosses):
2090 subtree = wxr.wtp.parse(rawgloss)
2091 # from wikitextprocessor.parser import print_tree
2092 # print("SUBTREE GENERATED BY TEMPLATE:")
2093 # print_tree(subtree)
2094 new_subentries = [
2095 x
2096 for x in subtree.children
2097 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2098 ]
2100 new_others = [
2101 x
2102 for x in subtree.children
2103 if isinstance(x, WikiNode)
2104 and x.kind == NodeKind.LIST
2105 and not x.sarg.endswith("#")
2106 ]
2108 new_contents = [
2109 clean_node(wxr, [], x)
2110 for x in subtree.children
2111 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2112 ]
2114 subentries = subentries or new_subentries
2115 others = others or new_others
2116 subglosses = new_contents
2117 rawgloss = "".join(subglosses)
2118 # Generate no gloss for translation hub pages, but add the
2119 # "translation-hub" tag for them
2120 if rawgloss == "(This entry is a translation hub.)": 2120 ↛ 2121line 2120 didn't jump to line 2121 because the condition on line 2120 was never true
2121 data_append(sense_data, "tags", "translation-hub")
2122 return push_sense(sorting_ordinal)
2124 # Remove certain substrings specific to outer glosses
2125 strip_ends = [", particularly:"]
2126 for x in strip_ends:
2127 if rawgloss.endswith(x):
2128 rawgloss = rawgloss[: -len(x)].strip()
2129 break
2131 # A single gloss, or possibly an outer gloss.
2132 # Check if the possible outer gloss starts with
2133 # parenthesized tags/topics
2135 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2136 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2137 m = QUALIFIERS_RE.match(rawgloss)
2138 # (...): ... or (...(...)...): ...
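# e.g. for a raw gloss like "(colloquial): to do something", the matched
# qualifier prefix goes to parse_sense_qualifier() and whatever follows
# the match is kept as the gloss text.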
2139 if m:
2140 q = m.group(1)
2141 rawgloss = rawgloss[m.end() :].strip()
2142 parse_sense_qualifier(wxr, q, sense_base)
2143 if rawgloss == "A pejorative:": 2143 ↛ 2144line 2143 didn't jump to line 2144 because the condition on line 2143 was never true
2144 data_append(sense_base, "tags", "pejorative")
2145 rawgloss = ""
2146 elif rawgloss == "Short forms.": 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true
2147 data_append(sense_base, "tags", "abbreviation")
2148 rawgloss = ""
2149 elif rawgloss == "Technical or specialized senses.": 2149 ↛ 2150line 2149 didn't jump to line 2150 because the condition on line 2149 was never true
2150 rawgloss = ""
2151 elif rawgloss.startswith("inflection of "):
2152 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2153 if parsed is not None: 2153 ↛ 2162line 2153 didn't jump to line 2162 because the condition on line 2153 was always true
2154 tags, origins = parsed
2155 if origins is not None: 2155 ↛ 2157line 2155 didn't jump to line 2157 because the condition on line 2155 was always true
2156 data_extend(sense_base, "form_of", origins)
2157 if tags is not None: 2157 ↛ 2160line 2157 didn't jump to line 2160 because the condition on line 2157 was always true
2158 data_extend(sense_base, "tags", tags)
2159 else:
2160 data_append(sense_base, "tags", "form-of")
2161 else:
2162 data_append(sense_base, "tags", "form-of")
2163 if rawgloss: 2163 ↛ 2194line 2163 didn't jump to line 2194 because the condition on line 2163 was always true
2164 # This duplicates a lot of the clean-up operations from later in
2165 # this block. We want to clean up the "supergloss" as much as
2166 # possible, in almost the same way as a normal gloss.
2167 supergloss = rawgloss
2169 if supergloss.startswith("; "): 2169 ↛ 2170line 2169 didn't jump to line 2170 because the condition on line 2169 was never true
2170 supergloss = supergloss[1:].strip()
2172 if supergloss.startswith(("^†", "†")):
2173 data_append(sense_base, "tags", "obsolete")
2174 supergloss = supergloss[2:].strip()
2175 elif supergloss.startswith("^‡"): 2175 ↛ 2176line 2175 didn't jump to line 2176 because the condition on line 2175 was never true
2176 data_extend(sense_base, "tags", ["obsolete", "historical"])
2177 supergloss = supergloss[2:].strip()
2179 # remove [14th century...] style brackets at the end
2180 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2182 if supergloss.startswith((",", ":")):
2183 supergloss = supergloss[1:]
2184 supergloss = supergloss.strip()
2185 if supergloss.startswith("N. of "): 2185 ↛ 2186line 2185 didn't jump to line 2186 because the condition on line 2185 was never true
2186 supergloss = "Name of " + supergloss[6:]
2187 supergloss = supergloss[2:]
2188 data_append(sense_base, "glosses", supergloss)
2189 if supergloss in ("A person:",):
2190 data_append(sense_base, "tags", "g-person")
2192 # The main recursive call (except for the exceptions at the
2193 # start of this function).
2194 for sublist in subentries:
2195 if not ( 2195 ↛ 2198line 2195 didn't jump to line 2198 because the condition on line 2195 was never true
2196 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2197 ):
2198 wxr.wtp.debug(
2199 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2200 f"with items that are not LISTs",
2201 sortid="page/1511/20230119",
2202 )
2203 continue
2204 for item in sublist.children:
2205 if not ( 2205 ↛ 2209line 2205 didn't jump to line 2209 because the condition on line 2205 was never true
2206 isinstance(item, WikiNode)
2207 and item.kind == NodeKind.LIST_ITEM
2208 ):
2209 continue
2210 # copy sense_base to prevent cross-contamination between
2211 # sibling subglosses and between subglosses and the supergloss
2212 sense_base2 = copy.deepcopy(sense_base)
2213 if parse_sense_node(item, sense_base2, pos): 2213 ↛ 2204line 2213 didn't jump to line 2204 because the condition on line 2213 was always true
2214 added = True
2216 # Capture examples.
2217 # This is called after the recursive calls above so that
2218 # sense_base is not contaminated with meta-data from
2219 # example entries for *this* gloss.
2220 examples = []
2221 if wxr.config.capture_examples: 2221 ↛ 2225line 2221 didn't jump to line 2225 because the condition on line 2221 was always true
2222 examples = extract_examples(others, sense_base)
2224 # push_sense() succeeded somewhere down-river, so skip this level
2225 if added:
2226 if examples:
2227 # this higher-up gloss has examples that we do not want to skip
2228 wxr.wtp.debug(
2229 "'{}[...]' gloss has examples we want to keep, "
2230 "but there are subglosses.".format(repr(rawgloss[:30])),
2231 sortid="page/1498/20230118",
2232 )
2233 else:
2234 return True
2236 # Some entries, e.g., "iacebam", have weird sentences in quotes
2237 # after the gloss, but these sentences don't seem to be intended
2238 # as glosses. Skip them.
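# e.g. a subgloss line that is nothing but a quoted sentence (optionally
# preceded by a parenthesized note) is filtered out here.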
2239 indexed_subglosses = list(
2240 (i, gl)
2241 for i, gl in enumerate(subglosses)
2242 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2243 )
2245 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2245 ↛ 2246line 2245 didn't jump to line 2246 because the condition on line 2245 was never true
2246 gl = indexed_subglosses[0][1].strip()
2247 if gl.endswith(":"):
2248 gl = gl[:-1].strip()
2249 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2250 if parsed is not None:
2251 infl_tags, infl_dts = parsed
2252 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2253 # Interpret others as a particular form under
2254 # "inflection of"
2255 data_extend(sense_base, "tags", infl_tags)
2256 data_extend(sense_base, "form_of", infl_dts)
2257 indexed_subglosses = indexed_subglosses[1:]
2258 elif not infl_dts:
2259 data_extend(sense_base, "tags", infl_tags)
2260 indexed_subglosses = indexed_subglosses[1:]
2262 # Create senses for remaining subglosses
2263 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2264 gloss = gloss.strip()
2265 if not gloss and len(indexed_subglosses) > 1: 2265 ↛ 2266line 2265 didn't jump to line 2266 because the condition on line 2265 was never true
2266 continue
2267 # Push a new sense (if the last one is not empty)
2268 if push_sense(sorting_ordinal): 2268 ↛ 2269line 2268 didn't jump to line 2269 because the condition on line 2268 was never true
2269 added = True
2270 # if gloss not in sense_data.get("raw_glosses", ()):
2271 # data_append(sense_data, "raw_glosses", gloss)
2272 if i == 0 and examples:
2273 # In a multi-line gloss, associate examples
2274 # with only one of them.
2275 # XXX or you could use gloss_i == len(indexed_subglosses)
2276 # to associate examples with the *last* one.
2277 data_extend(sense_data, "examples", examples)
2278 if gloss.startswith("; ") and gloss_i > 0: 2278 ↛ 2279line 2278 didn't jump to line 2279 because the condition on line 2278 was never true
2279 gloss = gloss[1:].strip()
2280 # If the gloss starts with †, mark as obsolete
2281 if gloss.startswith("^†"): 2281 ↛ 2282line 2281 didn't jump to line 2282 because the condition on line 2281 was never true
2282 data_append(sense_data, "tags", "obsolete")
2283 gloss = gloss[2:].strip()
2284 elif gloss.startswith("^‡"): 2284 ↛ 2285line 2284 didn't jump to line 2285 because the condition on line 2284 was never true
2285 data_extend(sense_data, "tags", ["obsolete", "historical"])
2286 gloss = gloss[2:].strip()
2287 # Copy data for all senses to this sense
2288 for k, v in sense_base.items():
2289 if isinstance(v, (list, tuple)):
2290 if k != "tags":
2291 # Tags handled below (countable/uncountable special)
2292 data_extend(sense_data, k, v)
2293 else:
2294 assert k not in ("tags", "categories", "topics")
2295 sense_data[k] = v # type:ignore[literal-required]
2296 # Parse the gloss for this particular sense
2297 m = QUALIFIERS_RE.match(gloss)
2298 # (...): ... or (...(...)...): ...
2299 if m:
2300 parse_sense_qualifier(wxr, m.group(1), sense_data)
2301 gloss = gloss[m.end() :].strip()
2303 # Remove common suffix "[from 14th c.]" and similar
2304 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2306 # Check to make sure we don't have unhandled list items in gloss
2307 ofs = max(gloss.find("#"), gloss.find("* "))
2308 if ofs > 10 and "(#)" not in gloss:
2309 wxr.wtp.debug(
2310 "gloss may contain unhandled list items: {}".format(gloss),
2311 sortid="page/1412",
2312 )
2313 elif "\n" in gloss: 2313 ↛ 2314line 2313 didn't jump to line 2314 because the condition on line 2313 was never true
2314 wxr.wtp.debug(
2315 "gloss contains newline: {}".format(gloss),
2316 sortid="page/1416",
2317 )
2319 # Kludge, some glosses have a comma after initial qualifiers in
2320 # parentheses
2321 if gloss.startswith((",", ":")):
2322 gloss = gloss[1:]
2323 gloss = gloss.strip()
2324 if gloss.endswith(":"):
2325 gloss = gloss[:-1].strip()
2326 if gloss.startswith("N. of "): 2326 ↛ 2327line 2326 didn't jump to line 2327 because the condition on line 2326 was never true
2327 gloss = "Name of " + gloss[6:]
2328 if gloss.startswith("†"): 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true
2329 data_append(sense_data, "tags", "obsolete")
2330 gloss = gloss[1:]
2331 elif gloss.startswith("^†"): 2331 ↛ 2332line 2331 didn't jump to line 2332 because the condition on line 2331 was never true
2332 data_append(sense_data, "tags", "obsolete")
2333 gloss = gloss[2:]
2335 # Copy tags from sense_base if any. This will not copy
2336 # countable/uncountable if either was specified in the sense,
2337 # as sometimes both are specified in word head but only one
2338 # in individual senses.
2339 countability_tags = []
2340 base_tags = sense_base.get("tags", ())
2341 sense_tags = sense_data.get("tags", ())
2342 for tag in base_tags:
2343 if tag in ("countable", "uncountable"):
2344 if tag not in countability_tags: 2344 ↛ 2346line 2344 didn't jump to line 2346 because the condition on line 2344 was always true
2345 countability_tags.append(tag)
2346 continue
2347 if tag not in sense_tags:
2348 data_append(sense_data, "tags", tag)
2349 if countability_tags:
2350 if ( 2350 ↛ 2359line 2350 didn't jump to line 2359 because the condition on line 2350 was always true
2351 "countable" not in sense_tags
2352 and "uncountable" not in sense_tags
2353 ):
2354 data_extend(sense_data, "tags", countability_tags)
2356 # If outer gloss specifies a form-of ("inflection of", see
2357 # aquamarine/German), try to parse the inner glosses as
2358 # tags for an inflected form.
2359 if "form-of" in sense_base.get("tags", ()):
2360 parsed = parse_alt_or_inflection_of(
2361 wxr, gloss, gloss_template_args
2362 )
2363 if parsed is not None: 2363 ↛ 2369line 2363 didn't jump to line 2369 because the condition on line 2363 was always true
2364 infl_tags, infl_dts = parsed
2365 if not infl_dts and infl_tags: 2365 ↛ 2369line 2365 didn't jump to line 2369 because the condition on line 2365 was always true
2366 # Interpret as a particular form under "inflection of"
2367 data_extend(sense_data, "tags", infl_tags)
2369 if not gloss: 2369 ↛ 2370line 2369 didn't jump to line 2370 because the condition on line 2369 was never true
2370 data_append(sense_data, "tags", "empty-gloss")
2371 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2372 if ( 2372 ↛ 2383line 2372 didn't jump to line 2383 because the condition on line 2372 was always true
2373 gloss_i == 0
2374 and len(sense_data.get("glosses", tuple())) >= 1
2375 ):
2376 # If we added a "high-level gloss" from rawgloss, but this
2377 # is that same gloss_i, add this instead of the raw_gloss
2378 # from before if they're different: the rawgloss was not
2379 # cleaned exactly the same as this later gloss
2380 sense_data["glosses"][-1] = gloss
2381 else:
2382 # Add the gloss for the sense.
2383 data_append(sense_data, "glosses", gloss)
2385 # Kludge: there are cases (e.g., etc./Swedish) where there are
2386 # two abbreviations in the same sense, both generated by the
2387 # {{abbreviation of|...}} template. Handle these with some magic.
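# e.g. "Abbreviation of foo. Abbreviation of bar." is split at the second
# "Abbreviation of" into two parts, each handled as its own gloss below.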
2388 position = 0
2389 split_glosses = []
2390 for m in re.finditer(r"Abbreviation of ", gloss):
2391 if m.start() != position: 2391 ↛ 2390line 2391 didn't jump to line 2390 because the condition on line 2391 was always true
2392 split_glosses.append(gloss[position : m.start()])
2393 position = m.start()
2394 split_glosses.append(gloss[position:])
2395 for gloss in split_glosses:
2396 # Check if this gloss describes an alt-of or inflection-of
2397 if (
2398 lang_code != "en"
2399 and " " not in gloss
2400 and distw([word], gloss) < 0.3
2401 ):
2402 # Don't try to parse gloss if it is one word
2403 # that is close to the word itself for non-English words
2404 # (probable translations of a tag/form name)
2405 continue
2406 parsed = parse_alt_or_inflection_of(
2407 wxr, gloss, gloss_template_args
2408 )
2409 if parsed is None:
2410 continue
2411 tags, dts = parsed
2412 if not dts and tags:
2413 data_extend(sense_data, "tags", tags)
2414 continue
2415 for dt in dts: # type:ignore[union-attr]
2416 ftags = list(tag for tag in tags if tag != "form-of")
2417 if "alt-of" in tags:
2418 data_extend(sense_data, "tags", ftags)
2419 data_append(sense_data, "alt_of", dt)
2420 elif "compound-of" in tags: 2420 ↛ 2421line 2420 didn't jump to line 2421 because the condition on line 2420 was never true
2421 data_extend(sense_data, "tags", ftags)
2422 data_append(sense_data, "compound_of", dt)
2423 elif "synonym-of" in tags: 2423 ↛ 2424line 2423 didn't jump to line 2424 because the condition on line 2423 was never true
2424 data_extend(dt, "tags", ftags)
2425 data_append(sense_data, "synonyms", dt)
2426 elif tags and dt.get("word", "").startswith("of "): 2426 ↛ 2427line 2426 didn't jump to line 2427 because the condition on line 2426 was never true
2427 dt["word"] = dt["word"][3:]
2428 data_append(sense_data, "tags", "form-of")
2429 data_extend(sense_data, "tags", ftags)
2430 data_append(sense_data, "form_of", dt)
2431 elif "form-of" in tags: 2431 ↛ 2415line 2431 didn't jump to line 2415 because the condition on line 2431 was always true
2432 data_extend(sense_data, "tags", tags)
2433 data_append(sense_data, "form_of", dt)
2435 if len(sense_data) == 0:
2436 if len(sense_base.get("tags", [])) == 0: 2436 ↛ 2438line 2436 didn't jump to line 2438 because the condition on line 2436 was always true
2437 del sense_base["tags"]
2438 sense_data.update(sense_base)
2439 if push_sense(sorting_ordinal): 2439 ↛ 2443line 2439 didn't jump to line 2443 because the condition on line 2439 was always true
2441 # push_sense succeeded in adding a sense to pos_data
2441 added = True
2442 # print("PARSE_SENSE DONE:", pos_datas[-1])
2443 return added
2445 def parse_inflection(
2446 node: WikiNode, section: str, pos: Optional[str]
2447 ) -> None:
2448 """Parses inflection data (declension, conjugation) from the given
2449 page. This retrieves the actual inflection template
2450 parameters, which are very useful for applications that need
2451 to learn the inflection classes and generate inflected
2452 forms."""
2453 assert isinstance(node, WikiNode)
2454 assert isinstance(section, str)
2455 assert pos is None or isinstance(pos, str)
2456 # print("parse_inflection:", node)
2458 if pos is None: 2458 ↛ 2459line 2458 didn't jump to line 2459 because the condition on line 2458 was never true
2459 wxr.wtp.debug(
2460 "inflection table outside part-of-speech", sortid="page/1812"
2461 )
2462 return
2464 def inflection_template_fn(
2465 name: str, ht: TemplateArgs
2466 ) -> Optional[str]:
2467 # print("decl_conj_template_fn", name, ht)
2468 if is_panel_template(wxr, name): 2468 ↛ 2469line 2468 didn't jump to line 2469 because the condition on line 2468 was never true
2469 return ""
2470 if name in ("is-u-mutation",): 2470 ↛ 2473line 2470 didn't jump to line 2473 because the condition on line 2470 was never true
2471 # These are an exception to the generic capture code
2472 # below and are not to be captured
2473 return None
2474 m = re.search(
2475 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2476 r"declension|inflection|mut|mutation)($|-)",
2477 name,
2478 )
2479 if m:
2480 args_ht = clean_template_args(wxr, ht)
2481 dt = {"name": name, "args": args_ht}
2482 data_append(pos_data, "inflection_templates", dt)
2484 return None
2486 # Convert the subtree back to Wikitext, then expand all and parse,
2487 # capturing templates in the process
2488 text = wxr.wtp.node_to_wikitext(node.children)
2490 # Split text into separate sections for each top-level template
2491 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2492 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2493 # The (?:...) creates a non-capturing regex group; if it was capturing,
2494 # like the group around it, it would create elements in brace_matches,
2495 # including None if it doesn't match.
2496 # 20250114: Added {| and |} into the regex because tables were being
2497 # cut into pieces by this code. Issue #973, introduction of two-part
2498 # book-end templates similar to trans-top and trans-bottom.
2499 template_sections = []
2500 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2501 # Because there is the possibility of triple curly braces
2502 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2503 # count nesting depth using pairs of two brackets, but
2504 # instead use singular braces ("{ }").
2505 # Because template delimiters should be balanced, regardless
2506 # of whether {{ or {{{ is used, and because we only care
2507 # about the outer-most delimiters (the highest level template)
2508 # we can just count the single braces when those single
2509 # braces are part of a group.
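# e.g. "{{foo|{{bar}} baz}}" splits (ignoring empty edge strings) into
# "{{", "foo|", "{{", "bar", "}}", " baz", "}}"; template_nesting then
# goes 1, 2, 1, 0, so the nested template stays within one section.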
2510 table_nesting = 0
2511 # However, a stray table ({| ... |}) should always be its own
2512 # section, and should prevent templates from cutting it into
2513 # sections.
2515 # print(f"Parse inflection: {text=}")
2516 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2517 if len(brace_matches) > 1:
2518 tsection: list[str] = []
2519 after_templates = False # kludge to keep any text
2520 # before first template
2521 # with the first template;
2522 # otherwise, text
2523 # goes with preceding template
2524 for m in brace_matches:
2525 if m.startswith("\n; ") and after_templates: 2525 ↛ 2526line 2525 didn't jump to line 2526 because the condition on line 2525 was never true
2526 after_templates = False
2527 template_sections.append(tsection)
2528 tsection = []
2529 tsection.append(m)
2530 elif m.startswith("{{") or m.endswith("{|"):
2531 if (
2532 template_nesting == 0
2533 and after_templates
2534 and table_nesting == 0
2535 ):
2536 template_sections.append(tsection)
2537 tsection = []
2538 # start new section
2539 after_templates = True
2540 if m.startswith("{{"):
2541 template_nesting += 1
2542 else:
2543 # m.endswith("{|")
2544 table_nesting += 1
2545 tsection.append(m)
2546 elif m.startswith("}}") or m.endswith("|}"):
2547 if m.startswith("}}"):
2548 template_nesting -= 1
2549 if template_nesting < 0: 2549 ↛ 2550line 2549 didn't jump to line 2550 because the condition on line 2549 was never true
2550 wxr.wtp.error(
2551 "Negatively nested braces, "
2552 "couldn't split inflection templates, "
2553 "{}/{} section {}".format(
2554 word, language, section
2555 ),
2556 sortid="page/1871",
2557 )
2558 template_sections = [] # use whole text
2559 break
2560 else:
2561 table_nesting -= 1
2562 if table_nesting < 0: 2562 ↛ 2563line 2562 didn't jump to line 2563 because the condition on line 2562 was never true
2563 wxr.wtp.error(
2564 "Negatively nested table braces, "
2565 "couldn't split inflection section, "
2566 "{}/{} section {}".format(
2567 word, language, section
2568 ),
2569 sortid="page/20250114",
2570 )
2571 template_sections = [] # use whole text
2572 break
2573 tsection.append(m)
2574 else:
2575 tsection.append(m)
2576 if tsection: # dangling tsection 2576 ↛ 2584line 2576 didn't jump to line 2584 because the condition on line 2576 was always true
2577 template_sections.append(tsection)
2578 # Why do it this way around? The parser has a preference
2579 # to associate bits outside of templates with the preceding
2580 # template (the `after_templates` variable), so a new tsection
2581 # begins at {{ and everything before it belongs to the previous
2582 # template.
2584 texts = []
2585 if not template_sections:
2586 texts = [text]
2587 else:
2588 for tsection in template_sections:
2589 texts.append("".join(tsection))
2590 if template_nesting != 0: 2590 ↛ 2591line 2590 didn't jump to line 2591 because the condition on line 2590 was never true
2591 wxr.wtp.error(
2592 "Template nesting error: "
2593 "template_nesting = {} "
2594 "couldn't split inflection templates, "
2595 "{}/{} section {}".format(
2596 template_nesting, word, language, section
2597 ),
2598 sortid="page/1896",
2599 )
2600 texts = [text]
2601 for text in texts:
2602 tree = wxr.wtp.parse(
2603 text, expand_all=True, template_fn=inflection_template_fn
2604 )
2606 if not text.strip():
2607 continue
2609 # Parse inflection tables from the section. The data is stored
2610 # under "forms".
2611 if wxr.config.capture_inflections: 2611 ↛ 2601line 2611 didn't jump to line 2601 because the condition on line 2611 was always true
2612 tablecontext = None
2613 m = re.search(r"{{([^}{|]+)\|?", text)
2614 if m:
2615 template_name = m.group(1)
2616 tablecontext = TableContext(template_name)
2618 parse_inflection_section(
2619 wxr,
2620 pos_data,
2621 word,
2622 language,
2623 pos,
2624 section,
2625 tree,
2626 tablecontext=tablecontext,
2627 )
2629 def get_subpage_section(
2630 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2631 ) -> Optional[Union[WikiNode, str]]:
2632 """Loads a subpage of the given page, and finds the section
2633 for the given language, part-of-speech, and section title. This
2634 is used for finding translations and other sections on subpages."""
2635 assert isinstance(language, str)
2636 assert isinstance(title, str)
2637 assert isinstance(subtitle, str)
2638 assert isinstance(seqs, (list, tuple))
2639 for seq in seqs:
2640 for x in seq:
2641 assert isinstance(x, str)
2642 subpage_title = word + "/" + subtitle
2643 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2644 if subpage_content is None:
2645 wxr.wtp.error(
2646 "/translations not found despite "
2647 "{{see translation subpage|...}}",
2648 sortid="page/1934",
2649 )
2650 return None
2652 def recurse(
2653 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2654 ) -> Optional[Union[str, WikiNode]]:
2655 # print(f"seq: {seq}")
2656 if not seq:
2657 return node
2658 if not isinstance(node, WikiNode):
2659 return None
2660 # print(f"node.kind: {node.kind}")
2661 if node.kind in LEVEL_KINDS:
2662 t = clean_node(wxr, None, node.largs[0])
2663 # print(f"t: {t} == seq[0]: {seq[0]}?")
2664 if t.lower() == seq[0].lower():
2665 seq = seq[1:]
2666 if not seq:
2667 return node
2668 for n in node.children:
2669 ret = recurse(n, seq)
2670 if ret is not None:
2671 return ret
2672 return None
2674 tree = wxr.wtp.parse(
2675 subpage_content,
2676 pre_expand=True,
2677 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2678 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2679 )
2680 assert tree.kind == NodeKind.ROOT
2681 for seq in seqs:
2682 ret = recurse(tree, seq)
2683 if ret is None:
2684 wxr.wtp.debug(
2685 "Failed to find subpage section {}/{} seq {}".format(
2686 title, subtitle, seq
2687 ),
2688 sortid="page/1963",
2689 )
2690 return ret
2692 def parse_linkage(
2693 data: WordData, field: str, linkagenode: LevelNode
2694 ) -> None:
2695 assert isinstance(data, dict)
2696 assert isinstance(field, str)
2697 assert isinstance(linkagenode, WikiNode)
2698 # if field == "synonyms":
2699 # print("field", field)
2700 # print("data", data)
2701 # print("children:")
2702 # print(linkagenode.children)
2703 if not wxr.config.capture_linkages: 2703 ↛ 2704line 2703 didn't jump to line 2704 because the condition on line 2703 was never true
2704 return
2705 have_panel_template = False
2706 toplevel_text = []
2707 next_navframe_sense = None # Used for "(sense):" before NavFrame
2709 def parse_linkage_item(
2710 contents: list[Union[str, WikiNode]],
2711 field: str,
2712 sense: Optional[str] = None,
2713 ):
2714 assert isinstance(contents, (list, tuple))
2715 assert isinstance(field, str)
2716 assert sense is None or isinstance(sense, str)
2718 # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
2719 # .format(field, sense, contents))
2721 parts: list[str] = []
2722 ruby: list[tuple[str, str]] = []
2723 urls: list[str] = []
2724 # data about link text; this is used to skip splitting on
2725 # linkage text items that contain stuff like commas; for
2726 # example "Hunde, die bellen, beißen nicht" in the article
2727 # "beißen" would otherwise be split into "Hunde", "die bellen", etc.
2728 # We take that link text and use it, eventually,
2729 # in split_at_comma_semi to skip splitting on those
2730 # commas.
2731 links_that_should_not_be_split: list[str] = []
2733 def item_recurse(
2734 contents: list[Union[str, WikiNode]], italic=False
2735 ) -> None:
2736 assert isinstance(contents, (list, tuple))
2737 nonlocal sense
2738 nonlocal ruby
2739 nonlocal parts
2740 # print("ITEM_RECURSE:", contents)
2741 for node in contents:
2742 if isinstance(node, str):
2743 parts.append(node)
2744 continue
2745 kind = node.kind
2746 # print("ITEM_RECURSE KIND:", kind,
2747 # node.sarg if node.sarg else node.largs)
2748 if is_list_item(node): 2748 ↛ 2749line 2748 didn't jump to line 2749 because the condition on line 2748 was never true
2749 if parts:
2750 sense1: Optional[str]
2751 sense1 = clean_node(wxr, None, parts)
2752 if sense1.endswith(":"):
2753 sense1 = sense1[:-1].strip()
2754 if sense1.startswith("(") and sense1.endswith(")"):
2755 sense1 = sense1[1:-1].strip()
2756 if sense1.lower() == TRANSLATIONS_TITLE:
2757 sense1 = None
2758 # print("linkage item_recurse LIST sense1:", sense1)
2759 parse_linkage_recurse(
2760 node.children, field, sense=sense1 or sense
2761 )
2762 parts = []
2763 else:
2764 parse_linkage_recurse(node.children, field, sense)
2765 elif kind in ( 2765 ↛ 2770line 2765 didn't jump to line 2770 because the condition on line 2765 was never true
2766 NodeKind.TABLE,
2767 NodeKind.TABLE_ROW,
2768 NodeKind.TABLE_CELL,
2769 ):
2770 parse_linkage_recurse(node.children, field, sense)
2771 elif kind in ( 2771 ↛ 2775line 2771 didn't jump to line 2775 because the condition on line 2771 was never true
2772 NodeKind.TABLE_HEADER_CELL,
2773 NodeKind.TABLE_CAPTION,
2774 ):
2775 continue
2776 elif kind == NodeKind.HTML: 2776 ↛ 2777line 2776 didn't jump to line 2777 because the condition on line 2776 was never true
2777 classes = (node.attrs.get("class") or "").split()
2778 if node.sarg in ("gallery", "ref", "cite", "caption"):
2779 continue
2780 elif node.sarg == "ruby":
2781 rb = parse_ruby(wxr, node)
2782 if rb:
2783 ruby.append(rb)
2784 parts.append(rb[0])
2785 continue
2786 elif node.sarg == "math":
2787 parts.append(clean_node(wxr, None, node))
2788 continue
2789 elif "interProject" in classes:
2790 continue # These do not seem to be displayed
2791 if "NavFrame" in classes:
2792 parse_linkage_recurse(node.children, field, sense)
2793 else:
2794 item_recurse(node.children, italic=italic)
2795 elif kind == NodeKind.ITALIC:
2796 item_recurse(node.children, italic=True)
2797 elif kind == NodeKind.LINK:
2798 ignore = False
2799 if isinstance(node.largs[0][0], str): 2799 ↛ 2741line 2799 didn't jump to line 2741 because the condition on line 2799 was always true
2800 v1 = node.largs[0][0].strip().lower()
2801 if v1.startswith( 2801 ↛ 2805line 2801 didn't jump to line 2805 because the condition on line 2801 was never true
2802 ns_title_prefix_tuple(wxr, "Category", True)
2803 + ns_title_prefix_tuple(wxr, "File", True)
2804 ):
2805 ignore = True
2806 if not ignore: 2806 ↛ 2741line 2806 didn't jump to line 2741 because the condition on line 2806 was always true
2807 v = node.largs[-1]
2808 if (
2809 len(node.largs) == 1
2810 and len(v) > 0
2811 and isinstance(v[0], str)
2812 and v[0][0] == ":"
2813 ):
2814 v = [v[0][1:]] + list(v[1:]) # type:ignore
2815 if isinstance(v[0], str) and not v[0].isalnum():
2816 links_that_should_not_be_split.append(
2817 "".join(v[0])
2818 ) # type: ignore
2819 item_recurse(v, italic=italic)
2820 elif kind == NodeKind.URL:
2821 if len(node.largs) < 2 and node.largs:
2822 # Naked url captured
2823 urls.extend(node.largs[-1]) # type:ignore[arg-type]
2824 continue
2825 if len(node.largs) == 2: 2825 ↛ 2830line 2825 didn't jump to line 2830 because the condition on line 2825 was always true
2826 # Url from link with text
2827 urls.append(node.largs[0][-1]) # type:ignore[arg-type]
2828 # print(f"{node.largs=!r}")
2829 # print("linkage recurse URL {}".format(node))
2830 item_recurse(node.largs[-1], italic=italic)
2831 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD):
2832 item_recurse(node.children, italic=italic)
2833 else:
2834 wxr.wtp.debug(
2835 "linkage item_recurse unhandled {}: {}".format(
2836 node.kind, node
2837 ),
2838 sortid="page/2073",
2839 )
2841 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
2842 # .format(contents))
2844 item_recurse(contents)
2845 item = clean_node(wxr, None, parts)
2846 # print("LINKAGE ITEM CONTENTS:", parts)
2847 # print("CLEANED ITEM: {!r}".format(item))
2848 # print(f"URLS {urls=!r}")
2850 return parse_linkage_item_text(
2851 wxr,
2852 word,
2853 data,
2854 field,
2855 item,
2856 sense,
2857 ruby,
2858 sense_datas,
2859 is_reconstruction,
2860 urls or None,
2861 links_that_should_not_be_split or None,
2862 )
2864 def parse_linkage_recurse(
2865 contents: list[Union[WikiNode, str]],
2866 field: str,
2867 sense: Optional[str],
2868 ) -> None:
2869 assert isinstance(contents, (list, tuple))
2870 assert sense is None or isinstance(sense, str)
2871 nonlocal next_navframe_sense
2872 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
2873 for node in contents:
2874 if isinstance(node, str):
2875 # Ignore top-level text, generally comments before the
2876 # linkages list. However, if no linkages are found, then
2877 # use this for linkages (not all words use bullet points
2878 # for linkages).
2879 toplevel_text.append(node)
2880 continue
2881 assert isinstance(node, WikiNode)
2882 kind = node.kind
2883 # print("PARSE_LINKAGE_RECURSE CHILD", kind)
2884 if is_list(node):
2885 parse_linkage_recurse(node.children, field, sense)
2886 elif is_list_item(node):
2887 v = parse_linkage_item(node.children, field, sense)
2888 if v: 2888 ↛ 2892line 2888 didn't jump to line 2892 because the condition on line 2888 was never true
2889 # parse_linkage_item() can return a value that should
2890 # be used as the sense for the follow-on linkages,
2891 # which are typically provided in a table (see 滿)
2892 next_navframe_sense = v
2893 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
2894 parse_linkage_recurse(node.children, field, sense)
2895 elif kind == NodeKind.TABLE_CELL:
2896 parse_linkage_item(node.children, field, sense)
2897 elif kind in (
2898 NodeKind.TABLE_CAPTION,
2899 NodeKind.TABLE_HEADER_CELL,
2900 NodeKind.PREFORMATTED,
2901 NodeKind.BOLD,
2902 ):
2903 continue
2904 elif kind == NodeKind.HTML: 2904 ↛ 2906line 2904 didn't jump to line 2906 because the condition on line 2904 was never true
2905 # Recurse to process inside the HTML for most tags
2906 if node.sarg in ("gallery", "ref", "cite", "caption"):
2907 continue
2908 classes = (node.attrs.get("class") or "").split()
2909 if node.sarg == "li":
2910 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
2911 v = parse_linkage_item(node.children, field, sense)
2912 if v:
2913 next_navframe_sense = v
2914 elif "qualifier-content" in classes:
2915 sense1 = clean_node(wxr, None, node.children)
2916 if sense1.endswith(":"):
2917 sense1 = sense1[:-1].strip()
2918 if sense and sense1:
2919 wxr.wtp.debug(
2920 "linkage qualifier-content on multiple "
2921 "levels: {!r} and {!r}".format(sense, sense1),
2922 sortid="page/2170",
2923 )
2924 parse_linkage_recurse(node.children, field, sense1)
2925 elif "NavFrame" in classes:
2926 # NavFrame uses previously assigned next_navframe_sense
2927 # (from a "(sense):" item) and clears it afterwards
2928 parse_linkage_recurse(
2929 node.children, field, sense or next_navframe_sense
2930 )
2931 next_navframe_sense = None
2932 else:
2933 parse_linkage_recurse(node.children, field, sense)
2934 elif kind in LEVEL_KINDS: 2934 ↛ 2936line 2934 didn't jump to line 2936 because the condition on line 2934 was never true
2935 # Just recurse to any possible subsections
2936 parse_linkage_recurse(node.children, field, sense)
2937 elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
2938 # Skip these on top level; at least sometimes bold is
2939 # used for indicating a subtitle
2940 continue
2941 elif kind == NodeKind.LINK: 2941 ↛ 2947line 2941 didn't jump to line 2947 because the condition on line 2941 was always true
2942 # Recurse into the last argument
2943 # Apparently ":/" is used as a link to "/", so strip
2944 # initial value
2945 parse_linkage_recurse(node.largs[-1], field, sense)
2946 else:
2947 wxr.wtp.debug(
2948 "parse_linkage_recurse unhandled {}: {}".format(
2949 kind, node
2950 ),
2951 sortid="page/2196",
2952 )
2954 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
2955 nonlocal have_panel_template
2956 if is_panel_template(wxr, name):
2957 have_panel_template = True
2958 return ""
2959 return None
2961 # Main body of parse_linkage()
2962 l_nodes = []
2963 l_sense = ""
2964 for node in linkagenode.children:
2965 if (
2966 isinstance(node, TemplateNode)
2967 and node.template_name == "zh-dial"
2968 ):
2969 extract_zh_dial_template(wxr, data, node, l_sense)
2970 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
2971 for list_item in node.find_child(NodeKind.LIST_ITEM):
2972 for t_node in list_item.find_child(NodeKind.TEMPLATE):
2973 if t_node.template_name in ["s", "sense"]:
2974 l_sense = clean_node(wxr, None, t_node).strip(
2975 "(): "
2976 )
2977 l_nodes.append(node)
2978 else:
2979 l_nodes.append(node)
2980 text = wxr.wtp.node_to_wikitext(l_nodes)
2981 parsed = wxr.wtp.parse(
2982 text, expand_all=True, template_fn=linkage_template_fn1
2983 )
2984 parse_linkage_recurse(parsed.children, field, None)
2985 if not data.get(field) and not have_panel_template:
2986 text = "".join(toplevel_text).strip()
2987 if "\n" not in text and "," in text and text.count(",") > 3:
2988 if not text.startswith("See "): 2988 ↛ exitline 2988 didn't return from function 'parse_linkage' because the condition on line 2988 was always true
2989 parse_linkage_item([text], field, None)
2991 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2992 """Parses translations for a word. This may also pull in translations
2993 from separate translation subpages."""
2994 assert isinstance(data, dict)
2995 assert isinstance(xlatnode, WikiNode)
2996 # print("===== PARSE_TRANSLATIONS {} {} {}"
2997 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2998 # print("parse_translations xlatnode={}".format(xlatnode))
2999 if not wxr.config.capture_translations: 2999 ↛ 3000line 2999 didn't jump to line 3000 because the condition on line 2999 was never true
3000 return
3001 sense_parts: list[Union[WikiNode, str]] = []
3002 sense: Optional[str] = None
3004 def parse_translation_item(
3005 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
3006 ) -> None:
3007 nonlocal sense
3008 assert isinstance(contents, list)
3009 assert lang is None or isinstance(lang, str)
3010 # print("PARSE_TRANSLATION_ITEM:", contents)
3012 langcode: Optional[str] = None
3013 if sense is None:
3014 sense = clean_node(wxr, data, sense_parts).strip()
3015 # print("sense <- clean_node: ", sense)
3016 idx = sense.find("See also translations at")
3017 if idx > 0: 3017 ↛ 3018line 3017 didn't jump to line 3018 because the condition on line 3017 was never true
3018 wxr.wtp.debug(
3019 "Skipping translation see also: {}".format(sense),
3020 sortid="page/2361",
3021 )
3022 sense = sense[:idx].strip()
3023 if sense.endswith(":"): 3023 ↛ 3024line 3023 didn't jump to line 3024 because the condition on line 3023 was never true
3024 sense = sense[:-1].strip()
3025 if sense.endswith("—"): 3025 ↛ 3026line 3025 didn't jump to line 3026 because the condition on line 3025 was never true
3026 sense = sense[:-1].strip()
3027 translations_from_template: list[str] = []
3029 def translation_item_template_fn(
3030 name: str, ht: TemplateArgs
3031 ) -> Optional[str]:
3032 nonlocal langcode
3033 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
3034 if is_panel_template(wxr, name):
3035 return ""
3036 if name in ("t+check", "t-check", "t-needed"):
3037 # We ignore these templates. They seem to have outright
3038 # garbage in some entries, and widely varying formatting in
3039 # others. These should be transitory and unreliable
3040 # anyway.
3041 return "__IGNORE__"
3042 if name in ("t", "t+", "t-simple", "tt", "tt+"):
3043 code = ht.get(1)
3044 if code: 3044 ↛ 3054line 3044 didn't jump to line 3054 because the condition on line 3044 was always true
3045 if langcode and code != langcode:
3046 wxr.wtp.debug(
3047 "inconsistent language codes {} vs "
3048 "{} in translation item: {!r} {}".format(
3049 langcode, code, name, ht
3050 ),
3051 sortid="page/2386",
3052 )
3053 langcode = code
3054 tr = ht.get(2)
3055 if tr:
3056 tr = clean_node(wxr, None, [tr])
3057 translations_from_template.append(tr)
3058 return None
3059 if name == "t-egy":
3060 langcode = "egy"
3061 return None
3062 if name == "ttbc":
3063 code = ht.get(1)
3064 if code: 3064 ↛ 3066line 3064 didn't jump to line 3066 because the condition on line 3064 was always true
3065 langcode = code
3066 return None
3067 if name == "trans-see": 3067 ↛ 3068line 3067 didn't jump to line 3068 because the condition on line 3067 was never true
3068 wxr.wtp.error(
3069 "UNIMPLEMENTED trans-see template", sortid="page/2405"
3070 )
3071 return ""
3072 if name.endswith("-top"): 3072 ↛ 3073line 3072 didn't jump to line 3073 because the condition on line 3072 was never true
3073 return ""
3074 if name.endswith("-bottom"): 3074 ↛ 3075line 3074 didn't jump to line 3075 because the condition on line 3074 was never true
3075 return ""
3076 if name.endswith("-mid"): 3076 ↛ 3077line 3076 didn't jump to line 3077 because the condition on line 3076 was never true
3077 return ""
3078 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
3079 # .format(name),
3080 # sortid="page/2414")
3081 return None
3083 sublists = list(
3084 x
3085 for x in contents
3086 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
3087 )
3088 contents = list(
3089 x
3090 for x in contents
3091 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
3092 )
3094 item = clean_node(
3095 wxr, data, contents, template_fn=translation_item_template_fn
3096 )
3097 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
3099 # Parse the translation item.
3100 if item: 3100 ↛ exitline 3100 didn't return from function 'parse_translation_item' because the condition on line 3100 was always true
3101 lang = parse_translation_item_text(
3102 wxr,
3103 word,
3104 data,
3105 item,
3106 sense,
3107 lang,
3108 langcode,
3109 translations_from_template,
3110 is_reconstruction,
3111 )
3113 # Handle sublists. They are frequently used for different
3114 # scripts for the language and different variants of the
3115 # language. We will include the lower-level header as a
3116 # tag in those cases.
3117 for listnode in sublists:
3118 assert listnode.kind == NodeKind.LIST
3119 for node in listnode.children:
3120 if not isinstance(node, WikiNode): 3120 ↛ 3121line 3120 didn't jump to line 3121 because the condition on line 3120 was never true
3121 continue
3122 if node.kind == NodeKind.LIST_ITEM: 3122 ↛ 3119line 3122 didn't jump to line 3119 because the condition on line 3122 was always true
3123 parse_translation_item(node.children, lang=lang)
3125 def parse_translation_template(node: WikiNode) -> None:
3126 assert isinstance(node, WikiNode)
3128 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3129 nonlocal sense_parts
3130 nonlocal sense
3131 if is_panel_template(wxr, name):
3132 return ""
3133 if name == "see also":
3134 # XXX capture
3135 # XXX for example, "/" has top-level list containing
3136 # see also items. So we should also parse those.
3137 return ""
3138 if name == "trans-see":
3139 # XXX capture
3140 return ""
3141 if name == "see translation subpage": 3141 ↛ 3142line 3141 didn't jump to line 3142 because the condition on line 3141 was never true
3142 sense_parts = []
3143 sense = None
3144 sub = ht.get(1, "")
3145 if sub:
3146 m = re.match(
3147 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
3148 )
3149 else:
3150 m = None
3151 etym = ""
3152 etym_numbered = ""
3153 pos = ""
3154 if m:
3155 etym_numbered = m.group(1)
3156 etym = m.group(2)
3157 pos = m.group(3)
3158 if not sub:
3159 wxr.wtp.debug(
3160 "no part-of-speech in "
3161 "{{see translation subpage|...}}, "
3162 "defaulting to just wxr.wtp.section "
3163 "(= language)",
3164 sortid="page/2468",
3165 )
3166 # seq sent to get_subpage_section without sub and pos
3167 seq = [
3168 language,
3169 TRANSLATIONS_TITLE,
3170 ]
3171 elif (
3172 m
3173 and etym.lower().strip() in ETYMOLOGY_TITLES
3174 and pos.lower() in POS_TITLES
3175 ):
3176 seq = [
3177 language,
3178 etym_numbered,
3179 pos,
3180 TRANSLATIONS_TITLE,
3181 ]
3182 elif sub.lower() in POS_TITLES:
3183 # seq with sub but not pos
3184 seq = [
3185 language,
3186 sub,
3187 TRANSLATIONS_TITLE,
3188 ]
3189 else:
3190 # seq with sub and pos
3191 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3192 if pos.lower() not in POS_TITLES:
3193 wxr.wtp.debug(
3194 "unhandled see translation subpage: "
3195 "language={} sub={} "
3196 "wxr.wtp.subsection={}".format(
3197 language, sub, wxr.wtp.subsection
3198 ),
3199 sortid="page/2478",
3200 )
3201 seq = [language, sub, pos, TRANSLATIONS_TITLE]
3202 subnode = get_subpage_section(
3203 wxr.wtp.title or "MISSING_TITLE",
3204 TRANSLATIONS_TITLE,
3205 [seq],
3206 )
3207 if subnode is None or not isinstance(subnode, WikiNode):
3208 # Failed to find the normal subpage section
3209 # seq with sub and pos
3210 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3211 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
3212 seqs: list[list[str] | tuple[str, ...]] = [
3213 [TRANSLATIONS_TITLE],
3214 [language, pos],
3215 ]
3216 subnode = get_subpage_section(
3217 wxr.wtp.title or "MISSING_TITLE",
3218 TRANSLATIONS_TITLE,
3219 seqs,
3220 )
3221 if subnode is not None and isinstance(subnode, WikiNode):
3222 parse_translations(data, subnode)
3223 return ""
3224 if name in (
3225 "c",
3226 "C",
3227 "categorize",
3228 "cat",
3229 "catlangname",
3230 "topics",
3231 "top",
3232 "qualifier",
3233 "cln",
3234 ):
3235 # These are expanded in the default way
3236 return None
3237 if name in (
3238 "trans-top",
3239 "trans-top-see",
3240 ):
3241 # XXX capture id from trans-top? Capture sense here
3242 # instead of trying to parse it from expanded content?
3243 if ht.get(1):
3244 sense_parts = []
3245 sense = ht.get(1)
3246 else:
3247 sense_parts = []
3248 sense = None
3249 return None
3250 if name in (
3251 "trans-bottom",
3252 "trans-mid",
3253 "checktrans-mid",
3254 "checktrans-bottom",
3255 ):
3256 return None
3257 if name == "checktrans-top":
3258 sense_parts = []
3259 sense = None
3260 return ""
3261 if name == "trans-top-also":
3262 # XXX capture?
3263 sense_parts = []
3264 sense = None
3265 return ""
3266 wxr.wtp.error(
3267 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3268 name, ht
3269 ),
3270 sortid="page/2517",
3271 )
3272 return ""
3274 wxr.wtp.expand(
3275 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3276 )
3278 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3279 nonlocal sense
3280 nonlocal sense_parts
3281 for node in xlatnode.children:
3282 # print(node)
3283 if isinstance(node, str):
3284 if sense:
3285 if not node.isspace():
3286 wxr.wtp.debug(
3287 "skipping string in the middle of "
3288 "translations: {}".format(node),
3289 sortid="page/2530",
3290 )
3291 continue
3292 # Add a part to the sense
3293 sense_parts.append(node)
3294 sense = None
3295 continue
3296 assert isinstance(node, WikiNode)
3297 kind = node.kind
3298 if kind == NodeKind.LIST:
3299 for item in node.children:
3300 if not isinstance(item, WikiNode): 3300 ↛ 3301line 3300 didn't jump to line 3301 because the condition on line 3300 was never true
3301 continue
3302 if item.kind != NodeKind.LIST_ITEM: 3302 ↛ 3303line 3302 didn't jump to line 3303 because the condition on line 3302 was never true
3303 continue
3304 if item.sarg == ":": 3304 ↛ 3305line 3304 didn't jump to line 3305 because the condition on line 3304 was never true
3305 continue
3306 parse_translation_item(item.children)
3307 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3307 ↛ 3311line 3307 didn't jump to line 3311 because the condition on line 3307 was never true
3308 # Silently skip list items that are just indented; these
3309 # are used for text between translations, such as indicating
3310 # translations that need to be checked.
3311 pass
3312 elif kind == NodeKind.TEMPLATE:
3313 parse_translation_template(node)
3314 elif kind in ( 3314 ↛ 3319line 3314 didn't jump to line 3319 because the condition on line 3314 was never true
3315 NodeKind.TABLE,
3316 NodeKind.TABLE_ROW,
3317 NodeKind.TABLE_CELL,
3318 ):
3319 parse_translation_recurse(node)
3320 elif kind == NodeKind.HTML:
3321 if node.attrs.get("class") == "NavFrame": 3321 ↛ 3327line 3321 didn't jump to line 3327 because the condition on line 3321 was never true
3322 # Reset ``sense_parts`` (and force recomputing
3323 # by clearing ``sense``) as each NavFrame specifies
3324 # its own sense. This helps eliminate garbage coming
3325 # from text at the beginning at the translations
3326 # section.
3327 sense_parts = []
3328 sense = None
3329 # for item in node.children:
3330 # if not isinstance(item, WikiNode):
3331 # continue
3332 # parse_translation_recurse(item)
3333 parse_translation_recurse(node)
3334 elif kind in LEVEL_KINDS: 3334 ↛ 3336line 3334 didn't jump to line 3336 because the condition on line 3334 was never true
3335 # Sub-levels will be recursed elsewhere
3336 pass
3337 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3338 parse_translation_recurse(node)
3339                elif kind == NodeKind.PREFORMATTED:
3340 print("parse_translation_recurse: PREFORMATTED:", node)
3341                elif kind == NodeKind.LINK:
3342 arg0 = node.largs[0]
3343 # Kludge: I've seen occasional normal links to translation
3344 # subpages from main pages (e.g., language/English/Noun
3345 # in July 2021) instead of the normal
3346 # {{see translation subpage|...}} template. This should
3347 # handle them. Note: must be careful not to read other
3348 # links, particularly things like in "human being":
3349 # "a human being -- see [[man/translations]]" (group title)
3350                    if (
3351 isinstance(arg0, (list, tuple))
3352 and arg0
3353 and isinstance(arg0[0], str)
3354 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3355 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3356 == wxr.wtp.title
3357 ):
3358 wxr.wtp.debug(
3359 "translations subpage link found on main "
3360 "page instead "
3361 "of normal {{see translation subpage|...}}",
3362 sortid="page/2595",
3363 )
3364 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3365 if sub.lower() in POS_TITLES:
3366 seq = [
3367 language,
3368 sub,
3369 TRANSLATIONS_TITLE,
3370 ]
3371 subnode = get_subpage_section(
3372 wxr.wtp.title,
3373 TRANSLATIONS_TITLE,
3374 [seq],
3375 )
3376 if subnode is not None and isinstance(
3377 subnode, WikiNode
3378 ):
3379 parse_translations(data, subnode)
3380 else:
3381 wxr.wtp.error(
3382 "/translations link outside part-of-speech"
3383 )
3385 if (
3386 len(arg0) >= 1
3387 and isinstance(arg0[0], str)
3388 and not arg0[0].lower().startswith("category:")
3389 ):
3390 for x in node.largs[-1]:
3391                            if isinstance(x, str):
3392 sense_parts.append(x)
3393 else:
3394 parse_translation_recurse(x)
3395 elif not sense:
3396 sense_parts.append(node)
3397 else:
3398 wxr.wtp.debug(
3399 "skipping text between translation items/senses: "
3400 "{}".format(node),
3401 sortid="page/2621",
3402 )
3404        # Main code of parse_translations(). We want ``sense`` to be assigned
3405        # regardless of recursion level, so it is defined at this level and the
3406        # recursion happens in parse_translation_recurse().
3407 parse_translation_recurse(xlatnode)
3409 def parse_etymology(data: WordData, node: WikiNode) -> None:
3410 """Parses an etymology section."""
3411 assert isinstance(data, dict)
3412 assert isinstance(node, WikiNode)
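        # Note (judging from the code below): this function may populate
        # data["etymology_text"], data["etymology_templates"] (a list of
        # {"name", "args", "expansion"} dicts) and, for zh-x/zh-q templates,
        # data["etymology_examples"].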
3414 templates: list[TemplateData] = []
3416 # Counter for preventing the capture of etymology templates
3417 # when we are inside templates that we want to ignore (i.e.,
3418 # not capture).
3419 ignore_count = 0
3421 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3422 nonlocal ignore_count
3423 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3424 return ""
3425 if re.match(ignored_etymology_templates_re, name):
3426 ignore_count += 1
3427 return None
3429 # CONTINUE_HERE
3431 def etym_post_template_fn(
3432 name: str, ht: TemplateArgs, expansion: str
3433 ) -> None:
3434 nonlocal ignore_count
3435 if name in wikipedia_templates:
3436 parse_wikipedia_template(wxr, data, ht)
3437 return None
3438 if re.match(ignored_etymology_templates_re, name):
3439 ignore_count -= 1
3440 return None
3441            if ignore_count == 0:
3442 ht = clean_template_args(wxr, ht)
3443 expansion = clean_node(wxr, None, expansion)
3444 templates.append(
3445 {"name": name, "args": ht, "expansion": expansion}
3446 )
3447 return None
3449 # Remove any subsections
3450 contents = list(
3451 x
3452 for x in node.children
3453 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3454 )
3455 # Convert to text, also capturing templates using post_template_fn
3456 text = clean_node(
3457 wxr,
3458 None,
3459 contents,
3460 template_fn=etym_template_fn,
3461 post_template_fn=etym_post_template_fn,
3462 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3463 # Save the collected information.
3464 if len(text) > 0:
3465 data["etymology_text"] = text
3466 if len(templates) > 0:
3467 # Some etymology templates, like Template:root do not generate
3468 # text, so they should be added here. Elsewhere, we check
3469 # for Template:root and add some text to the expansion to please
3470 # the validation.
3471 data["etymology_templates"] = templates
3473        for child_node in node.find_child_recursively(
3474 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3475 ):
3476 if child_node.kind in LEVEL_KIND_FLAGS:
3477 break
3478            elif isinstance(
3479 child_node, TemplateNode
3480 ) and child_node.template_name in ["zh-x", "zh-q"]:
3481 if "etymology_examples" not in data:
3482 data["etymology_examples"] = []
3483 data["etymology_examples"].extend(
3484 extract_template_zh_x(
3485 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3486 )
3487 )
3489 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3490 """This recurses into a subtree in the parse tree for a page."""
3491 nonlocal etym_data
3492 nonlocal pos_data
3493 nonlocal inside_level_four
3495 redirect_list: list[str] = [] # for `zh-see` template
3497 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3498 """This is called for otherwise unprocessed parts of the page.
3499 We still expand them so that e.g. Category links get captured."""
3500 if name in wikipedia_templates:
3501 data = select_data()
3502 parse_wikipedia_template(wxr, data, ht)
3503 return None
3504 if is_panel_template(wxr, name):
3505 return ""
3506 return None
3508 for node in treenode.children:
3509 if not isinstance(node, WikiNode):
3510 # print(" X{}".format(repr(node)[:40]))
3511 continue
3512 if isinstance(node, TemplateNode):
3513 if process_soft_redirect_template(wxr, node, redirect_list):
3514 continue
3515 elif node.template_name == "zh-forms":
3516 extract_zh_forms_template(wxr, node, select_data())
3518 if node.kind not in LEVEL_KINDS:
3519 # XXX handle e.g. wikipedia links at the top of a language
3520 # XXX should at least capture "also" at top of page
3521 if node.kind in (
3522 NodeKind.HLINE,
3523 NodeKind.LIST,
3524 NodeKind.LIST_ITEM,
3525 ):
3526 continue
3527 # print(" UNEXPECTED: {}".format(node))
3528 # Clean the node to collect category links
3529 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3530 continue
3531 t = clean_node(
3532 wxr, etym_data, node.sarg if node.sarg else node.largs
3533 )
3534 t = t.lower()
3535 # XXX these counts were never implemented fully, and even this
3536 # gets discarded: Search STATISTICS_IMPLEMENTATION
3537 wxr.config.section_counts[t] += 1
3538 # print("PROCESS_CHILDREN: T:", repr(t))
3539 if t in IGNORED_TITLES:
3540 pass
3541 elif t.startswith(PRONUNCIATION_TITLE):
3542 # Chinese Pronunciation section kludge; we demote these to
3543 # be level 4 instead of 3 so that they're part of a larger
3544 # etymology hierarchy; usually the data here is empty and
3545 # acts as an inbetween between POS and Etymology data
3546 inside_level_four = True
3547 if t.startswith(PRONUNCIATION_TITLE + " "):
3548 # Pronunciation 1, etc, are used in Chinese Glyphs,
3549 # and each of them may have senses under Definition
3550 push_level_four_section(True)
3551 wxr.wtp.start_subsection(None)
3552                if wxr.config.capture_pronunciation:
3553 data = select_data()
3554 parse_pronunciation(
3555 wxr,
3556 node,
3557 data,
3558 etym_data,
3559 have_etym,
3560 base_data,
3561 lang_code,
3562 )
3563 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3564 push_etym()
3565 wxr.wtp.start_subsection(None)
3566                if wxr.config.capture_etymologies:
3567 m = re.search(r"\s(\d+)$", t)
3568 if m:
3569 etym_data["etymology_number"] = int(m.group(1))
3570 parse_etymology(etym_data, node)
3571 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3572 data = select_data()
3573 extract_descendant_section(wxr, data, node, False)
3574 elif (
3575 t in PROTO_ROOT_DERIVED_TITLES
3576 and pos == "root"
3577 and is_reconstruction
3578 and wxr.config.capture_descendants
3579 ):
3580 data = select_data()
3581 extract_descendant_section(wxr, data, node, True)
3582 elif t == TRANSLATIONS_TITLE:
3583 data = select_data()
3584 parse_translations(data, node)
3585 elif t in INFLECTION_TITLES:
3586 parse_inflection(node, t, pos)
3587 elif t == "alternative forms":
3588 extract_alt_form_section(wxr, select_data(), node)
3589 else:
3590 lst = t.split()
3591                while len(lst) > 1 and lst[-1].isdigit():
3592 lst = lst[:-1]
3593 t_no_number = " ".join(lst).lower()
3594 if t_no_number in POS_TITLES:
3595 push_pos()
3596 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3597 pos = dt["pos"] or "MISSING_POS"
3598 wxr.wtp.start_subsection(t)
3599 if "debug" in dt:
3600 wxr.wtp.debug(
3601 "{} in section {}".format(dt["debug"], t),
3602 sortid="page/2755",
3603 )
3604                    if "warning" in dt:
3605 wxr.wtp.wiki_notice(
3606 "{} in section {}".format(dt["warning"], t),
3607 sortid="page/2759",
3608 )
3609                    if "error" in dt:
3610 wxr.wtp.error(
3611 "{} in section {}".format(dt["error"], t),
3612 sortid="page/2763",
3613 )
3614                    if "note" in dt:
3615 wxr.wtp.note(
3616 "{} in section {}".format(dt["note"], t),
3617 sortid="page/20251017a",
3618 )
3619                    if "wiki_notice" in dt:
3620 wxr.wtp.wiki_notice(
3621 "{} in section {}".format(dt["wiki_notice"], t),
3622 sortid="page/20251017b",
3623 )
3624 # Parse word senses for the part-of-speech
3625 parse_part_of_speech(node, pos)
3626 if "tags" in dt:
3627 for pdata in sense_datas:
3628 data_extend(pdata, "tags", dt["tags"])
3629 elif t_no_number in LINKAGE_TITLES:
3630 # print(f"LINKAGE_TITLES NODE {node=}")
3631 rel = LINKAGE_TITLES[t_no_number]
3632 data = select_data()
3633 parse_linkage(data, rel, node)
3634 elif t_no_number == COMPOUNDS_TITLE:
3635 data = select_data()
3636                    if wxr.config.capture_compounds:
3637 parse_linkage(data, "derived", node)
3639 # XXX parse interesting templates also from other sections. E.g.,
3640 # {{Letter|...}} in ===See also===
3641 # Also <gallery>
3643 # Recurse to children of this node, processing subtitles therein
3644 stack.append(t)
3645 process_children(node, pos)
3646 stack.pop()
3648 if len(redirect_list) > 0:
3649 if len(pos_data) > 0:
3650 pos_data["redirects"] = redirect_list
3651                if "pos" not in pos_data:
3652 pos_data["pos"] = "soft-redirect"
3653 else:
3654 new_page_data = copy.deepcopy(base_data)
3655 new_page_data["redirects"] = redirect_list
3656                if "pos" not in new_page_data:
3657 new_page_data["pos"] = "soft-redirect"
3658 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3659 page_datas.append(new_page_data)
3661 def extract_examples(
3662 others: list[WikiNode], sense_base: SenseData
3663 ) -> list[ExampleData]:
3664 """Parses through a list of definitions and quotes to find examples.
3665 Returns a list of example dicts to be added to sense data. Adds
3666 meta-data, mostly categories, into sense_base."""
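        # Note (judging from the dict assembled at the end of this function):
        # each returned ExampleData may carry "text", "ref", "translation"
        # (plus the deprecated "english"), "type", "note", "roman" and "ruby".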
3667 assert isinstance(others, list)
3668 examples: list[ExampleData] = []
3670 for sub in others:
3671            if not sub.sarg.endswith((":", "*")):
3672 continue
3673 for item in sub.children:
3674                if not isinstance(item, WikiNode):
3675 continue
3676                if item.kind != NodeKind.LIST_ITEM:
3677 continue
3678 usex_type = None
3679 example_template_args = []
3680 example_template_names = []
3681 taxons = set()
3683 # Bypass this function when parsing Chinese, Japanese and
3684 # quotation templates.
3685 new_example_lists = extract_example_list_item(
3686 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3687 )
3688 if len(new_example_lists) > 0:
3689 examples.extend(new_example_lists)
3690 continue
3692 def usex_template_fn(
3693 name: str, ht: TemplateArgs
3694 ) -> Optional[str]:
3695 nonlocal usex_type
3696 if is_panel_template(wxr, name):
3697 return ""
3698 if name in usex_templates:
3699 usex_type = "example"
3700 example_template_args.append(ht)
3701 example_template_names.append(name)
3702 elif name in quotation_templates:
3703 usex_type = "quotation"
3704                    elif name in taxonomy_templates:
3705 taxons.update(ht.get(1, "").split())
3706 for prefix in template_linkages_to_ignore_in_examples:
3707 if re.search(
3708 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3709 ):
3710 return ""
3711 return None
3713 # bookmark
3714 ruby: list[tuple[str, str]] = []
3715 contents = item.children
3716 if lang_code == "ja":
3717 # Capture ruby contents if this is a Japanese language
3718 # example.
3719 # print(contents)
3720                    if (
3721 contents
3722 and isinstance(contents, str)
3723 and re.match(r"\s*$", contents[0])
3724 ):
3725 contents = contents[1:]
3726 exp = wxr.wtp.parse(
3727 wxr.wtp.node_to_wikitext(contents),
3728 # post_template_fn=head_post_template_fn,
3729 expand_all=True,
3730 )
3731 rub, rest = extract_ruby(wxr, exp.children)
3732 if rub:
3733 for rtup in rub:
3734 ruby.append(rtup)
3735 contents = rest
3736 subtext = clean_node(
3737 wxr, sense_base, contents, template_fn=usex_template_fn
3738 )
3740 frozen_taxons = frozenset(taxons)
3741 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3743 # print(f"{subtext=}")
3744 subtext = re.sub(
3745 r"\s*\(please add an English "
3746 r"translation of this "
3747 r"(example|usage example|quote)\)",
3748 "",
3749 subtext,
3750 ).strip()
3751 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3752 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3753 # print("subtext:", repr(subtext))
3755 lines = subtext.splitlines()
3756 # print(lines)
3758 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3759 lines = list(
3760 x
3761 for x in lines
3762 if not re.match(
3763 r"(Synonyms: |Antonyms: |Hyponyms: |"
3764 r"Synonym: |Antonym: |Hyponym: |"
3765 r"Hypernyms: |Derived terms: |"
3766 r"Related terms: |"
3767 r"Hypernym: |Derived term: |"
3768 r"Coordinate terms:|"
3769 r"Related term: |"
3770 r"For more quotations using )",
3771 x,
3772 )
3773 )
3774 tr = ""
3775 ref = ""
3776 roman = ""
3777 # for line in lines:
3778 # print("LINE:", repr(line))
3779 # print(classify_desc(line))
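                # Sketch of the splitting heuristics below (hypothetical
                # line): a single non-English line like "perro ― dog" is
                # split on the dash into example text and English
                # translation; with three parts the middle one is treated
                # as a romanization.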
3780 if len(lines) == 1 and lang_code != "en":
3781 parts = example_splitter_re.split(lines[0])
3782                    if (
3783 len(parts) > 2
3784 and len(example_template_args) == 1
3785 and any(
3786 ("―" in s) or ("—" in s)
3787 for s in example_template_args[0].values()
3788 )
3789 ):
3790 if nparts := synch_splits_with_args(
3791 lines[0], example_template_args[0]
3792 ):
3793 parts = nparts
3794                    if (
3795 len(example_template_args) == 1
3796 and "lit" in example_template_args[0]
3797 ):
3798 # ugly brute-force kludge in case there's a lit= arg
3799 literally = example_template_args[0].get("lit", "")
3800 if literally:
3801 literally = (
3802 " (literally, “"
3803 + clean_value(wxr, literally)
3804 + "”)"
3805 )
3806 else:
3807 literally = ""
3808                    if (
3809 len(example_template_args) == 1
3810 and len(parts) == 2
3811 and len(example_template_args[0])
3812 - (
3813 # horrible kludge to ignore these arguments
3814 # when calculating how many there are
3815 sum(
3816 s in example_template_args[0]
3817 for s in (
3818 "lit", # generates text, but we handle it
3819 "inline",
3820 "noenum",
3821 "nocat",
3822 "sort",
3823 )
3824 )
3825 )
3826 == 3
3827 and clean_value(
3828 wxr, example_template_args[0].get(2, "")
3829 )
3830 == parts[0].strip()
3831 and clean_value(
3832 wxr,
3833 (
3834 example_template_args[0].get(3)
3835 or example_template_args[0].get("translation")
3836 or example_template_args[0].get("t", "")
3837 )
3838 + literally, # in case there's a lit= argument
3839 )
3840 == parts[1].strip()
3841 ):
3842 # {{exampletemplate|ex|Foo bar baz|English translation}}
3843 # is a pretty reliable 'heuristic', so we use it here
3844 # before the others. To be extra sure the template
3845 # doesn't do anything weird, we compare the arguments
3846 # and the output to each other.
3847 lines = [parts[0].strip()]
3848 tr = parts[1].strip()
3849 elif (
3850 len(parts) == 2
3851 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3852 ):
3853 # These other branches just do some simple heuristics w/
3854 # the expanded output of the template (if applicable).
3855 lines = [parts[0].strip()]
3856 tr = parts[1].strip()
3857                    elif (
3858 len(parts) == 3
3859 and classify_desc2(parts[1])
3860 in ("romanization", "english")
3861 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3862 ):
3863 lines = [parts[0].strip()]
3864 roman = parts[1].strip()
3865 tr = parts[2].strip()
3866 else:
3867 parts = re.split(r"\s+-\s+", lines[0])
3868                        if (
3869 len(parts) == 2
3870 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3871 ):
3872 lines = [parts[0].strip()]
3873 tr = parts[1].strip()
3874 elif len(lines) > 1:
3875 if any(
3876 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3877 ) and not (len(example_template_names) == 1):
3878 refs: list[str] = []
3879                        for i in range(len(lines)):
3880                            if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
3881 break
3882 refs.append(lines[i].strip())
3883 if re.search(r"[]\d:)]\s*$", lines[i]):
3884 break
3885 ref = " ".join(refs)
3886 lines = lines[i + 1 :]
3887 if (
3888 lang_code != "en"
3889 and len(lines) >= 2
3890 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3891 ):
3892 i = len(lines) - 1
3893                        while (
3894 i > 1
3895 and classify_desc2(lines[i - 1])
3896 in ENGLISH_TEXTS
3897 ):
3898 i -= 1
3899 tr = "\n".join(lines[i:])
3900 lines = lines[:i]
3901 if len(lines) >= 2:
3902 if classify_desc2(lines[-1]) == "romanization":
3903 roman = lines[-1].strip()
3904 lines = lines[:-1]
3906 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3907 ref = lines[0]
3908 lines = lines[1:]
3909 elif lang_code != "en" and len(lines) == 2:
3910 cls1 = classify_desc2(lines[0])
3911 cls2 = classify_desc2(lines[1])
3912 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3913 tr = lines[1]
3914 lines = [lines[0]]
3915                        elif cls1 in ENGLISH_TEXTS and cls2 != "english":
3916 tr = lines[0]
3917 lines = [lines[1]]
3918                        elif (
3919 re.match(r"^[#*]*:+", lines[1])
3920 and classify_desc2(
3921 re.sub(r"^[#*:]+\s*", "", lines[1])
3922 )
3923 in ENGLISH_TEXTS
3924 ):
3925 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3926 lines = [lines[0]]
3927 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3928 # Both were classified as English, but
3929 # presumably one is not. Assume first is
3930 # non-English, as that seems more common.
3931 tr = lines[1]
3932 lines = [lines[0]]
3933 elif (
3934 usex_type != "quotation"
3935 and lang_code != "en"
3936 and len(lines) == 3
3937 ):
3938 cls1 = classify_desc2(lines[0])
3939 cls2 = classify_desc2(lines[1])
3940 cls3 = classify_desc2(lines[2])
3941 if (
3942 cls3 == "english"
3943 and cls2 in ("english", "romanization")
3944 and cls1 != "english"
3945 ):
3946 tr = lines[2].strip()
3947 roman = lines[1].strip()
3948 lines = [lines[0].strip()]
3949                    elif (
3950 usex_type == "quotation"
3951 and lang_code != "en"
3952 and len(lines) > 2
3953 ):
3954 # for x in lines:
3955 # print(" LINE: {}: {}"
3956 # .format(classify_desc2(x), x))
3957 if re.match(r"^[#*]*:+\s*$", lines[1]):
3958 ref = lines[0]
3959 lines = lines[2:]
3960 cls1 = classify_desc2(lines[-1])
3961 if cls1 == "english":
3962 i = len(lines) - 1
3963 while (
3964 i > 1
3965 and classify_desc2(lines[i - 1])
3966 == ENGLISH_TEXTS
3967 ):
3968 i -= 1
3969 tr = "\n".join(lines[i:])
3970 lines = lines[:i]
3972 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3973 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3974 tr = re.sub(r"^[#*:]+\s*", "", tr)
3975 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3976 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3977 ref = re.sub(r"^[#*:]+\s*", "", ref)
3978 ref = re.sub(
3979 r", (volume |number |page )?“?"
3980 r"\(please specify ([^)]|\(s\))*\)”?|"
3981 ", text here$",
3982 "",
3983 ref,
3984 )
3985 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3986 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3987 subtext = "\n".join(x for x in lines if x)
3988 if not tr and lang_code != "en":
3989 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3990                    if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
3991 tr = m.group(2)
3992 subtext = subtext[: m.start()] + m.group(1)
3993 elif lines:
3994 parts = re.split(r"\s*[―—]+\s*", lines[0])
3995                    if (
3996 len(parts) == 2
3997 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3998 ):
3999 subtext = parts[0].strip()
4000 tr = parts[1].strip()
4001 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
4002 subtext = re.sub(
4003 r"(please add an English translation of "
4004 r"this (quote|usage example))",
4005 "",
4006 subtext,
4007 )
4008 subtext = re.sub(
4009 r"\s*→New International Version " "translation$",
4010 "",
4011 subtext,
4012 ) # e.g. pis/Tok Pisin (Bible)
4013 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
4014 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
4015 note = None
4016 m = re.match(r"^\(([^)]*)\):\s+", subtext)
4017                if (
4018 m is not None
4019 and lang_code != "en"
4020 and (
4021 m.group(1).startswith("with ")
4022 or classify_desc2(m.group(1)) == "english"
4023 )
4024 ):
4025 note = m.group(1)
4026 subtext = subtext[m.end() :]
4027 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
4028 ref = re.sub(r",\s*→ISBN", "", ref)
4029 ref = ref.strip()
4030 if ref.endswith(":") or ref.endswith(","):
4031 ref = ref[:-1].strip()
4032 ref = re.sub(r"\s+,\s+", ", ", ref)
4033 ref = re.sub(r"\s+", " ", ref)
4034                if ref and not subtext:
4035 subtext = ref
4036 ref = ""
4037 if subtext:
4038 dt: ExampleData = {"text": subtext}
4039 if ref:
4040 dt["ref"] = ref
4041 if tr:
4042 dt["english"] = tr # DEPRECATED for "translation"
4043 dt["translation"] = tr
4044 if usex_type:
4045 dt["type"] = usex_type
4046                    if note:
4047 dt["note"] = note
4048 if roman:
4049 dt["roman"] = roman
4050 if ruby:
4051 dt["ruby"] = ruby
4052 examples.append(dt)
4054 return examples
4056 # Main code of parse_language()
4057 # Process the section
4058 stack.append(language)
4059 process_children(langnode, None)
4060 stack.pop()
4062    # Finalize word entries
4063 push_etym()
4064 ret = []
4065 for data in page_datas:
4066 merge_base(data, base_data)
4067 ret.append(data)
4069 # Copy all tags to word senses
4070 for data in ret:
4071        if "senses" not in data:
4072 continue
4073        # WordData should not have a 'tags' field, but if it does, it's
4074        # deleted and its contents are moved into each sense; that's why
4075        # the type-ignore comments are needed.
4076 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4077 if "tags" in data:
4078 del data["tags"] # type: ignore[typeddict-item]
4079 for sense in data["senses"]:
4080 data_extend(sense, "tags", tags)
4082 return ret
4085def parse_wikipedia_template(
4086 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
4087) -> None:
4088 """Helper function for parsing {{wikipedia|...}} and related templates."""
4089 assert isinstance(wxr, WiktextractContext)
4090 assert isinstance(data, dict)
4091 assert isinstance(ht, dict)
4092 langid = clean_node(wxr, data, ht.get("lang", ()))
4093 pagename = (
4094 clean_node(wxr, data, ht.get(1, ()))
4095 or wxr.wtp.title
4096 or "MISSING_PAGE_TITLE"
4097 )
4098 if langid:
4099 data_append(data, "wikipedia", langid + ":" + pagename)
4100 else:
4101 data_append(data, "wikipedia", pagename)
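# Illustrative example (hypothetical page title): on a page "kissa",
# {{wikipedia|lang=fi}} would append "fi:kissa" to data["wikipedia"],
# while {{wikipedia|Cat}} would append just "Cat".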
4104def parse_top_template(
4105 wxr: WiktextractContext, node: WikiNode, data: WordData
4106) -> None:
4107 """Parses a template that occurs on the top-level in a page, before any
4108 language subtitles."""
4109 assert isinstance(wxr, WiktextractContext)
4110 assert isinstance(node, WikiNode)
4111 assert isinstance(data, dict)
4113 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
4114 if name in wikipedia_templates:
4115 parse_wikipedia_template(wxr, data, ht)
4116 return None
4117 if is_panel_template(wxr, name):
4118 return ""
4119        if name in ("reconstruction",):
4120 return ""
4121 if name.lower() == "also" or name.lower().startswith("also/"):
4122 # XXX shows related words that might really have been the intended
4123 # word, capture them
4124 return ""
4125        if name == "see also":
4126 # XXX capture
4127 return ""
4128        if name == "cardinalbox":
4129 # XXX capture
4130 return ""
4131        if name == "character info":
4132 # XXX capture
4133 return ""
4134        if name == "commonscat":
4135 # XXX capture link to Wikimedia commons
4136 return ""
4137        if name == "wrongtitle":
4138 # XXX this should be captured to replace page title with the
4139 # correct title. E.g. ⿰亻革家
4140 return ""
4141        if name == "wikidata":
4142 arg = clean_node(wxr, data, ht.get(1, ()))
4143 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
4144 data_append(data, "wikidata", arg)
4145 return ""
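        # Illustrative example (hypothetical item id): {{wikidata|Q42}}
        # appends "Q42" to data["wikidata"]; values not starting with "Q"
        # or "Lexeme:L" are silently dropped.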
4146 wxr.wtp.debug(
4147 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
4148 sortid="page/2870",
4149 )
4150 return ""
4152 clean_node(wxr, None, [node], template_fn=top_template_fn)
4155def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
4156 """Fix subtitle hierarchy to be strict Language -> Etymology ->
4157 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
4158 that are next to each other."""
4160    # Wiktextract issue #620: a Chinese Glyph Origin section before an
4161    # Etymology section gets overwritten. In this case, just combine the two.
4163 # In Chinese entries, Pronunciation can be preceded on the
4164 # same level 3 by its Etymology *and* Glyph Origin sections:
4165 # ===Glyph Origin===
4166 # ===Etymology===
4167 # ===Pronunciation===
4168 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
4169 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
4170 # are now level 6
4172    # Known lowercase PoS names are in POS_TITLES
4173    # Known lowercase linkage section names are in LINKAGE_TITLES
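    # Remapping sketch (illustrative heading sequence): under the scheme
    # implemented below,
    #   ==English==            stays level 2
    #   ===Etymology===        stays level 3
    #   ===Pronunciation===    is demoted to level 4
    #   ====Noun====           becomes level 5
    #   =====Translations===== becomes level 6
    # so the re-emitted headings nest strictly.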
4175 old = re.split(
4176 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
4177 )
4179 parts = []
4180 npar = 4 # Number of parentheses in above expression
4181 parts.append(old[0])
4182 prev_level = None
4183 level = None
4184 skip_level_title = False # When combining etymology sections
4185 for i in range(1, len(old), npar + 1):
4186 left = old[i]
4187 right = old[i + npar - 1]
4188 # remove Wikilinks in title
4189 title = re.sub(r"^\[\[", "", old[i + 1])
4190 title = re.sub(r"\]\]$", "", title)
4191 prev_level = level
4192 level = len(left)
4193 part = old[i + npar]
4194        if level != len(right):
4195 wxr.wtp.debug(
4196 "subtitle has unbalanced levels: "
4197 "{!r} has {} on the left and {} on the right".format(
4198 title, left, right
4199 ),
4200 sortid="page/2904",
4201 )
4202 lc = title.lower()
4203 if name_to_code(title, "en") != "":
4204            if level > 2:
4205 wxr.wtp.debug(
4206 "subtitle has language name {} at level {}".format(
4207 title, level
4208 ),
4209 sortid="page/2911",
4210 )
4211 level = 2
4212 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
4213            if level > 3:
4214 wxr.wtp.debug(
4215 "etymology section {} at level {}".format(title, level),
4216 sortid="page/2917",
4217 )
4218 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
4219 # sections cheek-to-cheek
4220 skip_level_title = True
4221 # Modify the title of previous ("Glyph Origin") section, in
4222 # case we have a meaningful title like "Etymology 1"
4223 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
4224 level = 3
4225 elif lc.startswith(PRONUNCIATION_TITLE):
4226 # Pronunciation is now a level between POS and Etymology, so
4227 # we need to shift everything down by one
4228 level = 4
4229 elif lc in POS_TITLES:
4230 level = 5
4231 elif lc == TRANSLATIONS_TITLE:
4232 level = 6
4233 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
4234 level = 6
4235 elif lc in INFLECTION_TITLES:
4236 level = 6
4237 elif lc == DESCENDANTS_TITLE:
4238 level = 6
4239        elif title in PROTO_ROOT_DERIVED_TITLES:
4240 level = 6
4241 elif lc in IGNORED_TITLES:
4242 level = 6
4243 else:
4244 level = 6
4245 if skip_level_title:
4246 skip_level_title = False
4247 parts.append(part)
4248 else:
4249 parts.append("{}{}{}".format("=" * level, title, "=" * level))
4250 parts.append(part)
4251 # print("=" * level, title)
4252 # if level != len(left):
4253 # print(" FIXED LEVEL OF {} {} -> {}"
4254 # .format(title, len(left), level))
4256 text = "".join(parts)
4257 # print(text)
4258 return text
4261def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4262 # Skip translation pages
4263    if word.endswith("/" + TRANSLATIONS_TITLE):
4264 return []
4266    if wxr.config.verbose:
4267 logger.info(f"Parsing page: {word}")
4269 wxr.config.word = word
4270 wxr.wtp.start_page(word)
4272 # Remove <noinclude> and similar tags from main pages. They
4273 # should not appear there, but at least net/Elfdala has one and it
4274 # is probably not the only one.
4275 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4276 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4277 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4279 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4280 # pages that have, for example, Translations section under Linkage, or
4281 # Translations section on the same level as Noun. Enforce a proper
4282 # hierarchy by manipulating the subtitle levels in certain cases.
4283 text = fix_subtitle_hierarchy(wxr, text)
4285 # Parse the page, pre-expanding those templates that are likely to
4286 # influence parsing
4287 tree = wxr.wtp.parse(
4288 text,
4289 pre_expand=True,
4290 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4291 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4292 )
4293 # from wikitextprocessor.parser import print_tree
4294 # print("PAGE PARSE:", print_tree(tree))
4296 top_data: WordData = {}
4298 # Iterate over top-level titles, which should be languages for normal
4299 # pages
4300 by_lang = defaultdict(list)
4301 for langnode in tree.children:
4302 if not isinstance(langnode, WikiNode):
4303 continue
4304 if langnode.kind == NodeKind.TEMPLATE:
4305 parse_top_template(wxr, langnode, top_data)
4306 continue
4307 if langnode.kind == NodeKind.LINK:
4308 # Some pages have links at top level, e.g., "trees" in Wiktionary
4309 continue
4310        if langnode.kind != NodeKind.LEVEL2:
4311 wxr.wtp.debug(
4312 f"unexpected top-level node: {langnode}", sortid="page/3014"
4313 )
4314 continue
4315 lang = clean_node(
4316 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4317 )
4318 lang_code = name_to_code(lang, "en")
4319        if lang_code == "":
4320 wxr.wtp.debug(
4321 f"unrecognized language name: {lang}", sortid="page/3019"
4322 )
4323 if (
4324 wxr.config.capture_language_codes
4325 and lang_code not in wxr.config.capture_language_codes
4326 ):
4327 continue
4328 wxr.wtp.start_section(lang)
4330 # Collect all words from the page.
4331 # print(f"{langnode=}")
4332 datas = parse_language(wxr, langnode, lang, lang_code)
4334 # Propagate fields resulting from top-level templates to this
4335 # part-of-speech.
4336 for data in datas:
4337            if "lang" not in data:
4338 wxr.wtp.debug(
4339 "internal error -- no lang in data: {}".format(data),
4340 sortid="page/3034",
4341 )
4342 continue
4343 for k, v in top_data.items():
4344 assert isinstance(v, (list, tuple))
4345 data_extend(data, k, v)
4346 by_lang[data["lang"]].append(data)
4348 # XXX this code is clearly out of date. There is no longer a "conjugation"
4349 # field. FIX OR REMOVE.
4350 # Do some post-processing on the words. For example, we may distribute
4351 # conjugation information to all the words.
4352 ret = []
4353 for lang, lang_datas in by_lang.items():
4354 ret.extend(lang_datas)
4356 for x in ret:
4357 if x["word"] != word:
4358 if word.startswith("Unsupported titles/"):
4359 wxr.wtp.debug(
4360 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4361 sortid="20231101/3578page.py",
4362 )
4363 else:
4364 wxr.wtp.debug(
4365 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4366 sortid="20231101/3582page.py",
4367 )
4368 x["original_title"] = word
4369 # validate tag data
4370 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4371 return ret
4374def recursively_separate_raw_tags(
4375 wxr: WiktextractContext, data: dict[str, Any]
4376) -> None:
4377    if not isinstance(data, dict):
4378 wxr.wtp.error(
4379 "'data' is not dict; most probably "
4380 "data has a list that contains at least one dict and "
4381 "at least one non-dict item",
4382 sortid="en/page-4016/20240419",
4383 )
4384 return
4385 new_tags: list[str] = []
4386 raw_tags: list[str] = data.get("raw_tags", [])
4387 for field, val in data.items():
4388 if field == "tags":
4389 for tag in val:
4390 if tag not in valid_tags:
4391 raw_tags.append(tag)
4392 else:
4393 new_tags.append(tag)
4394 if isinstance(val, list):
4395 if len(val) > 0 and isinstance(val[0], dict):
4396 for d in val:
4397 recursively_separate_raw_tags(wxr, d)
4398 if "tags" in data and not new_tags:
4399 del data["tags"]
4400 elif new_tags:
4401 data["tags"] = new_tags
4402 if raw_tags:
4403 data["raw_tags"] = raw_tags
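# Illustrative example (hypothetical tag values): {"tags": ["plural", "odd"]}
# becomes {"tags": ["plural"], "raw_tags": ["odd"]}, assuming "plural" is in
# valid_tags and "odd" is not; the split is applied recursively to every
# nested list of dicts.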
4406def process_soft_redirect_template(
4407 wxr: WiktextractContext,
4408 template_node: TemplateNode,
4409 redirect_pages: list[str],
4410) -> bool:
4411    # Return `True` if the template is a soft-redirect template.
4412 if template_node.template_name == "zh-see":
4413 # https://en.wiktionary.org/wiki/Template:zh-see
4414 title = clean_node(
4415 wxr, None, template_node.template_parameters.get(1, "")
4416 )
4417        if title != "":
4418 redirect_pages.append(title)
4419 return True
4420 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4421 # https://en.wiktionary.org/wiki/Template:ja-see
4422 for key, value in template_node.template_parameters.items():
4423            if isinstance(key, int):
4424 title = clean_node(wxr, None, value)
4425                if title != "":
4426 redirect_pages.append(title)
4427 return True
4428 return False
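# Illustrative examples (hypothetical arguments): {{zh-see|詞}} appends "詞"
# to redirect_pages, and {{ja-see|言葉|ことば}} appends each positional
# argument; both cases return True so the caller treats the entry as a soft
# redirect.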
4431ZH_FORMS_TAGS = {
4432 "trad.": "Traditional-Chinese",
4433 "simp.": "Simplified-Chinese",
4434 "alternative forms": "alternative",
4435 "2nd round simp.": "Second-Round-Simplified-Chinese",
4436}
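# Note: these labels come from the row headers of the zh-forms table; e.g. a
# row headed "trad." yields forms tagged "Traditional-Chinese", while headers
# not listed here fall back to raw_tags in the code below.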
4439def extract_zh_forms_template(
4440 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4441):
4442 # https://en.wiktionary.org/wiki/Template:zh-forms
4443 lit_meaning = clean_node(
4444 wxr, None, t_node.template_parameters.get("lit", "")
4445 )
4446 if lit_meaning != "":
4447 base_data["literal_meaning"] = lit_meaning
4448 expanded_node = wxr.wtp.parse(
4449 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4450 )
4451 for table in expanded_node.find_child(NodeKind.TABLE):
4452 for row in table.find_child(NodeKind.TABLE_ROW):
4453 row_header = ""
4454 row_header_tags = []
4455 header_has_span = False
4456 for cell in row.find_child(
4457 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4458 ):
4459 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4460 row_header, row_header_tags, header_has_span = (
4461 extract_zh_forms_header_cell(wxr, base_data, cell)
4462 )
4463 elif not header_has_span:
4464 extract_zh_forms_data_cell(
4465 wxr, base_data, cell, row_header, row_header_tags
4466 )
4468    if "forms" in base_data and len(base_data["forms"]) == 0:
4469 del base_data["forms"]
4472def extract_zh_forms_header_cell(
4473 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4474) -> tuple[str, list[str], bool]:
4475 row_header = ""
4476 row_header_tags = []
4477 header_has_span = False
4478 first_span_index = len(header_cell.children)
4479 for index, span_tag in header_cell.find_html("span", with_index=True):
4480        if index < first_span_index:
4481 first_span_index = index
4482 header_has_span = True
4483 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4484 for raw_tag in row_header.split(" and "):
4485 raw_tag = raw_tag.strip()
4486 if raw_tag != "":
4487 row_header_tags.append(raw_tag)
4488 for span_tag in header_cell.find_html_recursively("span"):
4489 span_lang = span_tag.attrs.get("lang", "")
4490 form_nodes = []
4491 sup_title = ""
4492 for node in span_tag.children:
4493            if isinstance(node, HTMLNode) and node.tag == "sup":
4494 for sup_span in node.find_html("span"):
4495 sup_title = sup_span.attrs.get("title", "")
4496 else:
4497 form_nodes.append(node)
4498 if span_lang in ["zh-Hant", "zh-Hans"]:
4499 for word in clean_node(wxr, None, form_nodes).split("/"):
4500 if word not in [wxr.wtp.title, ""]:
4501 form = {"form": word}
4502 for raw_tag in row_header_tags:
4503                        if raw_tag in ZH_FORMS_TAGS:
4504 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4505 else:
4506 data_append(form, "raw_tags", raw_tag)
4507                    if sup_title != "":
4508 data_append(form, "raw_tags", sup_title)
4509 data_append(base_data, "forms", form)
4510 return row_header, row_header_tags, header_has_span
4513def extract_zh_forms_data_cell(
4514 wxr: WiktextractContext,
4515 base_data: WordData,
4516 cell: WikiNode,
4517 row_header: str,
4518 row_header_tags: list[str],
4519):
4520 from .zh_pron_tags import ZH_PRON_TAGS
4522 forms = []
4523 for top_span_tag in cell.find_html("span"):
4524 span_style = top_span_tag.attrs.get("style", "")
4525 span_lang = top_span_tag.attrs.get("lang", "")
4526 if span_style == "white-space:nowrap;":
4527 extract_zh_forms_data_cell(
4528 wxr, base_data, top_span_tag, row_header, row_header_tags
4529 )
4530 elif "font-size:80%" in span_style:
4531 raw_tag = clean_node(wxr, None, top_span_tag)
4532            if raw_tag != "":
4533 for form in forms:
4534                    if raw_tag in ZH_PRON_TAGS:
4535 tr_tag = ZH_PRON_TAGS[raw_tag]
4536                        if isinstance(tr_tag, list):
4537 data_extend(form, "tags", tr_tag)
4538                        elif isinstance(tr_tag, str):
4539 data_append(form, "tags", tr_tag)
4540 elif raw_tag in valid_tags:
4541 data_append(form, "tags", raw_tag)
4542 else:
4543 data_append(form, "raw_tags", raw_tag)
4544        elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
4545 word = clean_node(wxr, None, top_span_tag)
4546 if word not in ["", "/", wxr.wtp.title]:
4547 form = {"form": word}
4548                if row_header != "anagram":
4549 for raw_tag in row_header_tags:
4550                        if raw_tag in ZH_FORMS_TAGS:
4551 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4552 else:
4553 data_append(form, "raw_tags", raw_tag)
4554 if span_lang == "zh-Hant":
4555 data_append(form, "tags", "Traditional-Chinese")
4556 elif span_lang == "zh-Hans":
4557 data_append(form, "tags", "Simplified-Chinese")
4558 forms.append(form)
4560    if row_header == "anagram":
4561 for form in forms:
4562 l_data = {"word": form["form"]}
4563 for key in ["tags", "raw_tags"]:
4564 if key in form:
4565 l_data[key] = form[key]
4566 data_append(base_data, "anagrams", l_data)
4567 else:
4568 data_extend(base_data, "forms", forms)