Coverage for src/wiktextract/extractor/en/page.py: 45%
1940 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8import sys
9from collections import defaultdict
10from functools import partial
11from typing import (
12 TYPE_CHECKING,
13 Any,
14 Iterable,
15 Iterator,
16 Optional,
17 Set,
18 Union,
19 cast,
20)
22from mediawiki_langcodes import get_all_names, name_to_code
23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
24from wikitextprocessor.parser import (
25 LEVEL_KIND_FLAGS,
26 GeneralNode,
27 NodeKind,
28 TemplateNode,
29 WikiNode,
30)
32from ...clean import clean_template_args, clean_value
33from ...datautils import (
34 data_append,
35 data_extend,
36 ns_title_prefix_tuple,
37)
38from ...page import (
39 LEVEL_KINDS,
40 clean_node,
41 is_panel_template,
42 recursively_extract,
43)
44from ...tags import valid_tags
45from ...wxr_context import WiktextractContext
46from ...wxr_logging import logger
47from ..ruby import extract_ruby, parse_ruby
48from ..share import strip_nodes
49from .example import extract_example_list_item, extract_template_zh_x
50from .form_descriptions import (
51 classify_desc,
52 decode_tags,
53 distw,
54 parse_alt_or_inflection_of,
55 parse_sense_qualifier,
56 parse_word_head,
57)
58from .inflection import TableContext, parse_inflection_section
59from .info_templates import (
60 INFO_TEMPLATE_FUNCS,
61 parse_info_template_arguments,
62 parse_info_template_node,
63)
64from .linkages import parse_linkage_item_text
65from .parts_of_speech import PARTS_OF_SPEECH
66from .section_titles import (
67 COMPOUNDS_TITLE,
68 DESCENDANTS_TITLE,
69 ETYMOLOGY_TITLES,
70 IGNORED_TITLES,
71 INFLECTION_TITLES,
72 LINKAGE_TITLES,
73 POS_TITLES,
74 PRONUNCIATION_TITLE,
75 PROTO_ROOT_DERIVED_TITLES,
76 TRANSLATIONS_TITLE,
77)
78from .translations import parse_translation_item_text
79from .type_utils import (
80 DescendantData,
81 ExampleData,
82 FormData,
83 LinkageData,
84 SenseData,
85 SoundData,
86 TemplateData,
87 WordData,
88)
89from .unsupported_titles import unsupported_title_map
# classify_desc() labels that are treated as English text.  It might return
# "taxonomic" instead of "english"; taxonomic text is English 99% of the
# time, so both labels count when deciding whether a string is English.
ENGLISH_TEXTS = ("english", "taxonomic")
# Matches head template names: either one of a few fixed names, or
# "<latin or language code>-<suffix>" where the suffix is one of the
# part-of-speech-like names below, followed by "-", "/", "+" or the end of
# the name.
HEAD_TAG_RE = re.compile(
    r"^(head|Han char|arabic-noun|arabic-noun-form|"
    r"hangul-symbol|syllable-hangul)$|"
    + r"^(latin|"
    # The codes come from external data (first element of each entry of
    # get_all_names("en")); escape them so that a code can never be
    # interpreted as regex syntax.
    + "|".join(re.escape(lang_code) for lang_code, *_ in get_all_names("en"))
    + r")-("
    + "|".join(
        [
            "abbr",
            "adj",
            "adjective",
            "adjective form",
            "adjective-form",
            "adv",
            "adverb",
            "affix",
            "animal command",
            "art",
            "article",
            "aux",
            "bound pronoun",
            "bound-pronoun",
            "Buyla",
            "card num",
            "card-num",
            "cardinal",
            "chunom",
            "classifier",
            "clitic",
            "cls",
            "cmene",
            "cmavo",
            "colloq-verb",
            "colverbform",
            "combining form",
            "combining-form",
            "comparative",
            "con",
            "concord",
            "conj",
            "conjunction",
            "conjug",
            "cont",
            "contr",
            "converb",
            "daybox",
            "decl",
            "decl noun",
            "def",
            "dem",
            "det",
            "determ",
            "Deva",
            "ending",
            "entry",
            "form",
            "fuhivla",
            "gerund",
            "gismu",
            "hanja",
            "hantu",
            "hanzi",
            "head",
            "ideophone",
            "idiom",
            "inf",
            "indef",
            "infixed pronoun",
            "infixed-pronoun",
            "infl",
            "inflection",
            "initialism",
            "int",
            "interfix",
            "interj",
            "interjection",
            "jyut",
            "latin",
            "letter",
            "locative",
            "lujvo",
            "monthbox",
            "mutverb",
            "name",
            "nisba",
            "nom",
            "noun",
            "noun form",
            "noun-form",
            "noun plural",
            "noun-plural",
            "nounprefix",
            "num",
            "number",
            "numeral",
            "ord",
            "ordinal",
            "par",
            "part",
            "part form",
            "part-form",
            "participle",
            "particle",
            "past",
            "past neg",
            "past-neg",
            "past participle",
            "past-participle",
            "perfect participle",
            "perfect-participle",
            "personal pronoun",
            "personal-pronoun",
            "pref",
            "prefix",
            "phrase",
            "pinyin",
            "plural noun",
            "plural-noun",
            "pos",
            "poss-noun",
            "post",
            "postp",
            "postposition",
            "PP",
            "pp",
            "ppron",
            "pred",
            "predicative",
            "prep",
            "prep phrase",
            "prep-phrase",
            "preposition",
            "present participle",
            "present-participle",
            "pron",
            "prondem",
            "pronindef",
            "pronoun",
            "prop",
            "proper noun",
            "proper-noun",
            "proper noun form",
            "proper-noun form",
            "proper noun-form",
            "proper-noun-form",
            "prov",
            "proverb",
            "prpn",
            "prpr",
            "punctuation mark",
            "punctuation-mark",
            "regnoun",
            "rel",
            "rom",
            "romanji",
            "root",
            "sign",
            "suff",
            "suffix",
            "syllable",
            "symbol",
            "verb",
            "verb form",
            "verb-form",
            "verbal noun",
            "verbal-noun",
            "verbnec",
            "vform",
        ]
    )
    + r")(-|/|\+|$)"
)
# Head templates that cause problems (like newlines).  The template handler
# can squash them into an empty string while saving their template data for
# later.
WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
FLOATING_TABLE_TEMPLATES: set[str] = {
    # az-suffix-form creates a style=floatright div that is otherwise
    # deleted; if it is not pre-expanded, we can intercept the template,
    # so we add this set into DO_NOT_PRE_EXPAND_TEMPLATES and intercept the
    # templates in parse_part_of_speech.
    "az-suffix-forms",
    "az-inf-p",
    "kk-suffix-forms",
    "ky-suffix-forms",
    "tr-inf-p",
    "tr-suffix-forms",
    "tt-suffix-forms",
    "uz-suffix-forms",
}

# DO_NOT_PRE_EXPAND_TEMPLATES and ADDITIONAL_EXPAND_TEMPLATES should contain
# template names that should either always be pre-expanded when *first*
# processing the tree, or not be pre-expanded so that the templates are left
# in place with their identifying name intact for later filtering.
DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
    "multitrans",
    "multitrans-nowiki",
    "trans-top",
    "trans-top-also",
    "trans-bottom",
    "checktrans-top",
    "checktrans-bottom",
    "col1",
    "col2",
    "col3",
    "col4",
    "col5",
    "col1-u",
    "col2-u",
    "col3-u",
    "col4-u",
    "col5-u",
    "check deprecated lang param usage",
    "deprecated code",
    "ru-verb-alt-ё",
    "ru-noun-alt-ё",
    "ru-adj-alt-ё",
    "ru-proper noun-alt-ё",
    "ru-pos-alt-ё",
    "ru-alt-ё",
    "inflection of",
    "no deprecated lang param usage",
}
# Inverse linkage for those linkage fields that have one
linkage_inverses: dict[str, str] = {
    # XXX this is not currently used, move to post-processing
    "synonyms": "synonyms",
    "hypernyms": "hyponyms",
    "hyponyms": "hypernyms",
    "holonyms": "meronyms",
    "meronyms": "holonyms",
    "derived": "derived_from",
    "coordinate_terms": "coordinate_terms",
    "troponyms": "hypernyms",
    "antonyms": "antonyms",
    "instances": "instance_of",
    "related": "related",
}
# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES: set[str] = {
    "Character info",
    "CJKV",
    "French personal pronouns",
    "French possessive adjectives",
    "French possessive pronouns",
    "Han etym",
    "Japanese demonstratives",
    "Latn-script",
    "LDL",
    "MW1913Abbr",
    "Number-encoding",
    "Nuttall",
    "Spanish possessive adjectives",
    "Spanish possessive pronouns",
    "USRegionDisputed",
    "Webster 1913",
    "ase-rfr",
    "attention",
    "attn",
    "beer",
    "broken ref",
    "ca-compass",
    "character info",
    "character info/var",
    "checksense",
    "compass-fi",
    "copyvio suspected",
    "delete",
    "dial syn",  # Currently ignore these, but could be useful in Chinese/Korean
    "etystub",
    "examples",
    "hu-corr",
    "hu-suff-pron",
    "interwiktionary",
    "ja-kanjitab",
    "ko-hanja-search",
    "look",
    "maintenance box",
    "maintenance line",
    "mediagenic terms",
    "merge",
    "missing template",
    "morse links",
    "move",
    "multiple images",
    "no inline",
    "picdic",
    "picdicimg",
    "picdiclabel",
    "polyominoes",
    # NOTE(review): "predidential nomics" looks like a misspelling of
    # "presidential nomics" -- confirm against the wiki.  The misspelled
    # entry is kept for backward compatibility and the presumably intended
    # name is added next to it.
    "predidential nomics",
    "presidential nomics",
    "punctuation",  # This actually gets pre-expanded
    "reconstructed",
    "request box",
    "rf-sound example",
    "rfaccents",
    "rfap",
    "rfaspect",
    "rfc",
    "rfc-auto",
    "rfc-header",
    "rfc-level",
    "rfc-pron-n",
    "rfc-sense",
    "rfclarify",
    "rfd",
    "rfd-redundant",
    "rfd-sense",
    "rfdate",
    "rfdatek",
    "rfdef",
    "rfe",
    "rfe/dowork",
    "rfex",
    "rfexp",
    "rfform",
    "rfgender",
    "rfi",
    "rfinfl",
    "rfm",
    "rfm-sense",
    "rfp",
    "rfp-old",
    "rfquote",
    "rfquote-sense",
    "rfquotek",
    "rfref",
    "rfscript",
    "rft2",
    "rftaxon",
    "rftone",
    "rftranslit",
    "rfv",
    "rfv-etym",
    "rfv-pron",
    "rfv-quote",
    "rfv-sense",
    "selfref",
    "split",
    "stroke order",  # XXX consider capturing this?
    "stub entry",
    "t-needed",
    "tbot entry",
    "tea room",
    "tea room sense",
    # "ttbc", - XXX needed in at least on/Preposition/Translation page
    "unblock",
    "unsupportedpage",
    "video frames",
    "was wotd",
    "wrongtitle",
    "zh-forms",
    "zh-hanzi-box",
    "no entry",
}
# Lookup table for the tags of Chinese dialectal synonyms
zh_tag_lookup: dict[str, list[str]] = {
    "Formal": ["formal"],
    "Written-Standard-Chinese": ["Standard-Chinese"],
    "historical or Internet slang": ["historical", "internet-slang"],
    "now usually derogatory or offensive": ["offensive", "derogatory"],
    "lofty": [],
}
# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES: set[str] = {
    "list:compass points/",
    "list:Gregorian calendar months/",
    "RQ:",
}
# Templates used for wikipedia links.
wikipedia_templates: set[str] = {
    "wikipedia",
    "slim-wikipedia",
    "w",
    "W",
    "swp",
    "wiki",
    "Wikipedia",
    "wtorw",
}
# Import-time sanity check: warn about any name that is listed both as a
# panel template/prefix and as a Wikipedia-link template.  The warning text
# refers to the panel templates, so PANEL_TEMPLATES is checked in addition to
# PANEL_PREFIXES (previously only the prefixes were compared).
for x in (PANEL_TEMPLATES | PANEL_PREFIXES) & wikipedia_templates:
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )
# Mapping from a template name (without language prefix) for the main word
# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
# it could validly occur.  This is used as just a sanity check to give
# warnings about probably incorrect coding in Wiktionary.
template_allowed_pos_map: dict[str, list[str]] = {
    "abbr": ["abbrev"],
    "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
    "plural noun": ["noun", "name"],
    "plural-noun": ["noun", "name"],
    "proper noun": ["noun", "name"],
    "proper-noun": ["name", "noun"],
    "prop": ["name", "noun"],
    "verb": ["verb", "phrase"],
    "gerund": ["verb"],
    "particle": ["adv", "particle"],
    "adj": ["adj", "adj_noun"],
    "pron": ["pron", "noun"],
    "name": ["name", "noun"],
    "adv": ["adv", "intj", "conj", "particle"],
    "phrase": ["phrase", "prep_phrase"],
    "noun phrase": ["phrase"],
    "ordinal": ["num"],
    "number": ["num"],
    "pos": ["affix", "name", "num"],
    "suffix": ["suffix", "affix"],
    "character": ["character"],
    "letter": ["character"],
    "kanji": ["character"],
    "cont": ["abbrev"],
    "interj": ["intj"],
    "con": ["conj"],
    "part": ["particle"],
    "prep": ["prep", "postp"],
    "postp": ["postp"],
    "misspelling": ["noun", "adj", "verb", "adv"],
    "part-form": ["verb"],
}
# Import-time validation: every part-of-speech named in
# template_allowed_pos_map must be a key/member of PARTS_OF_SPEECH.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            msg = (
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
            print(msg)
            # Raise explicitly instead of `assert False`: assert statements
            # are removed when Python runs with -O, which would let a bad
            # table pass silently.
            raise AssertionError(msg)
# Templates ignored during etymology extraction, i.e., these will not be listed
# in the extracted etymology templates.
ignored_etymology_templates: list[str] = [
    "...",
    "IPAchar",
    "ipachar",
    "ISBN",
    "isValidPageName",
    "redlink category",
    "deprecated code",
    "check deprecated lang param usage",
    "para",
    "p",
    "cite",
    "Cite news",
    "Cite newsgroup",
    "cite paper",
    "cite MLLM 1976",
    "cite journal",
    "cite news/documentation",
    "cite paper/documentation",
    "cite video game",
    "cite video game/documentation",
    "cite newsgroup",
    "cite newsgroup/documentation",
    "cite web/documentation",
    "cite news",
    "Cite book",
    "Cite-book",
    "cite book",
    "cite web",
    "cite-usenet",
    "cite-video/documentation",
    "Cite-journal",
    "rfe",
    "catlangname",
    "cln",
    "langname-lite",
    "no deprecated lang param usage",
    "mention",
    "m",
    "m-self",
    "link",
    "l",
    "ll",
    "l-self",
]
# Regexp for matching ignored etymology template names.  In addition to the
# exact names listed above, it matches any name starting with one of the
# prefixes "cite-", "R:" or "RQ:".
ignored_etymology_templates_re = re.compile(
    r"^((cite-|R:|RQ:).*|"
    + r"|".join(re.escape(x) for x in ignored_etymology_templates)
    + r")$"
)

# Regexp for matching ignored descendants template names.  Right now we just
# reuse the ignored etymology templates regexp.
ignored_descendants_templates_re = ignored_etymology_templates_re
# Set of template names that are used to define usage examples.  If the usage
# example contains one of these templates, then its type is set to
# "example"
usex_templates: set[str] = {
    "afex",
    "affixusex",
    "co",  # {{collocation}} acts like an example template, specifically for
    # pairs of combinations of words that are more common than you'd
    # expect at random; see hlavní#Czech
    "coi",
    "collocation",
    "el-example",
    "el-x",
    "example",
    "examples",
    "he-usex",
    "he-x",
    "hi-usex",
    "hi-x",
    "ja-usex-inline",
    "ja-usex",
    "ja-x",
    "jbo-example",
    "jbo-x",
    "km-usex",
    "km-x",
    "ko-usex",
    "ko-x",
    "lo-usex",
    "lo-x",
    "ne-x",
    "ne-usex",
    "prefixusex",
    "ryu-usex",
    "ryu-x",
    "shn-usex",
    "shn-x",
    "suffixusex",
    "th-usex",
    "th-x",
    "ur-usex",
    "ur-x",
    "usex",
    "usex-suffix",
    "ux",
    "uxi",
}
# Category-style template names.
# NOTE(review): judging by the name, collecting the word head presumably
# stops when one of these templates is met -- the code using this set is
# not in view here; confirm at the use site.
stop_head_at_these_templates: set[str] = {
    "category",
    "cat",
    "topics",
    "catlangname",
    "c",
    "C",
    "top",
    "cln",
}
# Set of template names that are used to define quotation examples.  If the
# usage example contains one of these templates, then its type is set to
# "quotation".
quotation_templates: set[str] = {
    "collapse-quote",
    "quote-av",
    "quote-book",
    "quote-GYLD",
    "quote-hansard",
    "quotei",
    "quote-journal",
    "quotelite",
    "quote-mailing list",
    "quote-meta",
    "quote-newsgroup",
    "quote-song",
    "quote-text",
    "quote",
    "quote-us-patent",
    "quote-video game",
    "quote-web",
    "quote-wikipedia",
    "wikiquote",
    "Wikiquote",
}
# Templates for taxonomic names.
taxonomy_templates: set[str] = {
    # argument 1 should be the taxonomic name, e.g. "Lupus lupus"
    "taxfmt",
    "taxlink",
    "taxlink2",
    "taxlinknew",
    "taxlook",
}
698# Template name component to linkage section listing. Integer section means
699# default section, starting at that argument.
700# XXX not used anymore, except for the first elements: moved to
701# template_linkages
702# template_linkage_mappings: list[list[Union[str, int]]] = [
703# ["syn", "synonyms"],
704# ["synonyms", "synonyms"],
705# ["ant", "antonyms"],
706# ["antonyms", "antonyms"],
707# ["hyp", "hyponyms"],
708# ["hyponyms", "hyponyms"],
709# ["der", "derived"],
710# ["derived terms", "derived"],
711# ["coordinate terms", "coordinate_terms"],
712# ["rel", "related"],
713# ["col", 2],
714# ]
# Template names; this set was extracted from template_linkage_mappings,
# because the code using template_linkage_mappings was actually not used
# (but not removed).
template_linkages: set[str] = {
    "syn",
    "synonyms",
    "ant",
    "antonyms",
    "hyp",
    "hyponyms",
    "der",
    "derived terms",
    "coordinate terms",
    "rel",
    "col",
}
# Maps template name used in a word sense to a linkage field that it adds.
sense_linkage_templates: dict[str, str] = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "hyp": "hyponyms",
    "hyponyms": "hyponyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
}
def decode_html_entities(v: Union[str, int]) -> str:
    """Decode HTML entities in ``v`` into the respective Unicode text.

    Integers are returned as their ``str()`` form; strings are passed
    through ``html.unescape``.
    """
    if isinstance(v, int):
        # html.unescape only acts on entity escapes (&xx;), which an integer
        # cannot contain, so converting to str is all that is needed here.
        return str(v)
    return html.unescape(v)
def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
) -> None:
    """Parse a linkage (synonym, etc.) template specified in a word sense.

    ``name`` selects, through ``sense_linkage_templates``, the field of
    ``data`` that the linkage entries are appended to (a ``KeyError`` is
    raised for a name not in that table).  Positional arguments 2..19 of
    ``ht`` hold the linked words; processing stops at the first argument
    that is empty after cleaning.  For the word in argument ``i``, ``q<i-1>``
    is an optional qualifier and ``t<i-1>`` an optional English translation.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    for i in range(2, 20):
        w = clean_node(wxr, data, ht.get(i) or "")
        # Drop a leading Thesaurus-namespace prefix (first alias that matches)
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                w = w[len(alias) :]
                break
        if not w:
            break
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # english is always None here, so there is nothing to
                # append to yet.
                english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt at its end
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        dt = {"word": w}
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)
# Separator between the parts of an example line: a run of "―" and/or "—"
# characters with optional whitespace on both sides.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# Same pattern inside a capturing group, so that split() keeps the separators.
captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """Split an example line using the template arguments as a guide.

    If it looks like there's something weird with how a line of example
    text has been split, this function redoes the splitting after counting
    occurrences of the splitting regex inside the two main template
    arguments: the original-language example (argument 2) and the English
    translation (argument 3, ``translation`` or ``t``).  Separators that
    occur inside those arguments are kept as part of the text.

    Returns ``[original, translation, *remaining_parts]``, or ``None`` when
    either of the first two parts would be empty.
    """
    # Split with a capturing group so the original separator text is kept:
    # ["First", " ― ", "second", " ― ", "third..."]
    fparts = captured_splitters_re.split(line)
    # Number of fparts items belonging to the original-language text: one
    # text item plus (separator, text) for each separator inside argument 2.
    first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
    original = "".join(fparts[:first])
    # Translation argument
    tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
    # +2 = + 1 to skip the "expected" separator, + 1 as the `1 +` above.
    second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
    translation = "".join(fparts[first + 1 : second])

    if not (original and translation):
        return None
    # Keep the remaining text items and skip the separators between them.
    return [original, translation, *fparts[second + 1 :: 2]]
# Matches a parenthesized qualifier at the start of a string, optionally
# followed by a colon and whitespace; one level of nested parentheses is
# allowed inside:  (...): ... or (...(...)...): ...
QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
QUALIFIERS_RE = re.compile(QUALIFIERS)
860def parse_language(
861 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
862) -> list[WordData]:
863 """Iterates over the text of the page, returning words (parts-of-speech)
864 defined on the page one at a time. (Individual word senses for the
865 same part-of-speech are typically encoded in the same entry.)"""
866 # imported here to avoid circular import
867 from .pronunciation import parse_pronunciation
869 assert isinstance(wxr, WiktextractContext)
870 assert isinstance(langnode, WikiNode)
871 assert isinstance(language, str)
872 assert isinstance(lang_code, str)
873 # print("parse_language", language)
875 is_reconstruction = False
876 word: str = wxr.wtp.title # type: ignore[assignment]
877 unsupported_prefix = "Unsupported titles/"
878 if word.startswith(unsupported_prefix):
879 w = word[len(unsupported_prefix) :]
880 if w in unsupported_title_map: 880 ↛ 883line 880 didn't jump to line 883 because the condition on line 880 was always true
881 word = unsupported_title_map[w]
882 else:
883 wxr.wtp.error(
884 "Unimplemented unsupported title: {}".format(word),
885 sortid="page/870",
886 )
887 word = w
888 elif word.startswith("Reconstruction:"): 888 ↛ 889line 888 didn't jump to line 889 because the condition on line 888 was never true
889 word = word[word.find("/") + 1 :]
890 is_reconstruction = True
892 base_data: WordData = {
893 "word": word,
894 "lang": language,
895 "lang_code": lang_code,
896 }
897 if is_reconstruction: 897 ↛ 898line 897 didn't jump to line 898 because the condition on line 897 was never true
898 data_append(base_data, "tags", "reconstruction")
899 sense_data: SenseData = {}
900 pos_data: WordData = {} # For a current part-of-speech
901 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
902 etym_data: WordData = {} # For one etymology
903 pos_datas: list[SenseData] = []
904 level_four_datas: list[WordData] = []
905 etym_datas: list[WordData] = []
906 page_datas: list[WordData] = []
907 have_etym = False
908 inside_level_four = False # This is for checking if the etymology section
909 # or article has a Pronunciation section, for Chinese mostly; because
910 # Chinese articles can have three level three sections (two etymology
911 # sections and pronunciation sections) one after another, we need a kludge
912 # to better keep track of whether we're in a normal "etym" or inside a
913 # "level four" (which is what we've turned the level three Pron sections
914 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
915 # a step.
916 stack: list[str] = [] # names of items on the "stack"
918 def merge_base(data: WordData, base: WordData) -> None:
919 for k, v in base.items():
920 # Copy the value to ensure that we don't share lists or
921 # dicts between structures (even nested ones).
922 v = copy.deepcopy(v)
923 if k not in data:
924 # The list was copied above, so this will not create shared ref
925 data[k] = v # type: ignore[literal-required]
926 continue
927 if data[k] == v: # type: ignore[literal-required] 927 ↛ 929line 927 didn't jump to line 929 because the condition on line 927 was always true
928 continue
929 if (
930 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
931 or isinstance(
932 v,
933 (list, tuple), # Should this be "and"?
934 )
935 ):
936 data[k] = list(data[k]) + list(v) # type: ignore
937 elif data[k] != v: # type: ignore[literal-required]
938 wxr.wtp.warning(
939 "conflicting values for {} in merge_base: "
940 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
941 sortid="page/904",
942 )
944 def complementary_pop(pron: SoundData, key: str) -> SoundData:
945 """Remove unnecessary keys from dict values
946 in a list comprehension..."""
947 if key in pron:
948 pron.pop(key) # type: ignore
949 return pron
951 # If the result has sounds, eliminate sounds that have a prefix that
952 # does not match "word" or one of "forms"
953 if "sounds" in data and "word" in data: 953 ↛ 954line 953 didn't jump to line 954 because the condition on line 953 was never true
954 accepted = [data["word"]]
955 accepted.extend(f["form"] for f in data.get("forms", dict()))
956 data["sounds"] = list(
957 s
958 for s in data["sounds"]
959 if "form" not in s or s["form"] in accepted
960 )
961 # If the result has sounds, eliminate sounds that have a pos that
962 # does not match "pos"
963 if "sounds" in data and "pos" in data: 963 ↛ 964line 963 didn't jump to line 964 because the condition on line 963 was never true
964 data["sounds"] = list(
965 complementary_pop(s, "pos")
966 for s in data["sounds"]
967 # "pos" is not a field of SoundData, correctly, so we're
968 # removing it here. It's a kludge on a kludge on a kludge.
969 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
970 )
972 def push_sense() -> bool:
973 """Starts collecting data for a new word sense. This returns True
974 if a sense was added."""
975 nonlocal sense_data
976 tags = sense_data.get("tags", ())
977 if (
978 not sense_data.get("glosses")
979 and "translation-hub" not in tags
980 and "no-gloss" not in tags
981 ):
982 return False
984 if ( 984 ↛ 994line 984 didn't jump to line 994 because the condition on line 984 was never true
985 (
986 "participle" in sense_data.get("tags", ())
987 or "infinitive" in sense_data.get("tags", ())
988 )
989 and "alt_of" not in sense_data
990 and "form_of" not in sense_data
991 and "etymology_text" in etym_data
992 and etym_data["etymology_text"] != ""
993 ):
994 etym = etym_data["etymology_text"]
995 etym = etym.split(". ")[0]
996 ret = parse_alt_or_inflection_of(wxr, etym, set())
997 if ret is not None:
998 tags, lst = ret
999 assert isinstance(lst, (list, tuple))
1000 if "form-of" in tags:
1001 data_extend(sense_data, "form_of", lst)
1002 data_extend(sense_data, "tags", tags)
1003 elif "alt-of" in tags:
1004 data_extend(sense_data, "alt_of", lst)
1005 data_extend(sense_data, "tags", tags)
1007 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1007 ↛ 1010line 1007 didn't jump to line 1010 because the condition on line 1007 was never true
1008 "tags", ()
1009 ):
1010 data_append(sense_data, "tags", "no-gloss")
1012 pos_datas.append(sense_data)
1013 sense_data = {}
1014 return True
1016 def push_pos() -> None:
1017 """Starts collecting data for a new part-of-speech."""
1018 nonlocal pos_data
1019 nonlocal pos_datas
1020 push_sense()
1021 if wxr.wtp.subsection:
1022 data: WordData = {"senses": pos_datas}
1023 merge_base(data, pos_data)
1024 level_four_datas.append(data)
1025 pos_data = {}
1026 pos_datas = []
1027 wxr.wtp.start_subsection(None)
1029 def push_level_four_section() -> None:
1030 """Starts collecting data for a new level four sections, which
1031 is usually virtual and empty, unless the article has Chinese
1032 'Pronunciation' sections that are etymology-section-like but
1033 under etymology, and at the same level in the source. We modify
1034 the source to demote Pronunciation sections like that to level
1035 4, and other sections one step lower."""
1036 nonlocal level_four_data
1037 nonlocal level_four_datas
1038 nonlocal etym_datas
1039 push_pos()
1040 # print(f"======\n{etym_data=}")
1041 # print(f"======\n{etym_datas=}")
1042 # print(f"======\n{level_four_data=}")
1043 # print(f"======\n{level_four_datas=}")
1044 for data in level_four_datas:
1045 merge_base(data, level_four_data)
1046 etym_datas.append(data)
1047 for data in etym_datas:
1048 merge_base(data, etym_data)
1049 page_datas.append(data)
1050 level_four_data = {}
1051 level_four_datas = []
1052 etym_datas = []
1054 def push_etym() -> None:
1055 """Starts collecting data for a new etymology."""
1056 nonlocal etym_data
1057 nonlocal etym_datas
1058 nonlocal have_etym
1059 nonlocal inside_level_four
1060 have_etym = True
1061 push_level_four_section()
1062 inside_level_four = False
1063 etym_data = {}
1065 def select_data() -> WordData:
1066 """Selects where to store data (pos or etym) based on whether we
1067 are inside a pos (part-of-speech)."""
1068 # print(f"{wxr.wtp.subsection=}")
1069 # print(f"{stack=}")
1070 if wxr.wtp.subsection is not None: 1070 ↛ 1072line 1070 didn't jump to line 1072 because the condition on line 1070 was always true
1071 return pos_data
1072 if stack[-1] == language:
1073 return base_data
1074 if inside_level_four is False:
1075 return etym_data
1076 return level_four_data
1078 term_label_templates: list[TemplateData] = []
1080 def head_post_template_fn(
1081 name: str, ht: TemplateArgs, expansion: str
1082 ) -> Optional[str]:
1083 """Handles special templates in the head section of a word. Head
1084 section is the text after part-of-speech subtitle and before word
1085 sense list. Typically it generates the bold line for the word, but
1086 may also contain other useful information that often ends in
1087 side boxes. We want to capture some of that additional information."""
1088 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1089 if is_panel_template(wxr, name): 1089 ↛ 1092line 1089 didn't jump to line 1092 because the condition on line 1089 was never true
1090 # Completely ignore these templates (not even recorded in
1091 # head_templates)
1092 return ""
1093 if name == "head":
1094 # XXX are these also captured in forms? Should this special case
1095 # be removed?
1096 t = ht.get(2, "")
1097 if t == "pinyin": 1097 ↛ 1098line 1097 didn't jump to line 1098 because the condition on line 1097 was never true
1098 data_append(pos_data, "tags", "Pinyin")
1099 elif t == "romanization": 1099 ↛ 1100line 1099 didn't jump to line 1100 because the condition on line 1099 was never true
1100 data_append(pos_data, "tags", "romanization")
1101 if ( 1101 ↛ 1122line 1101 didn't jump to line 1122 because the condition on line 1101 was always true
1102 HEAD_TAG_RE.fullmatch(name) is not None
1103 or name in WORD_LEVEL_HEAD_TEMPLATES
1104 ):
1105 args_ht = clean_template_args(wxr, ht)
1106 cleaned_expansion = clean_node(wxr, None, expansion)
1107 dt: TemplateData = {
1108 "name": name,
1109 "args": args_ht,
1110 "expansion": cleaned_expansion,
1111 }
1112 data_append(pos_data, "head_templates", dt)
1113 if name in WORD_LEVEL_HEAD_TEMPLATES:
1114 term_label_templates.append(dt)
1115 # Squash these, their tags are applied to the whole word,
1116 # and some cause problems like "term-label"
1117 return ""
1119 # The following are both captured in head_templates and parsed
1120 # separately
1122 if name in wikipedia_templates: 1122 ↛ 1125line 1122 didn't jump to line 1125 because the condition on line 1122 was never true
1123 # Note: various places expect to have content from wikipedia
1124 # templates, so cannot convert this to empty
1125 parse_wikipedia_template(wxr, pos_data, ht)
1126 return None
1128 if name == "number box": 1128 ↛ 1130line 1128 didn't jump to line 1130 because the condition on line 1128 was never true
1129 # XXX extract numeric value?
1130 return ""
1131 if name == "enum": 1131 ↛ 1133line 1131 didn't jump to line 1133 because the condition on line 1131 was never true
1132 # XXX extract?
1133 return ""
1134 if name == "cardinalbox": 1134 ↛ 1137line 1134 didn't jump to line 1137 because the condition on line 1134 was never true
1135 # XXX extract similar to enum?
1136 # XXX this can also occur in top-level under language
1137 return ""
1138 if name == "Han simplified forms": 1138 ↛ 1140line 1138 didn't jump to line 1140 because the condition on line 1138 was never true
1139 # XXX extract?
1140 return ""
1141 # if name == "ja-kanji forms":
1142 # # XXX extract?
1143 # return ""
1144 # if name == "vi-readings":
1145 # # XXX extract?
1146 # return ""
1147 # if name == "ja-kanji":
1148 # # XXX extract?
1149 # return ""
1150 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 1150 ↛ 1152line 1150 didn't jump to line 1152 because the condition on line 1150 was never true
1151 # XXX extract?
1152 return ""
1154 return None
def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parse the subsection for a part-of-speech under a language on a page.

    Steps, in order:

    1. Record ``pos`` in ``pos_data`` and pull floating-table templates and
       ``file:`` links out of the section; they are handed to
       ``parse_inflection`` wrapped in a synthetic "Inflection" node.
    2. Scan the remaining children, grouping head material into ``pre`` and
       the sense lists that follow each head into the parallel ``lists``.
    3. Drop empty/whitespace-only pairs and re-attach a head-less list to
       the most recent list-less head (wiktextract issue #314).
    4. Process each head with ``process_gloss_header`` and each list item
       with ``parse_sense_node``; sections without any list go through
       ``process_gloss_without_list`` instead.
    5. Push any unfinished sense; if ``pos_datas`` is still empty, push a
       dummy sense tagged "no-gloss" that keeps the head tags/topics.

    Args:
        posnode: The level node of the part-of-speech section.
        pos: The part-of-speech name stored into ``pos_data["pos"]``.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    # print("parse_part_of_speech", pos)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    # XXX bookmark
    # print("===================")
    # print(posnode.children)

    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and (
                (
                    x.kind == NodeKind.TEMPLATE
                    and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
                )
                or (
                    x.kind == NodeKind.LINK
                    # Need to check for stringiness because some links are
                    # broken; for example, if a template is missing an
                    # argument, a link might look like `[[{{{1}}}...]]`
                    and isinstance(x.largs[0][0], str)
                    and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                )
            )
        ),
    )
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)
    # print(poschildren)
    # XXX new above

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    # Last node visited by the scan below; only used for debug messages
    # further down.  Kept in its own variable so that later loops cannot
    # overwrite it.
    last_scanned: Union[str, WikiNode, None] = None

    for node in poschildren:
        last_scanned = node
        if isinstance(node, str):
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                # NOTE: `pre` always holds at least one (possibly empty)
                # group, so this test is effectively only the
                # blank-line check.
                if p.startswith("\n\n") and pre:
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            if len(node.largs[0]) >= 1 and isinstance(
                node.largs[0][0], str
            ):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].extend(node.largs[-1])
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.
            # There's head_tag_re that seems like a regex meant
            # to identify head templates. Too bad it's None.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # skip these templates; panel_templates is already used
            # to skip certain templates else, but it also applies to
            # head parsing quite well.
            # node.largs[0][0] should always be str, but can't type-check
            # that.
            if is_panel_template(wxr, node.template_name):
                continue
            # skip these templates
            # if node.largs[0][0] in skip_these_templates_in_head:
            #     first_head_tmplt = False  # no first_head_tmplt at all
            #     start_of_paragraph = False
            #     continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not. Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists wiktextract issue #314

    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    # Index (into the cleaned lists) of the latest head that was kept
    # without any sense list, or None if there is no such head.
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1 and pairless_pre_index is not None:
            # A head-less list following a list-less head: pair them.
            cleaned_lists[pairless_pre_index] = ls
            pairless_pre_index = None
            continue
        if not ls:
            # Only remember heads that are actually kept; recording the
            # index before the skip checks above could leave it pointing
            # at a slot that is never appended.
            pairless_pre_index = len(cleaned_pre)
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []
    header_topics: list[str] = []
    previous_head_had_list = False

    if not any(lists):
        process_gloss_without_list(
            poschildren, pos, pos_data, header_tags, header_topics
        )
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            # if len(ls) == 0:
            #     # don't have gloss list
            #     # XXX add code here to filter out 'garbage', like text
            #     # that isn't a head template or head.
            #     continue

            if not any(lists[i:]):
                # Neither this head nor any later one has a sense list.
                # At least one list is non-empty in this branch, so this
                # can only happen for i >= 1; the first head is therefore
                # always processed.
                if isinstance(last_scanned, str):
                    wxr.wtp.debug(
                        "later head without list of senses, "
                        "string: '{}[...]', {}/{}".format(
                            last_scanned[:20], word, language
                        ),
                        sortid="page/1708/20221215",
                    )
                elif isinstance(last_scanned, WikiNode):
                    wxr.wtp.debug(
                        "later head without list of senses, "
                        "template node "
                        "{}, {}/{}".format(
                            last_scanned.sarg
                            if last_scanned.sarg
                            else last_scanned.largs,
                            word,
                            language,
                        ),
                        sortid="page/1713/20221215",
                    )
                else:
                    wxr.wtp.debug(
                        "later head without list of senses, "
                        "{}/{}".format(word, language),
                        sortid="page/1719/20221215",
                    )
                break
            head_group = i + 1 if there_are_many_heads else None
            # print("parse_part_of_speech: {}: {}: pre={}"
            #       .format(wxr.wtp.section, wxr.wtp.subsection, pre1))

            if previous_head_had_list:
                # We use a boolean flag here because we want to be able
                # let the header_tags data pass through after the loop
                # is over without accidentally emptying it, if there are
                # no pos_datas and we need a dummy data.
                header_tags.clear()
                header_topics.clear()

            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags, header_topics
            )
            for ln in ls:
                # Parse each list associated with this head.
                for item_node in ln.children:
                    # Parse nodes in ln.children recursively.
                    # The recursion function uses push_sense() to
                    # add stuff into pos_data, and returns True or
                    # False if something is added, which bubbles upward.
                    # If the bubble is "True", then higher levels of
                    # the recursion will not push_sense(), because
                    # the data is already pushed into a sub-gloss
                    # downstream, unless the higher level has examples
                    # that need to be put somewhere.
                    common_data: SenseData = {
                        "tags": list(header_tags),
                        "topics": list(header_topics),
                    }
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(item_node, common_data, pos)  # type: ignore[arg-type]

            previous_head_had_list = len(ls) > 0

    # If there are no senses extracted, add a dummy sense. We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(pos_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_extend(sense_data, "topics", header_topics)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()
def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Process the nodes that make up one word head.

    Info templates found among ``header_nodes`` are recorded under
    ``pos_data["info_templates"]``. The remaining nodes are cleaned into
    head text and passed to ``parse_word_head``. Any ``"tags"`` left in
    ``pos_data`` afterwards are moved into ``header_tags``, and tags/topics
    decoded from the collected term-label templates are appended to
    ``header_tags``/``header_topics``.

    Args:
        header_nodes: Nodes belonging to this head.
        pos_type: Part-of-speech passed through to ``parse_word_head``.
        header_group: Head number when the section has several heads,
            otherwise ``None``.
        pos_data: Word data being filled in; modified in place.
        header_tags: Output list of tags for the head; extended in place.
        header_topics: Output list of topics for the head; extended in
            place.
    """
    ruby = []
    links: list[str] = []

    # process template parse nodes here
    new_nodes = []
    info_template_data = []
    for node in header_nodes:
        # print(f"{node=}")
        info_data, info_out = parse_info_template_node(wxr, node, "head")
        if not (info_data or info_out):
            # Not an info template: keep the node untouched.
            new_nodes.append(node)
            continue
        if info_data:
            info_template_data.append(info_data)
        if info_out:  # including just the original node
            new_nodes.append(info_out)
    header_nodes = new_nodes

    if info_template_data:
        if "info_templates" not in pos_data:
            pos_data["info_templates"] = info_template_data
        else:
            pos_data["info_templates"].extend(info_template_data)

    if not word.isalnum():
        # if the word contains non-letter or -number characters, it might
        # have something that messes with split-at-semi-comma; we collect
        # links so that we can skip splitting them.
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        link_nodes, _ = recursively_extract(
            exp.children,
            lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.LINK,
        )
        for ln in link_nodes:
            ltext = clean_node(wxr, None, ln.largs[-1])  # type: ignore[union-attr]
            if not ltext.isalnum():
                links.append(ltext)
        if word not in links:
            links.append(word)
    if lang_code == "ja":
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            exp.children,
            lambda x: isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "ruby",
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # we know the lambda above in recursively_extract
                    # returns only WikiNodes in rub
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)

    # NOTE(review): this must run before term_label_templates is read
    # below; the post-template hook passed here (presumably the function
    # defined just above this one) is what appends to that list.
    header_text = clean_node(
        wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
    )

    term_label_tags: list[str] = []
    term_label_topics: list[str] = []
    if len(term_label_templates) > 0:
        # parse term label templates; if there are other similar kinds
        # of templates in headers that you want to squash and apply as
        # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
        for templ_data in term_label_templates:
            expan = templ_data.get("expansion", "").strip("().,; ")
            if not expan:
                continue
            tlb_tagsets, tlb_topics = decode_tags(expan)
            for tlb_tags in tlb_tagsets:
                # Skip empty tagsets and those containing error tags.
                if len(tlb_tags) > 0 and not any(
                    t.startswith("error-") for t in tlb_tags
                ):
                    term_label_tags.extend(tlb_tags)
                    term_label_topics.extend(tlb_topics)
            # print(f"{tlb_tagsets=}, {tlb_topicsets=}")

    header_text = re.sub(r"\s+", " ", header_text)
    # print(f"{header_text=}")
    parse_word_head(
        wxr,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        ruby=ruby,
        links=links,
    )
    if "tags" in pos_data:
        # pos_data can get "tags" data from some source; type-checkers
        # don't like it, so let's ignore it.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    if len(term_label_tags) > 0:
        header_tags.extend(term_label_tags)
    if len(term_label_topics) > 0:
        header_topics.extend(term_label_topics)
def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Handle a part-of-speech section whose gloss is not inside a list.

    The nodes are split into head nodes (``{{head}}`` and templates whose
    name starts with ``"<lang_code>-"``) and gloss nodes (everything else
    up to the first level header). Soft-redirect templates are dropped.
    Head nodes go to ``process_gloss_header`` and gloss nodes to
    ``process_gloss_contents``.

    Args:
        nodes: Children of the part-of-speech section.
        pos_type: Part-of-speech passed through to the helpers.
        pos_data: Word data being filled in; modified by the head helper.
        header_tags: Tag list filled by the head helper and copied into
            the sense base.
        header_topics: Topic list filled by the head helper and copied
            into the sense base.
    """
    # gloss text might not be inside a list
    header_nodes: list[Union[str, WikiNode]] = []
    gloss_nodes: list[Union[str, WikiNode]] = []
    for node in strip_nodes(nodes):
        if isinstance(node, WikiNode):
            if isinstance(node, TemplateNode):
                if node.template_name in (
                    "zh-see",
                    "ja-see",
                    "ja-see-kango",
                ):
                    continue  # soft redirect
                if (
                    node.template_name == "head"
                    or node.template_name.startswith(f"{lang_code}-")
                ):
                    header_nodes.append(node)
                    continue
                # Any other template falls through and counts as gloss.
            elif node.kind in LEVEL_KINDS:  # following nodes are not gloss
                break
        gloss_nodes.append(node)

    if header_nodes:
        process_gloss_header(
            header_nodes,
            pos_type,
            None,
            pos_data,
            header_tags,
            header_topics,
        )
    if gloss_nodes:
        # Pass fresh containers explicitly: the defaults of
        # process_gloss_contents are mutable objects shared between calls,
        # and its gloss_template_args set is added to during processing,
        # so relying on the defaults would leak entries between calls.
        process_gloss_contents(
            gloss_nodes,
            pos_type,
            {"tags": list(header_tags), "topics": list(header_topics)},
            subentries=[],
            others=[],
            gloss_template_args=set(),
        )
1648 def parse_sense_node(
1649 node: Union[str, WikiNode], # never receives str
1650 sense_base: SenseData,
1651 pos: str,
1652 ) -> bool:
1653 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1654 Uses push_sense() to attempt adding data to pos_data in the scope
1655 of parse_language() when it reaches deep in the recursion. push_sense()
1656 returns True if it succeeds, and that is bubbled up the stack; if
1657 a sense was added downstream, the higher levels (whose shared data
1658 was already added by a subsense) do not push_sense(), unless it
1659 has examples that need to be put somewhere.
1660 """
1661 assert isinstance(sense_base, dict) # Added to every sense deeper in
1662 if not isinstance(node, WikiNode): 1662 ↛ 1664line 1662 didn't jump to line 1664 because the condition on line 1662 was never true
1663 # This doesn't seem to ever happen in practice.
1664 wxr.wtp.debug(
1665 "{}: parse_sense_node called with"
1666 "something that isn't a WikiNode".format(pos),
1667 sortid="page/1287/20230119",
1668 )
1669 return False
1671 if node.kind != NodeKind.LIST_ITEM: 1671 ↛ 1672line 1671 didn't jump to line 1672 because the condition on line 1671 was never true
1672 wxr.wtp.debug(
1673 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1674 )
1675 return False
1677 if node.sarg == ":": 1677 ↛ 1683line 1677 didn't jump to line 1683 because the condition on line 1677 was never true
1678 # Skip example entries at the highest level, ones without
1679 # a sense ("...#") above them.
1680 # If node.sarg is exactly and only ":", then it's at
1681 # the highest level; lower levels would have more
1682 # "indentation", like "#:" or "##:"
1683 return False
1685 # If a recursion call succeeds in push_sense(), bubble it up with
1686 # `added`.
1687 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1688 added = False
1690 gloss_template_args: set[str] = set()
1692 # For LISTs and LIST_ITEMS, their argument is something like
1693 # "##" or "##:", and using that we can rudimentally determine
1694 # list 'depth' if need be, and also what kind of list or
1695 # entry it is; # is for normal glosses, : for examples (indent)
1696 # and * is used for quotations on wiktionary.
1697 current_depth = node.sarg
1699 children = node.children
1701 # subentries, (presumably) a list
1702 # of subglosses below this. The list's
1703 # argument ends with #, and its depth should
1704 # be bigger than parent node.
1705 subentries = [
1706 x
1707 for x in children
1708 if isinstance(x, WikiNode)
1709 and x.kind == NodeKind.LIST
1710 and x.sarg == current_depth + "#"
1711 ]
1713 # sublists of examples and quotations. .sarg
1714 # does not end with "#".
1715 others = [
1716 x
1717 for x in children
1718 if isinstance(x, WikiNode)
1719 and x.kind == NodeKind.LIST
1720 and x.sarg != current_depth + "#"
1721 ]
1723 # the actual contents of this particular node.
1724 # can be a gloss (or a template that expands into
1725 # many glosses which we can't easily pre-expand)
1726 # or could be an "outer gloss" with more specific
1727 # subglosses, or could be a qualfier for the subglosses.
1728 contents = [
1729 x
1730 for x in children
1731 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1732 ]
1733 # If this entry has sublists of entries, we should combine
1734 # gloss information from both the "outer" and sublist content.
1735 # Sometimes the outer gloss
1736 # is more non-gloss or tags, sometimes it is a coarse sense
1737 # and the inner glosses are more specific. The outer one
1738 # does not seem to have qualifiers.
1740 # If we have one sublist with one element, treat it
1741 # specially as it may be a Wiktionary error; raise
1742 # that nested element to the same level.
1743 # XXX If need be, this block can be easily removed in
1744 # the current recursive logicand the result is one sense entry
1745 # with both glosses in the glosses list, as you would
1746 # expect. If the higher entry has examples, there will
1747 # be a higher entry with some duplicated data.
1748 if len(subentries) == 1:
1749 slc = subentries[0].children
1750 if len(slc) == 1: 1750 ↛ 1753line 1750 didn't jump to line 1753 because the condition on line 1750 was never true
1751 # copy current node and modify it so it doesn't
1752 # loop infinitely.
1753 cropped_node = copy.copy(node)
1754 cropped_node.children = [
1755 x
1756 for x in children
1757 if not (
1758 isinstance(x, WikiNode)
1759 and x.kind == NodeKind.LIST
1760 and x.sarg == current_depth + "#"
1761 )
1762 ]
1763 added |= parse_sense_node(cropped_node, sense_base, pos)
1764 nonlocal sense_data # this kludge causes duplicated raw_
1765 # glosses data if this is not done;
1766 # if the top-level (cropped_node)
1767 # does not push_sense() properly or
1768 # parse_sense_node() returns early,
1769 # sense_data is not reset. This happens
1770 # for example when you have a no-gloss
1771 # string like "(intransitive)":
1772 # no gloss, push_sense() returns early
1773 # and sense_data has duplicate data with
1774 # sense_base
1775 sense_data = {}
1776 added |= parse_sense_node(slc[0], sense_base, pos)
1777 return added
1779 return process_gloss_contents(
1780 contents,
1781 pos,
1782 sense_base,
1783 subentries,
1784 others,
1785 gloss_template_args,
1786 added,
1787 )
1789 def process_gloss_contents(
1790 contents: list[Union[str, WikiNode]],
1791 pos: str,
1792 sense_base: SenseData,
1793 subentries: list[WikiNode] = [],
1794 others: list[WikiNode] = [],
1795 gloss_template_args: Set[str] = set(),
1796 added: bool = False,
1797 ) -> bool:
1798 def sense_template_fn(
1799 name: str, ht: TemplateArgs, is_gloss: bool = False
1800 ) -> Optional[str]:
1801 # print(f"sense_template_fn: {name}, {ht}")
1802 if name in wikipedia_templates: 1802 ↛ 1804line 1802 didn't jump to line 1804 because the condition on line 1802 was never true
1803 # parse_wikipedia_template(wxr, pos_data, ht)
1804 return None
1805 if is_panel_template(wxr, name): 1805 ↛ 1806line 1805 didn't jump to line 1806 because the condition on line 1805 was never true
1806 return ""
1807 if name in INFO_TEMPLATE_FUNCS:
1808 info_data, info_exp = parse_info_template_arguments(
1809 wxr, name, ht, "sense"
1810 )
1811 if info_data or info_exp: 1811 ↛ 1817line 1811 didn't jump to line 1817 because the condition on line 1811 was always true
1812 if info_data: 1812 ↛ 1814line 1812 didn't jump to line 1814 because the condition on line 1812 was always true
1813 data_append(sense_base, "info_templates", info_data)
1814 if info_exp and isinstance(info_exp, str): 1814 ↛ 1816line 1814 didn't jump to line 1816 because the condition on line 1814 was always true
1815 return info_exp
1816 return ""
1817 if name in ("defdate",): 1817 ↛ 1818line 1817 didn't jump to line 1818 because the condition on line 1817 was never true
1818 return ""
1819 if name == "senseid": 1819 ↛ 1820line 1819 didn't jump to line 1820 because the condition on line 1819 was never true
1820 langid = clean_node(wxr, None, ht.get(1, ()))
1821 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1822 if re.match(r"Q\d+$", arg):
1823 data_append(sense_base, "wikidata", arg)
1824 data_append(sense_base, "senseid", langid + ":" + arg)
1825 if name in sense_linkage_templates: 1825 ↛ 1827line 1825 didn't jump to line 1827 because the condition on line 1825 was never true
1826 # print(f"SENSE_TEMPLATE_FN: {name}")
1827 parse_sense_linkage(wxr, sense_base, name, ht)
1828 return ""
1829 if name == "†" or name == "zh-obsolete": 1829 ↛ 1830line 1829 didn't jump to line 1830 because the condition on line 1829 was never true
1830 data_append(sense_base, "tags", "obsolete")
1831 return ""
1832 if name in {
1833 "ux",
1834 "uxi",
1835 "usex",
1836 "afex",
1837 "prefixusex",
1838 "ko-usex",
1839 "ko-x",
1840 "hi-x",
1841 "ja-usex-inline",
1842 "ja-x",
1843 "quotei",
1844 "he-x",
1845 "hi-x",
1846 "km-x",
1847 "ne-x",
1848 "shn-x",
1849 "th-x",
1850 "ur-x",
1851 }:
1852 # Usage examples are captured separately below. We don't
1853 # want to expand them into glosses even when unusual coding
1854 # is used in the entry.
1855 # These templates may slip through inside another item, but
1856 # currently we're separating out example entries (..#:)
1857 # well enough that there seems to be very little contamination.
1858 if is_gloss: 1858 ↛ 1864line 1858 didn't jump to line 1864 because the condition on line 1858 was always true
1859 wxr.wtp.warning(
1860 "Example template is used for gloss text",
1861 sortid="extractor.en.page.sense_template_fn/1415",
1862 )
1863 else:
1864 return ""
1865 if name == "w": 1865 ↛ 1866line 1865 didn't jump to line 1866 because the condition on line 1865 was never true
1866 if ht.get(2) == "Wp":
1867 return ""
1868 for k, v in ht.items():
1869 v = v.strip()
1870 if v and "<" not in v: 1870 ↛ 1868line 1870 didn't jump to line 1868 because the condition on line 1870 was always true
1871 gloss_template_args.add(v)
1872 return None
1874 def extract_link_texts(item: GeneralNode) -> None:
1875 """Recursively extracts link texts from the gloss source. This
1876 information is used to select whether to remove final "." from
1877 form_of/alt_of (e.g., ihm/Hunsrik)."""
1878 if isinstance(item, (list, tuple)):
1879 for x in item:
1880 extract_link_texts(x)
1881 return
1882 if isinstance(item, str):
1883 # There seem to be HTML sections that may further contain
1884 # unparsed links.
1885 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1885 ↛ 1886line 1885 didn't jump to line 1886 because the loop on line 1885 never started
1886 print("ITER:", m.group(0))
1887 v = m.group(1).split("|")[-1].strip()
1888 if v:
1889 gloss_template_args.add(v)
1890 return
1891 if not isinstance(item, WikiNode): 1891 ↛ 1892line 1891 didn't jump to line 1892 because the condition on line 1891 was never true
1892 return
1893 if item.kind == NodeKind.LINK:
1894 v = item.largs[-1]
1895 if ( 1895 ↛ 1901line 1895 didn't jump to line 1901 because the condition on line 1895 was always true
1896 isinstance(v, list)
1897 and len(v) == 1
1898 and isinstance(v[0], str)
1899 ):
1900 gloss_template_args.add(v[0].strip())
1901 for x in item.children:
1902 extract_link_texts(x)
1904 extract_link_texts(contents)
1906 # get the raw text of non-list contents of this node, and other stuff
1907 # like tag and category data added to sense_base
1908 # cast = no-op type-setter for the type-checker
1909 partial_template_fn = cast(
1910 TemplateFnCallable,
1911 partial(sense_template_fn, is_gloss=True),
1912 )
1913 rawgloss = clean_node(
1914 wxr,
1915 sense_base,
1916 contents,
1917 template_fn=partial_template_fn,
1918 collect_links=True,
1919 )
1921 if not rawgloss: 1921 ↛ 1922line 1921 didn't jump to line 1922 because the condition on line 1921 was never true
1922 return False
1924 # remove manually typed ordered list text at the start("1. ")
1925 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
1927 # get stuff like synonyms and categories from "others",
1928 # maybe examples and quotations
1929 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
1931 # The gloss could contain templates that produce more list items.
1932 # This happens commonly with, e.g., {{inflection of|...}}. Split
1933 # to parts. However, e.g. Interlingua generates multiple glosses
1934 # in HTML directly without Wikitext markup, so we must also split
1935 # by just newlines.
1936 subglosses = rawgloss.splitlines()
1938 if len(subglosses) == 0: 1938 ↛ 1939line 1938 didn't jump to line 1939 because the condition on line 1938 was never true
1939 return False
1941 if any(s.startswith("#") for s in subglosses):
1942 subtree = wxr.wtp.parse(rawgloss)
1943 # from wikitextprocessor.parser import print_tree
1944 # print("SUBTREE GENERATED BY TEMPLATE:")
1945 # print_tree(subtree)
1946 new_subentries = [
1947 x
1948 for x in subtree.children
1949 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
1950 ]
1952 new_others = [
1953 x
1954 for x in subtree.children
1955 if isinstance(x, WikiNode)
1956 and x.kind == NodeKind.LIST
1957 and not x.sarg.endswith("#")
1958 ]
1960 new_contents = [
1961 clean_node(wxr, [], x)
1962 for x in subtree.children
1963 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1964 ]
1966 subentries = subentries or new_subentries
1967 others = others or new_others
1968 subglosses = new_contents
1969 rawgloss = "".join(subglosses)
1970 # Generate no gloss for translation hub pages, but add the
1971 # "translation-hub" tag for them
1972 if rawgloss == "(This entry is a translation hub.)": 1972 ↛ 1973line 1972 didn't jump to line 1973 because the condition on line 1972 was never true
1973 data_append(sense_data, "tags", "translation-hub")
1974 return push_sense()
1976 # Remove certain substrings specific to outer glosses
1977 strip_ends = [", particularly:"]
1978 for x in strip_ends:
1979 if rawgloss.endswith(x): 1979 ↛ 1980line 1979 didn't jump to line 1980 because the condition on line 1979 was never true
1980 rawgloss = rawgloss[: -len(x)].strip()
1981 break
1983 # A single gloss, or possibly an outer gloss.
1984 # Check if the possible outer gloss starts with
1985 # parenthesized tags/topics
1987 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 1987 ↛ 1989line 1987 didn't jump to line 1989 because the condition on line 1987 was always true
1988 data_append(sense_base, "raw_glosses", subglosses[0].strip())
1989 m = QUALIFIERS_RE.match(rawgloss)
1990 # (...): ... or (...(...)...): ...
1991 if m:
1992 q = m.group(1)
1993 rawgloss = rawgloss[m.end() :].strip()
1994 parse_sense_qualifier(wxr, q, sense_base)
1995 if rawgloss == "A pejorative:": 1995 ↛ 1996line 1995 didn't jump to line 1996 because the condition on line 1995 was never true
1996 data_append(sense_base, "tags", "pejorative")
1997 rawgloss = ""
1998 elif rawgloss == "Short forms.": 1998 ↛ 1999line 1998 didn't jump to line 1999 because the condition on line 1998 was never true
1999 data_append(sense_base, "tags", "abbreviation")
2000 rawgloss = ""
2001 elif rawgloss == "Technical or specialized senses.": 2001 ↛ 2002line 2001 didn't jump to line 2002 because the condition on line 2001 was never true
2002 rawgloss = ""
2003 elif rawgloss.startswith("inflection of "):
2004 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2005 if parsed is not None: 2005 ↛ 2014line 2005 didn't jump to line 2014 because the condition on line 2005 was always true
2006 tags, origins = parsed
2007 if origins is not None: 2007 ↛ 2009line 2007 didn't jump to line 2009 because the condition on line 2007 was always true
2008 data_extend(sense_base, "form_of", origins)
2009 if tags is not None: 2009 ↛ 2012line 2009 didn't jump to line 2012 because the condition on line 2009 was always true
2010 data_extend(sense_base, "tags", tags)
2011 else:
2012 data_append(sense_base, "tags", "form-of")
2013 else:
2014 data_append(sense_base, "tags", "form-of")
2015 if rawgloss: 2015 ↛ 2046line 2015 didn't jump to line 2046 because the condition on line 2015 was always true
2016 # Code duplicating a lot of clean-up operations from later in
2017 # this block. We want to clean up the "supergloss" as much as
2018 # possible, in almost the same way as a normal gloss.
2019 supergloss = rawgloss
2021 if supergloss.startswith("; "): 2021 ↛ 2022line 2021 didn't jump to line 2022 because the condition on line 2021 was never true
2022 supergloss = supergloss[1:].strip()
2024 if supergloss.startswith(("^†", "†")):
2025 data_append(sense_base, "tags", "obsolete")
2026 supergloss = supergloss[2:].strip()
2027 elif supergloss.startswith("^‡"): 2027 ↛ 2028line 2027 didn't jump to line 2028 because the condition on line 2027 was never true
2028 data_extend(sense_base, "tags", ["obsolete", "historical"])
2029 supergloss = supergloss[2:].strip()
2031 # remove [14th century...] style brackets at the end
2032 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2034 if supergloss.startswith((",", ":")): 2034 ↛ 2035line 2034 didn't jump to line 2035 because the condition on line 2034 was never true
2035 supergloss = supergloss[1:]
2036 supergloss = supergloss.strip()
2037 if supergloss.startswith("N. of "): 2037 ↛ 2038line 2037 didn't jump to line 2038 because the condition on line 2037 was never true
2038 supergloss = "Name of " + supergloss[6:]
2039 supergloss = supergloss[2:]
2040 data_append(sense_base, "glosses", supergloss)
2041 if supergloss in ("A person:",): 2041 ↛ 2042line 2041 didn't jump to line 2042 because the condition on line 2041 was never true
2042 data_append(sense_base, "tags", "g-person")
2044 # The main recursive call (except for the exceptions at the
2045 # start of this function).
2046 for sublist in subentries:
2047 if not ( 2047 ↛ 2050line 2047 didn't jump to line 2050 because the condition on line 2047 was never true
2048 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2049 ):
2050 wxr.wtp.debug(
2051 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2052 f"with items that are not LISTs",
2053 sortid="page/1511/20230119",
2054 )
2055 continue
2056 for item in sublist.children:
2057 if not ( 2057 ↛ 2061line 2057 didn't jump to line 2061 because the condition on line 2057 was never true
2058 isinstance(item, WikiNode)
2059 and item.kind == NodeKind.LIST_ITEM
2060 ):
2061 continue
2062 # copy sense_base to prevent cross-contamination between
2063 # subglosses and other subglosses and superglosses
2064 sense_base2 = copy.deepcopy(sense_base)
2065 if parse_sense_node(item, sense_base2, pos): 2065 ↛ 2056line 2065 didn't jump to line 2056 because the condition on line 2065 was always true
2066 added = True
2068 # Capture examples.
2069 # This is called after the recursive calls above so that
2070 # sense_base is not contaminated with meta-data from
2071 # example entries for *this* gloss.
2072 examples = []
2073 if wxr.config.capture_examples: 2073 ↛ 2077line 2073 didn't jump to line 2077 because the condition on line 2073 was always true
2074 examples = extract_examples(others, sense_base)
2076 # push_sense() succeeded somewhere down-river, so skip this level
2077 if added:
2078 if examples:
2079 # this higher-up gloss has examples that we do not want to skip
2080 wxr.wtp.debug(
2081 "'{}[...]' gloss has examples we want to keep, "
2082 "but there are subglosses.".format(repr(rawgloss[:30])),
2083 sortid="page/1498/20230118",
2084 )
2085 else:
2086 return True
2088 # Some entries, e.g., "iacebam", have weird sentences in quotes
2089 # after the gloss, but these sentences don't seem to be intended
2090 # as glosses. Skip them.
2091 indexed_subglosses = list(
2092 (i, gl)
2093 for i, gl in enumerate(subglosses)
2094 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2095 )
2097 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2097 ↛ 2098line 2097 didn't jump to line 2098 because the condition on line 2097 was never true
2098 gl = indexed_subglosses[0][1].strip()
2099 if gl.endswith(":"):
2100 gl = gl[:-1].strip()
2101 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2102 if parsed is not None:
2103 infl_tags, infl_dts = parsed
2104 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2105 # Interpret others as a particular form under
2106 # "inflection of"
2107 data_extend(sense_base, "tags", infl_tags)
2108 data_extend(sense_base, "form_of", infl_dts)
2109 indexed_subglosses = indexed_subglosses[1:]
2110 elif not infl_dts:
2111 data_extend(sense_base, "tags", infl_tags)
2112 indexed_subglosses = indexed_subglosses[1:]
2114 # Create senses for remaining subglosses
2115 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2116 gloss = gloss.strip()
2117 if not gloss and len(indexed_subglosses) > 1: 2117 ↛ 2118line 2117 didn't jump to line 2118 because the condition on line 2117 was never true
2118 continue
2119 # Push a new sense (if the last one is not empty)
2120 if push_sense(): 2120 ↛ 2121line 2120 didn't jump to line 2121 because the condition on line 2120 was never true
2121 added = True
2122 # if gloss not in sense_data.get("raw_glosses", ()):
2123 # data_append(sense_data, "raw_glosses", gloss)
2124 if i == 0 and examples:
2125 # In a multi-line gloss, associate examples
2126 # with only one of them.
2127 # XXX or you could use gloss_i == len(indexed_subglosses)
2128 # to associate examples with the *last* one.
2129 data_extend(sense_data, "examples", examples)
2130 if gloss.startswith("; ") and gloss_i > 0: 2130 ↛ 2131line 2130 didn't jump to line 2131 because the condition on line 2130 was never true
2131 gloss = gloss[1:].strip()
2132 # If the gloss starts with †, mark as obsolete
2133 if gloss.startswith("^†"): 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true
2134 data_append(sense_data, "tags", "obsolete")
2135 gloss = gloss[2:].strip()
2136 elif gloss.startswith("^‡"): 2136 ↛ 2137line 2136 didn't jump to line 2137 because the condition on line 2136 was never true
2137 data_extend(sense_data, "tags", ["obsolete", "historical"])
2138 gloss = gloss[2:].strip()
2139 # Copy data for all senses to this sense
2140 for k, v in sense_base.items():
2141 if isinstance(v, (list, tuple)):
2142 if k != "tags":
2143 # Tags handled below (countable/uncountable special)
2144 data_extend(sense_data, k, v)
2145 else:
2146 assert k not in ("tags", "categories", "topics")
2147 sense_data[k] = v # type:ignore[literal-required]
2148 # Parse the gloss for this particular sense
2149 m = QUALIFIERS_RE.match(gloss)
2150 # (...): ... or (...(...)...): ...
2151 if m:
2152 parse_sense_qualifier(wxr, m.group(1), sense_data)
2153 gloss = gloss[m.end() :].strip()
2155 # Remove common suffix "[from 14th c.]" and similar
2156 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2158 # Check to make sure we don't have unhandled list items in gloss
2159 ofs = max(gloss.find("#"), gloss.find("* "))
2160 if ofs > 10 and "(#)" not in gloss: 2160 ↛ 2161line 2160 didn't jump to line 2161 because the condition on line 2160 was never true
2161 wxr.wtp.debug(
2162 "gloss may contain unhandled list items: {}".format(gloss),
2163 sortid="page/1412",
2164 )
2165 elif "\n" in gloss: 2165 ↛ 2166line 2165 didn't jump to line 2166 because the condition on line 2165 was never true
2166 wxr.wtp.debug(
2167 "gloss contains newline: {}".format(gloss),
2168 sortid="page/1416",
2169 )
2171 # Kludge, some glosses have a comma after initial qualifiers in
2172 # parentheses
2173 if gloss.startswith((",", ":")): 2173 ↛ 2174line 2173 didn't jump to line 2174 because the condition on line 2173 was never true
2174 gloss = gloss[1:]
2175 gloss = gloss.strip()
2176 if gloss.endswith(":"): 2176 ↛ 2177line 2176 didn't jump to line 2177 because the condition on line 2176 was never true
2177 gloss = gloss[:-1].strip()
2178 if gloss.startswith("N. of "): 2178 ↛ 2179line 2178 didn't jump to line 2179 because the condition on line 2178 was never true
2179 gloss = "Name of " + gloss[6:]
2180 if gloss.startswith("†"): 2180 ↛ 2181line 2180 didn't jump to line 2181 because the condition on line 2180 was never true
2181 data_append(sense_data, "tags", "obsolete")
2182 gloss = gloss[1:]
2183 elif gloss.startswith("^†"): 2183 ↛ 2184line 2183 didn't jump to line 2184 because the condition on line 2183 was never true
2184 data_append(sense_data, "tags", "obsolete")
2185 gloss = gloss[2:]
2187 # Copy tags from sense_base if any. This will not copy
2188 # countable/uncountable if either was specified in the sense,
2189 # as sometimes both are specified in word head but only one
2190 # in individual senses.
2191 countability_tags = []
2192 base_tags = sense_base.get("tags", ())
2193 sense_tags = sense_data.get("tags", ())
2194 for tag in base_tags:
2195 if tag in ("countable", "uncountable"):
2196 if tag not in countability_tags: 2196 ↛ 2198line 2196 didn't jump to line 2198 because the condition on line 2196 was always true
2197 countability_tags.append(tag)
2198 continue
2199 if tag not in sense_tags:
2200 data_append(sense_data, "tags", tag)
2201 if countability_tags:
2202 if ( 2202 ↛ 2211line 2202 didn't jump to line 2211 because the condition on line 2202 was always true
2203 "countable" not in sense_tags
2204 and "uncountable" not in sense_tags
2205 ):
2206 data_extend(sense_data, "tags", countability_tags)
2208 # If outer gloss specifies a form-of ("inflection of", see
2209 # aquamarine/German), try to parse the inner glosses as
2210 # tags for an inflected form.
2211 if "form-of" in sense_base.get("tags", ()):
2212 parsed = parse_alt_or_inflection_of(
2213 wxr, gloss, gloss_template_args
2214 )
2215 if parsed is not None: 2215 ↛ 2221line 2215 didn't jump to line 2221 because the condition on line 2215 was always true
2216 infl_tags, infl_dts = parsed
2217 if not infl_dts and infl_tags: 2217 ↛ 2221line 2217 didn't jump to line 2221 because the condition on line 2217 was always true
2218 # Interpret as a particular form under "inflection of"
2219 data_extend(sense_data, "tags", infl_tags)
2221 if not gloss: 2221 ↛ 2222line 2221 didn't jump to line 2222 because the condition on line 2221 was never true
2222 data_append(sense_data, "tags", "empty-gloss")
2223 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 2223 ↛ 2224line 2223 didn't jump to line 2224 because the condition on line 2223 was never true
2224 if (
2225 gloss_i == 0
2226 and len(sense_data.get("glosses", tuple())) >= 1
2227 ):
2228 # If we added a "high-level gloss" from rawgloss, but this
2229 # is that same gloss_i, add this instead of the raw_gloss
2230 # from before if they're different: the rawgloss was not
2231 # cleaned exactly the same as this later gloss
2232 sense_data["glosses"][-1] = gloss
2233 else:
2234 # Add the gloss for the sense.
2235 data_append(sense_data, "glosses", gloss)
2237 # Kludge: there are cases (e.g., etc./Swedish) where there are
2238 # two abbreviations in the same sense, both generated by the
2239 # {{abbreviation of|...}} template. Handle these with some magic.
2240 position = 0
2241 split_glosses = []
2242 for m in re.finditer(r"Abbreviation of ", gloss): 2242 ↛ 2243line 2242 didn't jump to line 2243 because the loop on line 2242 never started
2243 if m.start() != position:
2244 split_glosses.append(gloss[position : m.start()])
2245 position = m.start()
2246 split_glosses.append(gloss[position:])
2247 for gloss in split_glosses:
2248 # Check if this gloss describes an alt-of or inflection-of
2249 if (
2250 lang_code != "en"
2251 and " " not in gloss
2252 and distw([word], gloss) < 0.3
2253 ):
2254 # Don't try to parse gloss if it is one word
2255 # that is close to the word itself for non-English words
2256 # (probable translations of a tag/form name)
2257 continue
2258 parsed = parse_alt_or_inflection_of(
2259 wxr, gloss, gloss_template_args
2260 )
2261 if parsed is None:
2262 continue
2263 tags, dts = parsed
2264 if not dts and tags: 2264 ↛ 2267line 2264 didn't jump to line 2267 because the condition on line 2264 was always true
2265 data_extend(sense_data, "tags", tags)
2266 continue
2267 for dt in dts: # type:ignore[union-attr]
2268 ftags = list(tag for tag in tags if tag != "form-of")
2269 if "alt-of" in tags:
2270 data_extend(sense_data, "tags", ftags)
2271 data_append(sense_data, "alt_of", dt)
2272 elif "compound-of" in tags:
2273 data_extend(sense_data, "tags", ftags)
2274 data_append(sense_data, "compound_of", dt)
2275 elif "synonym-of" in tags:
2276 data_extend(dt, "tags", ftags)
2277 data_append(sense_data, "synonyms", dt)
2278 elif tags and dt.get("word", "").startswith("of "):
2279 dt["word"] = dt["word"][3:]
2280 data_append(sense_data, "tags", "form-of")
2281 data_extend(sense_data, "tags", ftags)
2282 data_append(sense_data, "form_of", dt)
2283 elif "form-of" in tags:
2284 data_extend(sense_data, "tags", tags)
2285 data_append(sense_data, "form_of", dt)
2287 if len(sense_data) == 0:
2288 if len(sense_base.get("tags", [])) == 0: 2288 ↛ 2290line 2288 didn't jump to line 2290 because the condition on line 2288 was always true
2289 del sense_base["tags"]
2290 sense_data.update(sense_base)
2291 if push_sense(): 2291 ↛ 2295line 2291 didn't jump to line 2295 because the condition on line 2291 was always true
2292 # push_sense succeeded in adding a sense to pos_data
2293 added = True
2294 # print("PARSE_SENSE DONE:", pos_datas[-1])
2295 return added
def parse_inflection(
    node: WikiNode, section: str, pos: Optional[str]
) -> None:
    """Extract inflection data (declension, conjugation) from ``node``.

    The children of ``node`` are converted back to wikitext, the text is
    cut into one chunk per top-level template, and every chunk is
    expanded and parsed.  During expansion, templates whose name matches
    the inflection-name pattern are recorded, with their cleaned
    arguments, under "inflection_templates" in ``pos_data``; these
    parameters are very useful for applications that need to learn the
    inflection classes and generate inflected forms.  When
    ``wxr.config.capture_inflections`` is set, each parsed chunk is also
    passed to ``parse_inflection_section``.

    When ``pos`` is None nothing is parsed; a debug message is logged
    instead.
    """
    assert isinstance(node, WikiNode)
    assert isinstance(section, str)
    assert pos is None or isinstance(pos, str)

    if pos is None:
        wxr.wtp.debug(
            "inflection table outside part-of-speech", sortid="page/1812"
        )
        return

    def inflection_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        # Panel templates are replaced by the empty string.
        if is_panel_template(wxr, name):
            return ""
        # Names listed here are exceptions to the generic pattern below
        # and are never captured.
        if name in ("is-u-mutation",):
            return None
        looks_like_inflection = re.search(
            r"-(conj|decl|ndecl|adecl|infl|conjugation|"
            r"declension|inflection|mut|mutation)($|-)",
            name,
        )
        if looks_like_inflection:
            cleaned_args = clean_template_args(wxr, ht)
            data_append(
                pos_data,
                "inflection_templates",
                {"name": name, "args": cleaned_args},
            )
        # No replacement text is supplied for any non-panel template.
        return None

    # Turn the subtree back into wikitext; it is expanded and parsed
    # again below so that templates can be captured in the process.
    wikitext = wxr.wtp.node_to_wikitext(node.children)

    # Cut the wikitext into one chunk per top-level template.  Because
    # of the capturing group, runs of braces come back as pieces of
    # their own, e.g. ["{{", "template", "}}"].
    pieces = re.split("({{+|}}+)", wikitext)
    chunks: list[list[str]] = []
    # Depth is counted in SINGLE braces rather than in "{{"/"}}" pairs,
    # because "{{{ }}}" may occur next to the usual "{{ }}".  Delimiters
    # should be balanced whichever form is used, and only the outermost
    # ones matter here, so summing the lengths of the brace runs is
    # enough.
    depth = 0

    if len(pieces) > 1:
        current: list[str] = []
        # Kludge: text before the first template stays with the first
        # template; any later text goes with the template preceding it.
        seen_template = False
        for piece in pieces:
            if seen_template and piece.startswith("\n; "):
                # A "\n; " line after a template opens a new chunk.
                seen_template = False
                chunks.append(current)
                current = [piece]
                continue
            if piece.startswith("{{"):
                if seen_template and depth == 0:
                    # A new top-level template starts a new chunk.
                    chunks.append(current)
                    current = []
                seen_template = True
                depth += len(piece)
            elif piece.startswith("}}"):
                depth -= len(piece)
                if depth < 0:
                    wxr.wtp.error(
                        "Negatively nested braces, "
                        "couldn't split inflection templates, "
                        "{}/{} section {}".format(word, language, section),
                        sortid="page/1871",
                    )
                    chunks = []  # use whole text
                    break
            current.append(piece)
        if current:
            # Dangling chunk.  Chunks are cut at "{{" because the
            # parser prefers to associate text outside tables with the
            # preceding table, so whatever precedes a "{{" belongs to
            # the previous template.
            chunks.append(current)

    if chunks:
        texts = ["".join(chunk) for chunk in chunks]
    else:
        texts = [wikitext]
    if depth != 0:
        wxr.wtp.error(
            "Template nesting error: "
            "template_nesting = {} "
            "couldn't split inflection templates, "
            "{}/{} section {}".format(depth, word, language, section),
            sortid="page/1896",
        )
        texts = [wikitext]

    for chunk_text in texts:
        tree = wxr.wtp.parse(
            chunk_text, expand_all=True, template_fn=inflection_template_fn
        )
        if not wxr.config.capture_inflections:
            continue

        # Parse inflection tables from the chunk; the data is stored
        # under "forms".  The name of the first template in the chunk,
        # if there is one, seeds the table context.
        name_match = re.search(r"{{([^}{|]+)\|?", chunk_text)
        tablecontext = (
            TableContext(name_match.group(1)) if name_match else None
        )
        parse_inflection_section(
            wxr,
            pos_data,
            word,
            language,
            pos,
            section,
            tree,
            tablecontext=tablecontext,
        )
def get_subpage_section(
    title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]]
) -> Optional[Union[WikiNode, str]]:
    """Load the subpage ``word + "/" + subtitle`` and find a section in it.

    ``seq`` is the chain of section titles to follow (language,
    part-of-speech, section title); each is compared case-insensitively
    with the cleaned heading of the level nodes met while descending.
    The node at which the last title matched is returned (the parsed
    root if ``seq`` is empty).  None is returned when the subpage has
    no body or the chain cannot be matched.  This is used for finding
    translations and other sections on subpages.
    """
    assert isinstance(language, str)
    assert isinstance(title, str)
    assert isinstance(subtitle, str)
    assert isinstance(seq, (list, tuple))
    assert all(isinstance(part, str) for part in seq)

    body = wxr.wtp.get_page_body(word + "/" + subtitle, 0)
    if body is None:
        wxr.wtp.error(
            "/translations not found despite "
            "{{see translation subpage|...}}",
            sortid="page/1934",
        )
        return None

    def find_section(
        candidate: Union[str, WikiNode],
        remaining: Union[list[str], tuple[str, ...]],
    ) -> Optional[Union[str, WikiNode]]:
        # Nothing left to match: this node is the answer.
        if not remaining:
            return candidate
        # Plain text cannot contain sections.
        if not isinstance(candidate, WikiNode):
            return None
        if candidate.kind in LEVEL_KINDS:
            heading = clean_node(wxr, None, candidate.largs[0])
            if heading.lower() == remaining[0].lower():
                # Heading matched; look for the rest below this node.
                remaining = remaining[1:]
                if not remaining:
                    return candidate
        for child in candidate.children:
            found = find_section(child, remaining)
            if found is not None:
                return found
        return None

    root = wxr.wtp.parse(
        body,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    assert root.kind == NodeKind.ROOT
    section = find_section(root, seq)
    if section is None:
        wxr.wtp.debug(
            "Failed to find subpage section {}/{} seq {}".format(
                title, subtitle, seq
            ),
            sortid="page/1963",
        )
    return section
2504 def parse_linkage(
2505 data: WordData, field: str, linkagenode: WikiNode
2506 ) -> None:
2507 assert isinstance(data, dict)
2508 assert isinstance(field, str)
2509 assert isinstance(linkagenode, WikiNode)
2510 # if field == "synonyms":
2511 # print("field", field)
2512 # print("data", data)
2513 # print("children:")
2514 # print(linkagenode.children)
2515 if not wxr.config.capture_linkages: 2515 ↛ 2516line 2515 didn't jump to line 2516 because the condition on line 2515 was never true
2516 return
2517 have_panel_template = False
2518 toplevel_text = []
2519 next_navframe_sense = None # Used for "(sense):" before NavFrame
2521 def parse_linkage_item(
2522 contents: list[Union[str, WikiNode]],
2523 field: str,
2524 sense: Optional[str] = None,
2525 ):
# Parses one linkage item.  The item's text is gathered from `contents`
# by item_recurse() into `parts`, cleaned with clean_node(), and handed
# to parse_linkage_item_text() together with the ruby pairs, URLs and
# link texts collected on the way; the result of that call is returned.
# NOTE(review): this listing does not preserve indentation, so the
# nesting of the statements below cannot be confirmed from here.
2526 assert isinstance(contents, (list, tuple))
2527 assert isinstance(field, str)
2528 assert sense is None or isinstance(sense, str)
2530 # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
2531 # .format(field, sense, contents))
# Accumulators filled by item_recurse(): text fragments, ruby pairs
# returned by parse_ruby(), and URLs taken from URL nodes.
2533 parts: list[str] = []
2534 ruby: list[tuple[str, str]] = []
2535 urls: list[str] = []
2536 # data about link text; this is used to skip splitting on
2537 # linkage text items that contain stuff like commas; for
2538 # example "Hunde, die bellen, beißen nicht" in article
2539 # beißen is split into "Hunde", "die bellen" etc.
2540 # We take that link text and use it, eventually,
2541 # in split_at_comma_semi to skip splitting on those
2542 # commas.
2543 links_that_should_not_be_split: list[str] = []
# Walks `contents`, appending plain strings to `parts` and dispatching
# on the kind of every other node.  `italic` is only passed along to
# the recursive calls.
2545 def item_recurse(
2546 contents: list[Union[str, WikiNode]], italic=False
2547 ) -> None:
2548 assert isinstance(contents, (list, tuple))
2549 nonlocal sense
2550 nonlocal ruby
2551 nonlocal parts
2552 # print("ITEM_RECURSE:", contents)
2553 for node in contents:
2554 if isinstance(node, str): 2554 ↛ 2557line 2554 didn't jump to line 2557 because the condition on line 2554 was always true
2555 parts.append(node)
2556 continue
2557 kind = node.kind
2558 # print("ITEM_RECURSE KIND:", kind,
2559 # node.sarg if node.sarg else node.largs)
# Nested list: the text collected so far, minus a trailing ":" and
# enclosing parentheses, is used as the sense for the sublist (falling
# back to `sense` when it is empty or equals TRANSLATIONS_TITLE), and
# `parts` is reset.  Without collected text the sublist is parsed with
# the current `sense`.
2560 if kind == NodeKind.LIST:
2561 if parts:
2562 sense1: Optional[str]
2563 sense1 = clean_node(wxr, None, parts)
2564 if sense1.endswith(":"):
2565 sense1 = sense1[:-1].strip()
2566 if sense1.startswith("(") and sense1.endswith(")"):
2567 sense1 = sense1[1:-1].strip()
2568 if sense1.lower() == TRANSLATIONS_TITLE:
2569 sense1 = None
2570 # print("linkage item_recurse LIST sense1:", sense1)
2571 parse_linkage_recurse(
2572 node.children, field, sense=sense1 or sense
2573 )
2574 parts = []
2575 else:
2576 parse_linkage_recurse(node.children, field, sense)
# Tables, rows and cells are handed to parse_linkage_recurse(); header
# cells and captions are skipped.
2577 elif kind in (
2578 NodeKind.TABLE,
2579 NodeKind.TABLE_ROW,
2580 NodeKind.TABLE_CELL,
2581 ):
2582 parse_linkage_recurse(node.children, field, sense)
2583 elif kind in (
2584 NodeKind.TABLE_HEADER_CELL,
2585 NodeKind.TABLE_CAPTION,
2586 ):
2587 continue
# HTML element: gallery/ref/cite/caption tags and elements with the
# "interProject" class are skipped; <ruby> is parsed with parse_ruby()
# and <math> is cleaned into `parts`.  "NavFrame" elements go to
# parse_linkage_recurse(), the rest is descended into.
2588 elif kind == NodeKind.HTML:
2589 classes = (node.attrs.get("class") or "").split()
2590 if node.sarg in ("gallery", "ref", "cite", "caption"):
2591 continue
2592 elif node.sarg == "ruby":
2593 rb = parse_ruby(wxr, node)
2594 if rb:
2595 ruby.append(rb)
2596 parts.append(rb[0])
2597 continue
2598 elif node.sarg == "math":
2599 parts.append(clean_node(wxr, None, node))
2600 continue
2601 elif "interProject" in classes:
2602 continue # These do not seem to be displayed
2603 if "NavFrame" in classes:
2604 parse_linkage_recurse(node.children, field, sense)
2605 else:
2606 item_recurse(node.children, italic=italic)
2607 elif kind == NodeKind.ITALIC:
2608 item_recurse(node.children, italic=True)
# Link: a target starting with one of the prefixes returned by
# ns_title_prefix_tuple() for "Category" or "File" marks the link as
# ignored.  Otherwise the last link argument is used as the text; with
# a single argument a leading ":" is dropped from it.
2609 elif kind == NodeKind.LINK:
2610 ignore = False
2611 if isinstance(node.largs[0][0], str):
2612 v1 = node.largs[0][0].strip().lower()
2613 if v1.startswith(
2614 ns_title_prefix_tuple(wxr, "Category", True)
2615 + ns_title_prefix_tuple(wxr, "File", True)
2616 ):
2617 ignore = True
2618 if not ignore:
2619 v = node.largs[-1]
2620 if (
2621 len(node.largs) == 1
2622 and len(v) > 0
2623 and isinstance(v[0], str)
2624 and v[0][0] == ":"
2625 ):
2626 v = [v[0][1:]] + list(v[1:]) # type:ignore
# Link text that is not purely alphanumeric is remembered in
# links_that_should_not_be_split.
2627 if isinstance(v[0], str) and not v[0].isalnum():
2628 links_that_should_not_be_split.append(
2629 "".join(v[0])
2630 ) # type: ignore
2631 item_recurse(v, italic=italic)
# URL node: with a single argument the (naked) URL is only recorded in
# `urls`; with two arguments the URL is taken from the first one, and
# the last argument is descended into as text.
2632 elif kind == NodeKind.URL:
2633 if len(node.largs) < 2 and node.largs:
2634 # Naked url captured
2635 urls.extend(node.largs[-1]) # type:ignore[arg-type]
2636 continue
2637 if len(node.largs) == 2:
2638 # Url from link with text
2639 urls.append(node.largs[0][-1]) # type:ignore[arg-type]
2640 # print(f"{node.largs=!r}")
2641 # print("linkage recurse URL {}".format(node))
2642 item_recurse(node.largs[-1], italic=italic)
2643 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD):
2644 item_recurse(node.children, italic=italic)
# Any other node kind is only reported through wxr.wtp.debug().
2645 else:
2646 wxr.wtp.debug(
2647 "linkage item_recurse unhandled {}: {}".format(
2648 node.kind, node
2649 ),
2650 sortid="page/2073",
2651 )
2653 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
2654 # .format(contents))
# Collect the item text, clean it and pass everything on.  Empty `urls`
# and `links_that_should_not_be_split` lists are passed as None.
2656 item_recurse(contents)
2657 item = clean_node(wxr, None, parts)
2658 # print("LINKAGE ITEM CONTENTS:", parts)
2659 # print("CLEANED ITEM: {!r}".format(item))
2660 # print(f"URLS {urls=!r}")
2662 return parse_linkage_item_text(
2663 wxr,
2664 word,
2665 data,
2666 field,
2667 item,
2668 sense,
2669 ruby,
2670 pos_datas,
2671 is_reconstruction,
2672 urls or None,
2673 links_that_should_not_be_split or None,
2674 )
2676 def parse_linkage_recurse(
2677 contents: list[Union[WikiNode, str]],
2678 field: str,
2679 sense: Optional[str],
2680 ) -> None:
2681 assert isinstance(contents, (list, tuple))
2682 assert sense is None or isinstance(sense, str)
2683 nonlocal next_navframe_sense
2684 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
2685 for node in contents:
2686 if isinstance(node, str):
2687 # Ignore top-level text, generally comments before the
2688 # linkages list. However, if no linkages are found, then
2689 # use this for linkages (not all words use bullet points
2690 # for linkages).
2691 toplevel_text.append(node)
2692 continue
2693 assert isinstance(node, WikiNode)
2694 kind = node.kind
2695 # print("PARSE_LINKAGE_RECURSE CHILD", kind)
2696 if kind == NodeKind.LIST:
2697 parse_linkage_recurse(node.children, field, sense)
2698 elif kind == NodeKind.LIST_ITEM: 2698 ↛ 2705line 2698 didn't jump to line 2705 because the condition on line 2698 was always true
2699 v = parse_linkage_item(node.children, field, sense)
2700 if v: 2700 ↛ 2704line 2700 didn't jump to line 2704 because the condition on line 2700 was never true
2701 # parse_linkage_item() can return a value that should
2702 # be used as the sense for the follow-on linkages,
2703 # which are typically provided in a table (see 滿)
2704 next_navframe_sense = v
2705 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
2706 parse_linkage_recurse(node.children, field, sense)
2707 elif kind == NodeKind.TABLE_CELL:
2708 parse_linkage_item(node.children, field, sense)
2709 elif kind in (
2710 NodeKind.TABLE_CAPTION,
2711 NodeKind.TABLE_HEADER_CELL,
2712 NodeKind.PREFORMATTED,
2713 NodeKind.BOLD,
2714 ):
2715 continue
2716 elif kind == NodeKind.HTML:
2717 # Recurse to process inside the HTML for most tags
2718 if node.sarg in ("gallery", "ref", "cite", "caption"):
2719 continue
2720 classes = (node.attrs.get("class") or "").split()
2721 if node.sarg == "li":
2722 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
2723 v = parse_linkage_item(node.children, field, sense)
2724 if v:
2725 next_navframe_sense = v
2726 elif "qualifier-content" in classes:
2727 sense1 = clean_node(wxr, None, node.children)
2728 if sense1.endswith(":"):
2729 sense1 = sense1[:-1].strip()
2730 if sense and sense1:
2731 wxr.wtp.debug(
2732 "linkage qualifier-content on multiple "
2733 "levels: {!r} and {!r}".format(sense, sense1),
2734 sortid="page/2170",
2735 )
2736 parse_linkage_recurse(node.children, field, sense1)
2737 elif "NavFrame" in classes:
2738 # NavFrame uses previously assigned next_navframe_sense
2739 # (from a "(sense):" item) and clears it afterwards
2740 parse_linkage_recurse(
2741 node.children, field, sense or next_navframe_sense
2742 )
2743 next_navframe_sense = None
2744 else:
2745 parse_linkage_recurse(node.children, field, sense)
2746 elif kind in LEVEL_KINDS:
2747 # Just recurse to any possible subsections
2748 parse_linkage_recurse(node.children, field, sense)
2749 elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
2750 # Skip these on top level; at least sometimes bold is
2751 # used for indicating a subtitle
2752 continue
2753 elif kind == NodeKind.LINK:
2754 # Recurse into the last argument
2755 # Apparently ":/" is used as a link to "/", so strip
2756 # initial value
2757 parse_linkage_recurse(node.largs[-1], field, sense)
2758 else:
2759 wxr.wtp.debug(
2760 "parse_linkage_recurse unhandled {}: {}".format(
2761 kind, node
2762 ),
2763 sortid="page/2196",
2764 )
2766 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
2767 nonlocal have_panel_template
2768 if is_panel_template(wxr, name):
2769 have_panel_template = True
2770 return ""
2771 return None
2773 def parse_zh_synonyms(
2774 parsed: list[Union[WikiNode, str]],
2775 data: list[LinkageData],
2776 hdrs: list[str],
2777 root_word: str,
2778 ) -> None:
2779 """Parses Chinese dialectal synonyms tables"""
# Walks `parsed` recursively; non-WikiNode items are ignored.  For each
# TABLE_ROW the cleaned text is split on newlines: the last line holds
# the comma-separated words, the lines before it are headers that are
# converted to tags.  One {"word", "tags"} entry per word is appended
# to `data`.
# NOTE(review): this listing does not preserve indentation, so the
# nesting of the statements below cannot be confirmed from here.
2780 for item in parsed:
2781 if isinstance(item, WikiNode):
2782 if item.kind == NodeKind.TABLE_ROW:
2783 cleaned = clean_node(wxr, None, item.children)
2784 # print("cleaned:", repr(cleaned))
# Rows mentioning "Variety", "Location" or "Words" are skipped
# (presumably the table's title rows -- TODO confirm).
2785 if any(
2786 [
2787 "Variety" in cleaned,
2788 "Location" in cleaned,
2789 "Words" in cleaned,
2790 ]
2791 ):
2792 pass
2793 else:
2794 split = cleaned.split("\n")
2795 new_hdrs = split[:-1]
# With exactly two header lines, the first one replaces `hdrs` (the
# rebinding stays in effect for later rows of this call) and is removed
# from new_hdrs.
2796 if len(new_hdrs) == 2:
2797 hdrs = [new_hdrs[0]]
2798 new_hdrs.pop(0)
2799 combined_hdrs = [x.strip() for x in hdrs + new_hdrs]
2800 tags = []
2801 words = split[-1].split(",")
# Each header is turned into comma-separated candidate tags: "(" becomes
# a separator, ")" is dropped, and "N."/"S." are spelled out.
2802 for hdr in combined_hdrs:
2803 hdr = hdr.replace("(", ",")
2804 hdr = hdr.replace(")", "")
2805 hdr = hdr.replace("N.", "Northern,")
2806 hdr = hdr.replace("S.", "Southern,")
2807 new = hdr.split(",")
# Candidates are stripped and spaces replaced by "-"; they are accepted
# if found in valid_tags, else expanded through zh_tag_lookup.
2808 for tag in sorted(new):
2809 tag = tag.strip()
2810 tag = tag.replace(" ", "-")
2811 if tag in valid_tags:
2812 tags.append(tag)
2813 else:
2814 if tag in zh_tag_lookup:
2815 tags.extend(zh_tag_lookup[tag])
2816 else:
# NOTE(review): unknown tags are reported with print() to stdout rather
# than through wxr.wtp.debug().
2817 print(
2818 f"MISSING ZH SYNONYM TAG for "
2819 f"root {root_word}, word "
2820 f"{words}: {tag}"
2821 )
2822 sys.stdout.flush()
# NOTE(review): every entry appended here references the same `tags`
# list object.  The loop variable `word` is local to this function.
2824 for word in words:
2825 data.append(
2826 {"word": word.strip(), "tags": tags}
2827 )
# An HTML node whose cleaned text contains "Synonyms of" supplies a new
# root_word (that text with "Synonyms of " removed).
2828 elif item.kind == NodeKind.HTML:
2829 cleaned = clean_node(wxr, None, item.children)
2830 if "Synonyms of" in cleaned:
2831 cleaned = cleaned.replace("Synonyms of ", "")
2832 root_word = cleaned
2833 parse_zh_synonyms(item.children, data, hdrs, root_word)
# Any other node kind is simply descended into.
2834 else:
2835 parse_zh_synonyms(item.children, data, hdrs, root_word)
def parse_zh_synonyms_list(
    parsed: list[Union[WikiNode, str]],
    data: list[LinkageData],
    hdrs: list[str],
    root_word: str,
) -> None:
    """Parses Chinese dialectal synonyms tables (list format).

    Walks ``parsed`` recursively.  Every LIST_ITEM that is not a title
    item (one mentioning "Variety", "Location" or "Words") is cleaned to
    text; parentheses are turned into comma separators, the first field
    holds "/"-separated words and the remaining fields are converted to
    tags or, at most once, taken as the romanization.  One entry per
    word is appended to ``data``.

    ``hdrs`` is only passed along in the recursion and ``root_word`` is
    only used in diagnostics.
    """
    for item in parsed:
        if not isinstance(item, WikiNode):
            continue
        if item.kind != NodeKind.LIST_ITEM:
            if item.kind == NodeKind.HTML:
                cleaned = clean_node(wxr, None, item.children)
                if "Synonyms of" in cleaned:
                    root_word = cleaned.replace("Synonyms of ", "")
            parse_zh_synonyms_list(item.children, data, hdrs, root_word)
            continue

        cleaned = clean_node(wxr, None, item.children)
        if any(
            title in cleaned for title in ("Variety", "Location", "Words")
        ):
            continue

        cleaned = cleaned.replace("(", ",").replace(")", "")
        split = cleaned.split(",")
        # skip empty words / titles
        if not split[0].strip():
            continue
        words = split[0].split("/")
        new_hdrs = [x.strip() for x in split[1:]]
        tags: list[str] = []
        roman: Optional[str] = None
        for tag in sorted(new_hdrs):
            if tag in valid_tags:
                tags.append(tag)
            elif tag in zh_tag_lookup:
                tags.extend(zh_tag_lookup[tag])
            elif classify_desc(tag) == "romanization" and roman is None:
                roman = tag
            else:
                # Report through the normal message channel instead of
                # printing to stdout, which may carry output data
                wxr.wtp.debug(
                    f"MISSING ZH SYNONYM TAG "
                    f"(possibly pinyin) - root "
                    f"{root_word}, word {words}: {tag}",
                    sortid="page/2879",
                )

        for word in words:
            word = word.strip()
            if not word:
                # "a//b" or a trailing "/" must not yield an empty entry
                continue
            dt: LinkageData = {"word": word}
            if tags:
                # Copy so that entries do not share one mutable list
                dt["tags"] = list(tags)
            if roman is not None:
                dt["roman"] = roman
            data.append(dt)
def contains_kind(
    children: list[Union[WikiNode, str]], nodekind: NodeKind
) -> bool:
    """Returns True if any WikiNode in ``children``, or anywhere below
    one of them, has kind ``nodekind``.  Plain strings are ignored."""
    assert isinstance(children, list)
    return any(
        child.kind == nodekind or contains_kind(child.children, nodekind)
        for child in children
        if isinstance(child, WikiNode)
    )
2919 # Main body of parse_linkage()
2920 text = wxr.wtp.node_to_wikitext(linkagenode.children)
2921 parsed = wxr.wtp.parse(
2922 text, expand_all=True, template_fn=linkage_template_fn1
2923 )
2924 if field == "synonyms" and lang_code == "zh": 2924 ↛ 2925line 2924 didn't jump to line 2925 because the condition on line 2924 was never true
2925 synonyms: list[LinkageData] = []
2926 if contains_kind(parsed.children, NodeKind.LIST):
2927 parse_zh_synonyms_list(parsed.children, synonyms, [], "")
2928 else:
2929 parse_zh_synonyms(parsed.children, synonyms, [], "")
2930 # print(json.dumps(synonyms, indent=4, ensure_ascii=False))
2931 data_extend(data, "synonyms", synonyms)
2932 parse_linkage_recurse(parsed.children, field, None)
2933 if not data.get(field) and not have_panel_template: 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true
2934 text = "".join(toplevel_text).strip()
2935 if "\n" not in text and "," in text and text.count(",") > 3:
2936 if not text.startswith("See "):
2937 parse_linkage_item([text], field, None)
def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
    """Parses translations for a word.  This may also pull in translations
    from separate translation subpages.

    ``data`` is handed on to the item parser together with each
    translation item; ``xlatnode`` is the node of the translations
    section.  Nothing is done unless ``wxr.config.capture_translations``
    is set.
    """
    assert isinstance(data, dict)
    assert isinstance(xlatnode, WikiNode)
    if not wxr.config.capture_translations:
        return
    # Nodes and strings collected since the last reset, from which the
    # sense text is computed on demand.
    sense_parts: list[Union[WikiNode, str]] = []
    # Current sense text; None means "recompute from sense_parts".
    sense: Optional[str] = None

    def parse_translation_item(
        contents: list[Union[WikiNode, str]], lang: Optional[str] = None
    ) -> None:
        """Parses one translation list item.  ``lang`` is the language
        returned for the parent item when this is an item of a sublist."""
        nonlocal sense
        assert isinstance(contents, list)
        assert lang is None or isinstance(lang, str)

        langcode: Optional[str] = None
        if sense is None:
            sense = clean_node(wxr, data, sense_parts).strip()
            idx = sense.find("See also translations at")
            # str.find() returns -1 when there is no match and 0 when the
            # text starts with the marker; both >0 and 0 must be stripped.
            if idx >= 0:
                wxr.wtp.debug(
                    "Skipping translation see also: {}".format(sense),
                    sortid="page/2361",
                )
                sense = sense[:idx].strip()
            if sense.endswith(":"):
                sense = sense[:-1].strip()
            if sense.endswith("—"):
                sense = sense[:-1].strip()
        translations_from_template: list[str] = []

        def translation_item_template_fn(
            name: str, ht: TemplateArgs
        ) -> Optional[str]:
            """Records the language code and the translation argument of
            translation templates while the item is being cleaned."""
            nonlocal langcode
            if is_panel_template(wxr, name):
                return ""
            if name in ("t+check", "t-check", "t-needed"):
                # We ignore these templates.  They seem to have outright
                # garbage in some entries, and very varying formatting in
                # others.  These should be transitory and unreliable
                # anyway.
                return "__IGNORE__"
            if name in ("t", "t+", "t-simple", "tt", "tt+"):
                code = ht.get(1)
                if code:
                    if langcode and code != langcode:
                        wxr.wtp.debug(
                            "inconsistent language codes {} vs "
                            "{} in translation item: {!r} {}".format(
                                langcode, code, name, ht
                            ),
                            sortid="page/2386",
                        )
                    langcode = code
                tr = ht.get(2)
                if tr:
                    tr = clean_node(wxr, None, [tr])
                    translations_from_template.append(tr)
                return None
            if name == "t-egy":
                langcode = "egy"
                return None
            if name == "ttbc":
                code = ht.get(1)
                if code:
                    langcode = code
                return None
            if name == "trans-see":
                wxr.wtp.error(
                    "UNIMPLEMENTED trans-see template", sortid="page/2405"
                )
                return ""
            if name.endswith(("-top", "-bottom", "-mid")):
                return ""
            return None

        # Separate nested lists from the item's own content
        sublists = [
            x
            for x in contents
            if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
        ]
        contents = [
            x
            for x in contents
            if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
        ]

        item = clean_node(
            wxr, data, contents, template_fn=translation_item_template_fn
        )

        # Parse the translation item.
        if item:
            lang = parse_translation_item_text(
                wxr,
                word,
                data,
                item,
                sense,
                lang,
                langcode,
                translations_from_template,
                is_reconstruction,
            )

            # Handle sublists.  They are frequently used for different
            # scripts for the language and different variants of the
            # language.  We will include the lower-level header as a
            # tag in those cases.
            for listnode in sublists:
                assert listnode.kind == NodeKind.LIST
                for node in listnode.children:
                    if not isinstance(node, WikiNode):
                        continue
                    if node.kind == NodeKind.LIST_ITEM:
                        parse_translation_item(node.children, lang=lang)

    def parse_translation_template(node: WikiNode) -> None:
        """Expands a template found directly in the translations section,
        using the template hook to track senses and to follow
        ``{{see translation subpage|...}}``."""
        assert isinstance(node, WikiNode)

        def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            nonlocal sense_parts
            nonlocal sense
            if is_panel_template(wxr, name):
                return ""
            if name == "see also":
                # XXX capture
                # XXX for example, "/" has top-level list containing
                # see also items.  So also should parse those.
                return ""
            if name == "trans-see":
                # XXX capture
                return ""
            if name == "see translation subpage":
                sense_parts = []
                sense = None
                sub = ht.get(1, "")
                m = (
                    re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub)
                    if sub
                    else None
                )
                etym = ""
                etym_numbered = ""
                pos = ""
                if m:
                    etym_numbered = m.group(1)
                    etym = m.group(2)
                    pos = m.group(3)
                if not sub:
                    wxr.wtp.debug(
                        "no part-of-speech in "
                        "{{see translation subpage|...}}, "
                        "defaulting to just wxr.wtp.section "
                        "(= language)",
                        sortid="page/2468",
                    )
                    # seq sent to get_subpage_section without sub and pos
                    seq = [
                        language,
                        TRANSLATIONS_TITLE,
                    ]
                elif (
                    m
                    and etym.lower().strip() in ETYMOLOGY_TITLES
                    and pos.lower() in POS_TITLES
                ):
                    seq = [
                        language,
                        etym_numbered,
                        pos,
                        TRANSLATIONS_TITLE,
                    ]
                elif sub.lower() in POS_TITLES:
                    # seq with sub but not pos
                    seq = [
                        language,
                        sub,
                        TRANSLATIONS_TITLE,
                    ]
                else:
                    # seq with sub and pos
                    pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if pos.lower() not in POS_TITLES:
                        wxr.wtp.debug(
                            "unhandled see translation subpage: "
                            "language={} sub={} "
                            "wxr.wtp.subsection={}".format(
                                language, sub, wxr.wtp.subsection
                            ),
                            sortid="page/2478",
                        )
                    seq = [language, sub, pos, TRANSLATIONS_TITLE]
                subnode = get_subpage_section(
                    wxr.wtp.title or "MISSING_TITLE",
                    TRANSLATIONS_TITLE,
                    seq,
                )
                if not isinstance(subnode, WikiNode):
                    # Failed to find the normal subpage section
                    subnode = get_subpage_section(
                        wxr.wtp.title or "MISSING_TITLE",
                        TRANSLATIONS_TITLE,
                        [TRANSLATIONS_TITLE],
                    )
                if isinstance(subnode, WikiNode):
                    parse_translations(data, subnode)
                return ""
            if name in (
                "c",
                "C",
                "categorize",
                "cat",
                "catlangname",
                "topics",
                "top",
                "qualifier",
                "cln",
            ):
                # These are expanded in the default way
                return None
            if name == "trans-top":
                # XXX capture id from trans-top?  Capture sense here
                # instead of trying to parse it from expanded content?
                sense_parts = []
                sense = ht.get(1) or None
                return None
            if name in (
                "trans-bottom",
                "trans-mid",
                "checktrans-mid",
                "checktrans-bottom",
            ):
                return None
            if name in ("checktrans-top", "trans-top-also"):
                # XXX capture trans-top-also?
                sense_parts = []
                sense = None
                return ""
            wxr.wtp.error(
                "UNIMPLEMENTED parse_translation_template: {} {}".format(
                    name, ht
                ),
                sortid="page/2517",
            )
            return ""

        wxr.wtp.expand(
            wxr.wtp.node_to_wikitext(node), template_fn=template_fn
        )

    def parse_translation_recurse(xlatnode: WikiNode) -> None:
        """Walks the children of ``xlatnode``, parsing translation items
        and collecting the text that makes up the current sense."""
        nonlocal sense
        nonlocal sense_parts
        for node in xlatnode.children:
            if isinstance(node, str):
                if sense:
                    if not node.isspace():
                        wxr.wtp.debug(
                            "skipping string in the middle of "
                            "translations: {}".format(node),
                            sortid="page/2530",
                        )
                    continue
                # Add a part to the sense
                sense_parts.append(node)
                sense = None
                continue
            assert isinstance(node, WikiNode)
            kind = node.kind
            if kind == NodeKind.LIST:
                for item in node.children:
                    if not isinstance(item, WikiNode):
                        continue
                    if item.kind != NodeKind.LIST_ITEM:
                        continue
                    if item.sarg == ":":
                        continue
                    parse_translation_item(item.children)
            elif kind == NodeKind.LIST_ITEM and node.sarg == ":":
                # Silently skip list items that are just indented; these
                # are used for text between translations, such as indicating
                # translations that need to be checked.
                pass
            elif kind == NodeKind.TEMPLATE:
                parse_translation_template(node)
            elif kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
                NodeKind.TABLE_CELL,
            ):
                parse_translation_recurse(node)
            elif kind == NodeKind.HTML:
                if node.attrs.get("class") == "NavFrame":
                    # Reset ``sense_parts`` (and force recomputing
                    # by clearing ``sense``) as each NavFrame specifies
                    # its own sense.  This helps eliminate garbage coming
                    # from text at the beginning at the translations
                    # section.
                    sense_parts = []
                    sense = None
                parse_translation_recurse(node)
            elif kind in LEVEL_KINDS:
                # Sub-levels will be recursed elsewhere
                pass
            elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
                parse_translation_recurse(node)
            elif kind == NodeKind.PREFORMATTED:
                # Reported as a debug message rather than printed, so
                # that nothing unexpected is written to stdout.
                wxr.wtp.debug(
                    "parse_translation_recurse: PREFORMATTED: {}".format(
                        node
                    ),
                    sortid="page/3283",
                )
            elif kind == NodeKind.LINK:
                arg0 = node.largs[0]
                # Kludge: I've seen occasional normal links to translation
                # subpages from main pages (e.g., language/English/Noun
                # in July 2021) instead of the normal
                # {{see translation subpage|...}} template.  This should
                # handle them.  Note: must be careful not to read other
                # links, particularly things like in "human being":
                # "a human being -- see [[man/translations]]" (group title)
                if (
                    isinstance(arg0, (list, tuple))
                    and arg0
                    and isinstance(arg0[0], str)
                    and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
                    and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
                    == wxr.wtp.title
                ):
                    wxr.wtp.debug(
                        "translations subpage link found on main "
                        "page instead "
                        "of normal {{see translation subpage|...}}",
                        sortid="page/2595",
                    )
                    sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if sub.lower() in POS_TITLES:
                        seq = [
                            language,
                            sub,
                            TRANSLATIONS_TITLE,
                        ]
                        subnode = get_subpage_section(
                            wxr.wtp.title,
                            TRANSLATIONS_TITLE,
                            seq,
                        )
                        if isinstance(subnode, WikiNode):
                            parse_translations(data, subnode)
                    else:
                        wxr.wtp.error(
                            "/translations link outside part-of-speech",
                            sortid="page/3324",
                        )

                if (
                    len(arg0) >= 1
                    and isinstance(arg0[0], str)
                    and not arg0[0].lower().startswith("category:")
                ):
                    # The displayed link text contributes to the sense
                    for x in node.largs[-1]:
                        if isinstance(x, str):
                            sense_parts.append(x)
                        else:
                            parse_translation_recurse(x)
            elif not sense:
                sense_parts.append(node)
            else:
                wxr.wtp.debug(
                    "skipping text between translation items/senses: "
                    "{}".format(node),
                    sortid="page/2621",
                )

    # Main code of parse_translations().  We want ``sense`` to be assigned
    # regardless of recursion levels, and thus the code is structured
    # to define at this level and recurse in parse_translation_recurse().
    parse_translation_recurse(xlatnode)
def parse_etymology(data: WordData, node: WikiNode) -> None:
    """Parses an etymology section.

    Stores the cleaned section text under ``etymology_text``, the
    templates captured during cleaning under ``etymology_templates`` and
    the results of ``zh-x``/``zh-q`` templates found before the first
    subsection under ``etymology_examples``.
    """
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)

    captured: list[TemplateData] = []

    # Nesting depth inside templates that we want to ignore (i.e., not
    # capture); templates are only captured while this is zero.
    ignore_count = 0

    def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        """Called before a template is expanded."""
        nonlocal ignore_count
        if is_panel_template(wxr, name) or name in ("zh-x", "zh-q"):
            return ""
        if re.match(ignored_etymology_templates_re, name):
            ignore_count += 1
        return None

    def etym_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> None:
        """Called after a template has been expanded."""
        nonlocal ignore_count
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
        elif re.match(ignored_etymology_templates_re, name):
            ignore_count -= 1
        elif ignore_count == 0:
            cleaned_args = clean_template_args(wxr, ht)
            cleaned_expansion = clean_node(wxr, None, expansion)
            captured.append(
                {
                    "name": name,
                    "args": cleaned_args,
                    "expansion": cleaned_expansion,
                }
            )
        return None

    # Leave out any subsections
    own_content = [
        child
        for child in node.children
        if not (isinstance(child, WikiNode) and child.kind in LEVEL_KINDS)
    ]
    # Convert to text, also capturing templates using post_template_fn
    raw_text = clean_node(
        wxr,
        None,
        own_content,
        template_fn=etym_template_fn,
        post_template_fn=etym_post_template_fn,
    )
    # remove ":" indent wikitext before zh-x template
    text = raw_text.strip(": \n")
    # Save the collected information.
    if text:
        data["etymology_text"] = text
    if captured:
        # Some etymology templates, like Template:root do not generate
        # text, so they should be added here.  Elsewhere, we check
        # for Template:root and add some text to the expansion to please
        # the validation.
        data["etymology_templates"] = captured

    for child_node in node.find_child_recursively(
        LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
    ):
        if child_node.kind in LEVEL_KIND_FLAGS:
            # Stop at the first subsection
            break
        if isinstance(
            child_node, TemplateNode
        ) and child_node.template_name in ("zh-x", "zh-q"):
            data.setdefault("etymology_examples", []).extend(
                extract_template_zh_x(
                    wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
                )
            )
def parse_descendants(
    data: WordData, node: WikiNode, is_proto_root_derived_section=False
) -> None:
    """Parses a Descendants section. Also used on Derived terms and
    Extensions sections when we are dealing with a root of a reconstructed
    language (i.e. is_proto_root_derived_section == True), as they use the
    same structure. In the latter case, The wiktionary convention is not to
    title the section as descendants since the immediate offspring of the
    roots are morphologically derived terms within the same proto-language.
    Still, since the rest of the section lists true descendants, we use the
    same function. Entries in the descendants list that are technically
    derived terms will have a field "tags": ["derived"].

    The collected entries are stored in (or appended to)
    ``data["descendants"]``."""
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)
    assert isinstance(is_proto_root_derived_section, bool)

    descendants = []

    # Most templates that are not in a LIST should be ignored as they only
    # add formatting, like "desc-top", "der-top3", etc. Any template in
    # unignored_non_list_templates actually contains relevant descendant
    # info. E.g. "CJKV" is often the only line at all in descendants
    # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would
    # be skipped if we didn't handle it specially as it is not part of a
    # LIST, and additionally is in panel_templates. There are probably more
    # such templates that should be added to this...
    unignored_non_list_templates: list[str] = ["CJKV"]

    def process_list_item_children(
        sarg: str, children: list[Union[str, WikiNode]]
    ) -> None:
        """Builds one descendant entry from the content of a list item
        and appends it to ``descendants``."""
        assert isinstance(sarg, str)
        assert isinstance(children, list)
        # The descendants section is a hierarchical bulleted list. sarg is
        # usually some number of "*" characters indicating the level of
        # indentation of the line, e.g. "***" indicates the line will be
        # thrice-indented. A bare ";" is used to indicate a subtitle-like
        # line with no indentation. ":" at the end of one or more "*"s is
        # used to indicate that the bullet will not be displayed.
        item_data: DescendantData = {"depth": sarg.count("*")}
        templates: list[TemplateData] = []
        is_derived = False

        # Counter for preventing the capture of templates when we are inside
        # templates that we want to ignore (i.e., not capture).
        ignore_count = 0

        def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            nonlocal ignore_count
            if (
                is_panel_template(wxr, name)
                and name not in unignored_non_list_templates
            ):
                return ""
            if re.match(ignored_descendants_templates_re, name):
                ignore_count += 1
            return None

        def desc_post_template_fn(
            name: str, ht: TemplateArgs, expansion: str
        ) -> None:
            nonlocal ignore_count
            nonlocal is_derived
            if name in wikipedia_templates:
                parse_wikipedia_template(wxr, data, ht)
                return None
            if re.match(ignored_descendants_templates_re, name):
                ignore_count -= 1
                return None
            if ignore_count == 0:
                ht = clean_template_args(wxr, ht)
                # If we're in a proto-root Derived terms or Extensions
                # section, and the current list item has a link template
                # to a term in the same proto-language, then we tag this
                # descendant entry with "derived".  The flag is only ever
                # set, never cleared, so that another template later in
                # the same item cannot undo the tagging.
                if (
                    is_proto_root_derived_section
                    and name in ("l", "link")
                    and "1" in ht
                    and ht["1"] == lang_code
                ):
                    is_derived = True
                expansion = clean_node(wxr, None, expansion)
                templates.append(
                    {"name": name, "args": ht, "expansion": expansion}
                )
            return None

        text = clean_node(
            wxr,
            None,
            children,
            template_fn=desc_template_fn,
            post_template_fn=desc_post_template_fn,
        )
        item_data["templates"] = templates
        item_data["text"] = text
        if is_derived:
            item_data["tags"] = ["derived"]
        descendants.append(item_data)

    def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]:
        """Yields (index, child) for the WikiNode children of ``node``."""
        for i, child in enumerate(node.children):
            if isinstance(child, WikiNode):
                yield (i, child)

    def get_sublist_index(list_item: WikiNode) -> Optional[int]:
        """Returns the index of the first LIST child, or None."""
        for i, child in node_children(list_item):
            if child.kind == NodeKind.LIST:
                return i
        return None

    def get_descendants(node: WikiNode) -> None:
        """Appends the data for every list item in every list in node
        to descendants."""
        for _, c in node_children(node):
            if (
                c.kind == NodeKind.TEMPLATE
                and c.largs
                and len(c.largs[0]) == 1
                and isinstance(c.largs[0][0], str)
                and c.largs[0][0] in unignored_non_list_templates
            ):
                # Some Descendants sections have no wikitext list. Rather,
                # the list is entirely generated by a single template (see
                # e.g. the use of {{CJKV}} in Chinese entries).
                process_list_item_children("", [c])
            elif c.kind in (NodeKind.HTML, NodeKind.LIST):
                # The Descendants sections for many languages feature
                # templates that generate html to add styling (e.g. using
                # multiple columns) to the list, so that the actual wikitext
                # list items are found within a <div>. We look within the
                # children of the html node for the actual list items.
                get_descendants(c)
            elif c.kind == NodeKind.LIST_ITEM:
                # If a LIST_ITEM has subitems in a sublist, usually its
                # last child is a LIST. However, sometimes after the LIST
                # there is one or more trailing LIST_ITEMs, like "\n" or
                # a reference template. If there is a sublist, we discard
                # everything after it.
                i = get_sublist_index(c)
                if i is not None:
                    process_list_item_children(c.sarg, c.children[:i])
                    get_descendants(c.children[i])  # type: ignore[arg-type]
                else:
                    process_list_item_children(c.sarg, c.children)

    # parse_descendants() actual work starts here
    get_descendants(node)

    # if e.g. on a PIE page, there may be both Derived terms and Extensions
    # sections, in which case this function will be called multiple times,
    # so we have to check if descendants exists first.
    if "descendants" in data:
        data["descendants"].extend(descendants)
    else:
        data["descendants"] = descendants
# NOTE(review): this listing comes from a coverage report; the leading
# numbers and the "↛ ..." remarks are report annotations and the original
# indentation is not shown, so the code below is kept exactly as listed.
3590 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3591 """This recurses into a subtree in the parse tree for a page."""
# ``pos`` is the part-of-speech in effect for this subtree; it is replaced
# when a title found in POS_TITLES is encountered and is passed on to the
# recursive call for that section.
3592 nonlocal etym_data
3593 nonlocal pos_data
3594 nonlocal inside_level_four
3596 redirect_list: list[str] = [] # for `zh-see` template
3598 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3599 """This is called for otherwise unprocessed parts of the page.
3600 We still expand them so that e.g. Category links get captured."""
3601 if name in wikipedia_templates: 3601 ↛ 3602line 3601 didn't jump to line 3602 because the condition on line 3601 was never true
3602 data = select_data()
3603 parse_wikipedia_template(wxr, data, ht)
3604 return None
3605 if is_panel_template(wxr, name): 3605 ↛ 3606line 3605 didn't jump to line 3606 because the condition on line 3605 was never true
3606 return ""
3607 return None
# Walk the direct children; anything that is not a WikiNode is skipped.
3609 for node in treenode.children:
3610 # print(node)
3611 if not isinstance(node, WikiNode):
3612 # print(" X{}".format(repr(node)[:40]))
3613 continue
3614 if isinstance(node, TemplateNode):
3615 if process_soft_redirect_template(wxr, node, redirect_list):
3616 continue
3617 elif node.template_name == "zh-forms": 3617 ↛ 3618line 3617 didn't jump to line 3618 because the condition on line 3617 was never true
3618 process_zh_forms_templates(wxr, node, base_data)
# Nodes that are not section levels are only cleaned (with
# skip_template_fn) so that their side effects on etym_data are kept;
# horizontal lines, lists and list items are skipped entirely.
3620 if node.kind not in LEVEL_KINDS:
3621 # XXX handle e.g. wikipedia links at the top of a language
3622 # XXX should at least capture "also" at top of page
3623 if node.kind in (
3624 NodeKind.HLINE,
3625 NodeKind.LIST,
3626 NodeKind.LIST_ITEM,
3627 ):
3628 continue
3629 # print(" UNEXPECTED: {}".format(node))
3630 # Clean the node to collect category links
3631 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3632 continue
# The section title is taken from node.sarg when it is set, otherwise
# from node.largs, and is lower-cased before being matched below.
3633 t = clean_node(
3634 wxr, etym_data, node.sarg if node.sarg else node.largs
3635 )
3636 t = t.lower()
3637 # XXX these counts were never implemented fully, and even this
3638 # gets discarded: Search STATISTICS_IMPLEMENTATION
3639 wxr.config.section_counts[t] += 1
3640 # print("PROCESS_CHILDREN: T:", repr(t))
3641 if t in IGNORED_TITLES: 3641 ↛ 3642line 3641 didn't jump to line 3642 because the condition on line 3641 was never true
3642 pass
3643 elif t.startswith(PRONUNCIATION_TITLE): 3643 ↛ 3648line 3643 didn't jump to line 3648 because the condition on line 3643 was never true
3644 # Chinese Pronunciation section kludge; we demote these to
3645 # be level 4 instead of 3 so that they're part of a larger
3646 # etymology hierarchy; usually the data here is empty and
3647 # acts as an inbetween between POS and Etymology data
3648 inside_level_four = True
3649 if t.startswith(PRONUNCIATION_TITLE + " "):
3650 # Pronunciation 1, etc, are used in Chinese Glyphs,
3651 # and each of them may have senses under Definition
3652 push_level_four_section()
3653 wxr.wtp.start_subsection(None)
3654 if wxr.config.capture_pronunciation:
3655 data = select_data()
3656 parse_pronunciation(
3657 wxr,
3658 node,
3659 data,
3660 etym_data,
3661 have_etym,
3662 base_data,
3663 lang_code,
3664 )
3665 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3666 push_etym()
3667 wxr.wtp.start_subsection(None)
3668 if wxr.config.capture_etymologies: 3668 ↛ 3733line 3668 didn't jump to line 3733 because the condition on line 3668 was always true
# A trailing number in the title (e.g. "etymology 2") is recorded as
# the etymology number.
3669 m = re.search(r"\s(\d+)$", t)
3670 if m: 3670 ↛ 3671line 3670 didn't jump to line 3671 because the condition on line 3670 was never true
3671 etym_data["etymology_number"] = int(m.group(1))
3672 parse_etymology(etym_data, node)
3673 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 3673 ↛ 3674line 3673 didn't jump to line 3674 because the condition on line 3673 was never true
3674 data = select_data()
3675 parse_descendants(data, node)
3676 elif ( 3676 ↛ 3682line 3676 didn't jump to line 3682 because the condition on line 3676 was never true
3677 t in PROTO_ROOT_DERIVED_TITLES
3678 and pos == "root"
3679 and is_reconstruction
3680 and wxr.config.capture_descendants
3681 ):
3682 data = select_data()
3683 parse_descendants(data, node, True)
3684 elif t == TRANSLATIONS_TITLE:
3685 data = select_data()
3686 parse_translations(data, node)
3687 elif t in INFLECTION_TITLES: 3687 ↛ 3688line 3687 didn't jump to line 3688 because the condition on line 3687 was never true
3688 parse_inflection(node, t, pos)
3689 else:
# Strip trailing numeric words from the title (but never the only word)
# before looking it up as a part-of-speech or linkage title.
3690 lst = t.split()
3691 while len(lst) > 1 and lst[-1].isdigit(): 3691 ↛ 3692line 3691 didn't jump to line 3692 because the condition on line 3691 was never true
3692 lst = lst[:-1]
3693 t_no_number = " ".join(lst).lower()
3694 if t_no_number in POS_TITLES:
3695 push_pos()
3696 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3697 pos = dt["pos"] or "MISSING_POS"
3698 wxr.wtp.start_subsection(t)
3699 if "debug" in dt: 3699 ↛ 3700line 3699 didn't jump to line 3700 because the condition on line 3699 was never true
3700 wxr.wtp.debug(
3701 "{} in section {}".format(dt["debug"], t),
3702 sortid="page/2755",
3703 )
3704 if "warning" in dt: 3704 ↛ 3705line 3704 didn't jump to line 3705 because the condition on line 3704 was never true
3705 wxr.wtp.warning(
3706 "{} in section {}".format(dt["warning"], t),
3707 sortid="page/2759",
3708 )
3709 if "error" in dt: 3709 ↛ 3710line 3709 didn't jump to line 3710 because the condition on line 3709 was never true
3710 wxr.wtp.error(
3711 "{} in section {}".format(dt["error"], t),
3712 sortid="page/2763",
3713 )
3714 # Parse word senses for the part-of-speech
3715 parse_part_of_speech(node, pos)
3716 if "tags" in dt: 3716 ↛ 3717line 3716 didn't jump to line 3717 because the condition on line 3716 was never true
3717 for pdata in pos_datas:
3718 data_extend(pdata, "tags", dt["tags"])
3719 elif t_no_number in LINKAGE_TITLES: 3719 ↛ 3723line 3719 didn't jump to line 3723 because the condition on line 3719 was always true
3720 rel = LINKAGE_TITLES[t_no_number]
3721 data = select_data()
3722 parse_linkage(data, rel, node)
3723 elif t_no_number == COMPOUNDS_TITLE:
3724 data = select_data()
3725 if wxr.config.capture_compounds:
3726 parse_linkage(data, "derived", node)
3728 # XXX parse interesting templates also from other sections. E.g.,
3729 # {{Letter|...}} in ===See also===
3730 # Also <gallery>
3732 # Recurse to children of this node, processing subtitles therein
# The title is kept on ``stack`` for the duration of the recursive call.
3733 stack.append(t)
3734 process_children(node, pos)
3735 stack.pop()
# Soft redirects collected in redirect_list are stored in pos_data when
# it is non-empty; otherwise a copy of base_data with a single "no-gloss"
# sense is appended to page_datas.
3737 if len(redirect_list) > 0:
3738 if len(pos_data) > 0:
3739 pos_data["redirects"] = redirect_list
3740 if "pos" not in pos_data: 3740 ↛ 3741line 3740 didn't jump to line 3741 because the condition on line 3740 was never true
3741 pos_data["pos"] = "soft-redirect"
3742 else:
3743 new_page_data = copy.deepcopy(base_data)
3744 new_page_data["redirects"] = redirect_list
3745 if "pos" not in new_page_data: 3745 ↛ 3747line 3745 didn't jump to line 3747 because the condition on line 3745 was always true
3746 new_page_data["pos"] = "soft-redirect"
3747 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3748 page_datas.append(new_page_data)
def extract_examples(
    others: list[WikiNode], sense_base: SenseData
) -> list[ExampleData]:
    """Parses through a list of definitions and quotes to find examples.

    Returns a list of example dicts to be added to sense data. Adds
    meta-data, mostly categories, into sense_base.

    This function is nested inside parse_language() and reads ``wxr`` and
    ``lang_code`` from that enclosing scope; it also uses the template
    tables (``usex_templates``, ``quotation_templates``,
    ``taxonomy_templates``, ``template_linkages``) and
    ``example_splitter_re`` defined elsewhere in the file.

    Only sub-lists whose ``sarg`` ends in ":" or "*" are considered.  Each
    list item is first offered to extract_example_list_item(); if that
    returns anything, its result is used as-is.  Otherwise the item is
    rendered to text and split heuristically into the example text, an
    English translation, a romanization, a reference, a note and (for
    Japanese) ruby annotations.
    """
    assert isinstance(others, list)
    examples: list[ExampleData] = []

    for sub in others:
        if not sub.sarg.endswith((":", "*")):
            continue
        for item in sub.children:
            if not isinstance(item, WikiNode):
                continue
            if item.kind != NodeKind.LIST_ITEM:
                continue
            usex_type = None
            example_template_args = []
            example_template_names = []
            taxons = set()

            # Bypass this function when parsing Chinese, Japanese and
            # quotation templates.
            new_example_lists = extract_example_list_item(
                wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
            )
            if len(new_example_lists) > 0:
                examples.extend(new_example_lists)
                continue

            def usex_template_fn(
                name: str, ht: TemplateArgs
            ) -> Optional[str]:
                # Template callback used while rendering the item: records
                # which kind of example template was seen (and its
                # arguments), collects taxon names, and blanks out panel
                # and linkage templates.
                nonlocal usex_type
                if is_panel_template(wxr, name):
                    return ""
                if name in usex_templates:
                    usex_type = "example"
                    example_template_args.append(ht)
                    example_template_names.append(name)
                elif name in quotation_templates:
                    usex_type = "quotation"
                elif name in taxonomy_templates:
                    taxons.update(ht.get(1, "").split())
                for prefix in template_linkages:
                    if re.search(
                        r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
                    ):
                        return ""
                return None

            # bookmark
            ruby: list[tuple[str, str]] = []
            contents = item.children
            if lang_code == "ja":
                # Capture ruby contents if this is a Japanese language
                # example.
                # Drop a leading whitespace-only string child.  The check
                # must look at the first child: ``contents`` itself is the
                # list of children and is never a str, so testing the list
                # made this branch unreachable.
                if (
                    contents
                    and isinstance(contents[0], str)
                    and re.match(r"\s*$", contents[0])
                ):
                    contents = contents[1:]
                exp = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(contents),
                    # post_template_fn=head_post_template_fn,
                    expand_all=True,
                )
                rub, rest = extract_ruby(wxr, exp.children)
                if rub:
                    ruby.extend(rub)
                    contents = rest
            subtext = clean_node(
                wxr, sense_base, contents, template_fn=usex_template_fn
            )

            # Taxon names collected by the template callback are passed to
            # classify_desc() through its ``accepted`` argument.
            frozen_taxons = frozenset(taxons)
            classify_desc2 = partial(classify_desc, accepted=frozen_taxons)

            subtext = re.sub(
                r"\s*\(please add an English "
                r"translation of this "
                r"(example|usage example|quote)\)",
                "",
                subtext,
            ).strip()
            subtext = re.sub(r"\^\([^)]*\)", "", subtext)
            subtext = re.sub(r"\s*[―—]+$", "", subtext)

            lines = subtext.splitlines()

            lines = [re.sub(r"^[#:*]*", "", x).strip() for x in lines]
            # Discard inline linkage lines and the "more quotations" pointer.
            lines = [
                x
                for x in lines
                if not re.match(
                    r"(Synonyms: |Antonyms: |Hyponyms: |"
                    r"Synonym: |Antonym: |Hyponym: |"
                    r"Hypernyms: |Derived terms: |"
                    r"Related terms: |"
                    r"Hypernym: |Derived term: |"
                    r"Coordinate terms:|"
                    r"Related term: |"
                    r"For more quotations using )",
                    x,
                )
            ]
            tr = ""
            ref = ""
            roman = ""
            if len(lines) == 1 and lang_code != "en":
                parts = example_splitter_re.split(lines[0])
                if (
                    len(parts) > 2
                    and len(example_template_args) == 1
                    and any(
                        ("―" in s) or ("—" in s)
                        for s in example_template_args[0].values()
                    )
                ):
                    # The template arguments themselves contain dashes, so
                    # the naive split is unreliable; try to realign it with
                    # the arguments.
                    if nparts := synch_splits_with_args(
                        lines[0], example_template_args[0]
                    ):
                        parts = nparts
                if (
                    len(example_template_args) == 1
                    and "lit" in example_template_args[0]
                ):
                    # ugly brute-force kludge in case there's a lit= arg
                    literally = example_template_args[0].get("lit", "")
                    if literally:
                        literally = (
                            " (literally, “"
                            + clean_value(wxr, literally)
                            + "”)"
                        )
                else:
                    literally = ""
                if (
                    len(example_template_args) == 1
                    and len(parts) == 2
                    and len(example_template_args[0])
                    - (
                        # horrible kludge to ignore these arguments
                        # when calculating how many there are
                        sum(
                            s in example_template_args[0]
                            for s in (
                                "lit",  # generates text, but we handle it
                                "inline",
                                "noenum",
                                "nocat",
                                "sort",
                            )
                        )
                    )
                    == 3
                    and clean_value(
                        wxr, example_template_args[0].get(2, "")
                    )
                    == parts[0].strip()
                    and clean_value(
                        wxr,
                        (
                            example_template_args[0].get(3)
                            or example_template_args[0].get("translation")
                            or example_template_args[0].get("t", "")
                        )
                        + literally,  # in case there's a lit= argument
                    )
                    == parts[1].strip()
                ):
                    # {{exampletemplate|ex|Foo bar baz|English translation}}
                    # is a pretty reliable 'heuristic', so we use it here
                    # before the others. To be extra sure the template
                    # doesn't do anything weird, we compare the arguments
                    # and the output to each other.
                    lines = [parts[0].strip()]
                    tr = parts[1].strip()
                elif (
                    len(parts) == 2
                    and classify_desc2(parts[1]) in ENGLISH_TEXTS
                ):
                    # These other branches just do some simple heuristics w/
                    # the expanded output of the template (if applicable).
                    lines = [parts[0].strip()]
                    tr = parts[1].strip()
                elif (
                    len(parts) == 3
                    and classify_desc2(parts[1])
                    in ("romanization", "english")
                    and classify_desc2(parts[2]) in ENGLISH_TEXTS
                ):
                    lines = [parts[0].strip()]
                    roman = parts[1].strip()
                    tr = parts[2].strip()
                else:
                    parts = re.split(r"\s+-\s+", lines[0])
                    if (
                        len(parts) == 2
                        and classify_desc2(parts[1]) in ENGLISH_TEXTS
                    ):
                        lines = [parts[0].strip()]
                        tr = parts[1].strip()
            elif len(lines) > 1:
                if any(
                    re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
                ) and not (len(example_template_names) == 1):
                    # Leading lines up to the first one ending in "]", a
                    # digit, ":" or ")" are taken as the reference.
                    refs: list[str] = []
                    for i in range(len(lines)):
                        if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
                            break
                        refs.append(lines[i].strip())
                        if re.search(r"[]\d:)]\s*$", lines[i]):
                            break
                    ref = " ".join(refs)
                    lines = lines[i + 1 :]
                    if (
                        lang_code != "en"
                        and len(lines) >= 2
                        and classify_desc2(lines[-1]) in ENGLISH_TEXTS
                    ):
                        # Extend the translation backwards over trailing
                        # English-looking lines, always leaving at least
                        # one line as the example text.
                        i = len(lines) - 1
                        while (
                            i > 1
                            and classify_desc2(lines[i - 1])
                            in ENGLISH_TEXTS
                        ):
                            i -= 1
                        tr = "\n".join(lines[i:])
                        lines = lines[:i]
                        if len(lines) >= 2:
                            if classify_desc2(lines[-1]) == "romanization":
                                roman = lines[-1].strip()
                                lines = lines[:-1]

                elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
                    ref = lines[0]
                    lines = lines[1:]
                elif lang_code != "en" and len(lines) == 2:
                    cls1 = classify_desc2(lines[0])
                    cls2 = classify_desc2(lines[1])
                    if cls2 in ENGLISH_TEXTS and cls1 != "english":
                        tr = lines[1]
                        lines = [lines[0]]
                    elif cls1 in ENGLISH_TEXTS and cls2 != "english":
                        tr = lines[0]
                        lines = [lines[1]]
                    elif (
                        re.match(r"^[#*]*:+", lines[1])
                        and classify_desc2(
                            re.sub(r"^[#*:]+\s*", "", lines[1])
                        )
                        in ENGLISH_TEXTS
                    ):
                        tr = re.sub(r"^[#*:]+\s*", "", lines[1])
                        lines = [lines[0]]
                    elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
                        # Both were classified as English, but
                        # presumably one is not.  Assume first is
                        # non-English, as that seems more common.
                        tr = lines[1]
                        lines = [lines[0]]
                elif (
                    usex_type != "quotation"
                    and lang_code != "en"
                    and len(lines) == 3
                ):
                    cls1 = classify_desc2(lines[0])
                    cls2 = classify_desc2(lines[1])
                    cls3 = classify_desc2(lines[2])
                    if (
                        cls3 == "english"
                        and cls2 in ("english", "romanization")
                        and cls1 != "english"
                    ):
                        tr = lines[2].strip()
                        roman = lines[1].strip()
                        lines = [lines[0].strip()]
                elif (
                    usex_type == "quotation"
                    and lang_code != "en"
                    and len(lines) > 2
                ):
                    if re.match(r"^[#*]*:+\s*$", lines[1]):
                        ref = lines[0]
                        lines = lines[2:]
                    cls1 = classify_desc2(lines[-1])
                    if cls1 == "english":
                        i = len(lines) - 1
                        # Membership test, matching the equivalent loop in
                        # the reference branch above.  The previous
                        # ``== ENGLISH_TEXTS`` compared a classification
                        # string with the whole collection and was never
                        # true, so multi-line translations were cut down to
                        # their last line.
                        while (
                            i > 1
                            and classify_desc2(lines[i - 1])
                            in ENGLISH_TEXTS
                        ):
                            i -= 1
                        tr = "\n".join(lines[i:])
                        lines = lines[:i]

            # Normalize whitespace, list markers and "[…]" in every piece.
            roman = re.sub(r"[ \t\r]+", " ", roman).strip()
            roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
            tr = re.sub(r"^[#*:]+\s*", "", tr)
            tr = re.sub(r"[ \t\r]+", " ", tr).strip()
            tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
            ref = re.sub(r"^[#*:]+\s*", "", ref)
            ref = re.sub(
                r", (volume |number |page )?“?"
                r"\(please specify ([^)]|\(s\))*\)”?|"
                ", text here$",
                "",
                ref,
            )
            ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
            lines = [re.sub(r"^[#*:]+\s*", "", x) for x in lines]
            subtext = "\n".join(x for x in lines if x)
            if not tr and lang_code != "en":
                # Last resort: a trailing parenthesized English phrase, or
                # a dash-separated "text ― translation" on the first line.
                m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
                if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
                    tr = m.group(2)
                    subtext = subtext[: m.start()] + m.group(1)
                elif lines:
                    parts = re.split(r"\s*[―—]+\s*", lines[0])
                    if (
                        len(parts) == 2
                        and classify_desc2(parts[1]) in ENGLISH_TEXTS
                    ):
                        subtext = parts[0].strip()
                        tr = parts[1].strip()
            subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
            subtext = re.sub(
                r"(please add an English translation of "
                r"this (quote|usage example))",
                "",
                subtext,
            )
            subtext = re.sub(
                r"\s*→New International Version " "translation$",
                "",
                subtext,
            )  # e.g. pis/Tok Pisin (Bible)
            subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
            subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
            note = None
            m = re.match(r"^\(([^)]*)\):\s+", subtext)
            if (
                m is not None
                and lang_code != "en"
                and (
                    m.group(1).startswith("with ")
                    or classify_desc2(m.group(1)) == "english"
                )
            ):
                note = m.group(1)
                subtext = subtext[m.end() :]
            ref = re.sub(r"\s*\(→ISBN\)", "", ref)
            ref = re.sub(r",\s*→ISBN", "", ref)
            ref = ref.strip()
            if ref.endswith(":") or ref.endswith(","):
                ref = ref[:-1].strip()
            ref = re.sub(r"\s+,\s+", ", ", ref)
            ref = re.sub(r"\s+", " ", ref)
            if ref and not subtext:
                # Only a reference was found; treat it as the example text.
                subtext = ref
                ref = ""
            if subtext:
                dt: ExampleData = {"text": subtext}
                if ref:
                    dt["ref"] = ref
                if tr:
                    dt["english"] = tr
                if usex_type:
                    dt["type"] = usex_type
                if note:
                    dt["note"] = note
                if roman:
                    dt["roman"] = roman
                if ruby:
                    dt["ruby"] = ruby
                examples.append(dt)

    return examples
4144 # Main code of parse_language()
4145 # Process the section
4146 stack.append(language)
4147 process_children(langnode, None)
4148 stack.pop()
4150 # Finalize word entries
4151 push_etym()
4152 ret = []
4153 for data in page_datas:
4154 merge_base(data, base_data)
4155 ret.append(data)
4157 # Copy all tags to word senses
4158 for data in ret:
4159 if "senses" not in data: 4159 ↛ 4160line 4159 didn't jump to line 4160 because the condition on line 4159 was never true
4160 continue
4161 # WordData should not have a 'tags' field, but if it does, it's
4162 # deleted and its contents removed and placed in each sense;
4163 # that's why the type ignores.
4164 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4165 if "tags" in data: 4165 ↛ 4166line 4165 didn't jump to line 4166 because the condition on line 4165 was never true
4166 del data["tags"] # type: ignore[typeddict-item]
4167 for sense in data["senses"]:
4168 data_extend(sense, "tags", tags)
4170 return ret
def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Helper function for parsing {{wikipedia|...}} and related templates.

    The page name is taken from the first positional argument, falling
    back to the current page title and then to the placeholder
    "MISSING_PAGE_TITLE".  If the template has a ``lang`` argument, the
    recorded value is ``<lang>:<page>``; otherwise it is just the page
    name.  The value is handed to data_append() under the "wikipedia" key
    of ``data``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)
    lang_prefix = clean_node(wxr, data, ht.get("lang", ()))
    target = clean_node(wxr, data, ht.get(1, ()))
    if not target:
        target = wxr.wtp.title or "MISSING_PAGE_TITLE"
    entry = lang_prefix + ":" + target if lang_prefix else target
    data_append(data, "wikipedia", entry)
def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Parses a template that occurs on the top-level in a page, before any
    language subtitles.

    The node is run through clean_node() with a template callback that
    records Wikipedia links and Wikidata ids into ``data`` and replaces
    every other template with an empty string.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Templates that are recognized but currently produce no data:
    #   reconstruction
    #   see also       XXX capture
    #   cardinalbox    XXX capture
    #   character info XXX capture
    #   commonscat     XXX capture link to Wikimedia commons
    #   wrongtitle     XXX this should be captured to replace page title
    #                  with the correct title.  E.g. ⿰亻革家
    dropped_names = frozenset(
        (
            "reconstruction",
            "see also",
            "cardinalbox",
            "character info",
            "commonscat",
            "wrongtitle",
        )
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        # "also" (any letter case): XXX shows related words that might
        # really have been the intended word, capture them
        if name in dropped_names or name.lower() == "also":
            return ""
        if name == "wikidata":
            wikidata_id = clean_node(wxr, data, ht.get(1, ()))
            if wikidata_id.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", wikidata_id)
            return ""
        wxr.wtp.debug(
            f"UNIMPLEMENTED top-level template: {name} {ht}",
            sortid="page/2870",
        )
        return ""

    clean_node(wxr, None, [node], template_fn=top_template_fn)
def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
    that are next to each other.

    Returns the page text with every ``==Heading==`` line rewritten to the
    level chosen for its title; text between headings is left untouched.
    """

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    # Known lowercase PoS names are in part_of_speech_map
    # Known lowercase linkage section names are in linkage_map

    pieces = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)[ \t]*(==+)[ \t]*$", text
    )

    # The pattern has four capture groups, so after the leading text every
    # heading contributes five entries: the four groups plus the text that
    # follows the heading.
    stride = 4 + 1
    out = [pieces[0]]
    level = None
    merge_with_previous = False  # When combining etymology sections
    for start in range(1, len(pieces), stride):
        left, raw_title, _inner, right, body = pieces[start : start + stride]
        # remove Wikilinks in title
        title = re.sub(r"^\[\[", "", raw_title)
        title = re.sub(r"\]\]$", "", title)
        prev_level = level
        level = len(left)
        if level != len(right):
            wxr.wtp.debug(
                f"subtitle has unbalanced levels: "
                f"{title!r} has {left} on the left and {right} on the right",
                sortid="page/2904",
            )
        lowered = title.lower()
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    f"subtitle has language name {title} at level {level}",
                    sortid="page/2911",
                )
            level = 2
        elif lowered.startswith(tuple(ETYMOLOGY_TITLES)):
            if level > 3:
                wxr.wtp.debug(
                    f"etymology section {title} at level {level}",
                    sortid="page/2917",
                )
            if prev_level == 3:
                # Two etymology (Glyph Origin + Etymology) sections
                # cheek-to-cheek.
                merge_with_previous = True
                # Modify the title of previous ("Glyph Origin") section, in
                # case we have a meaningful title like "Etymology 1"
                marks = "=" * level
                out[-2] = marks + title + marks
            level = 3
        elif lowered.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lowered in POS_TITLES:
            level = 5
        else:
            # Translations, linkages, compounds, inflections, descendants,
            # proto-root derived terms, ignored titles and anything
            # unrecognized all share the innermost level.
            level = 6
        if merge_with_previous:
            # The heading itself is dropped; only its text is kept.
            merge_with_previous = False
        else:
            marks = "=" * level
            out.append(marks + title + marks)
        out.append(body)

    return "".join(out)
def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse the wikitext of one page and return the word entries found.

    Pages whose title ends in "/" + TRANSLATIONS_TITLE yield an empty list.
    Otherwise the subtitle hierarchy is normalized, the page is parsed,
    every level-2 (language) section is passed to parse_language(), data
    from top-level templates is merged into each entry, and raw tags are
    separated from valid ones before the entries are returned.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages.  They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    for tag_pattern in (
        r"(?si)<(/)?noinclude\s*>",
        r"(?si)<(/)?onlyinclude\s*>",
        r"(?si)<(/)?includeonly\s*>",
    ):
        text = re.sub(tag_pattern, "", text)

    # Fix up the subtitle hierarchy.  There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun.  Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )

    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for langnode in tree.children:
        if not isinstance(langnode, WikiNode):
            continue
        node_kind = langnode.kind
        if node_kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, langnode, top_data)
            continue
        if node_kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if node_kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {langnode}", sortid="page/3014"
            )
            continue
        lang = clean_node(wxr, None, langnode.sarg or langnode.largs)
        lang_code = name_to_code(lang, "en")
        if lang_code == "":
            wxr.wtp.debug(
                f"unrecognized language name: {lang}", sortid="page/3019"
            )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang)

        # Collect all words from the page.
        entries = parse_language(wxr, langnode, lang, lang_code)

        # Propagate fields resulting from top-level templates to this
        # part-of-speech.
        for entry in entries:
            if "lang" not in entry:
                wxr.wtp.debug(
                    f"internal error -- no lang in data: {entry}",
                    sortid="page/3034",
                )
                continue
            for key, values in top_data.items():
                assert isinstance(values, (list, tuple))
                data_extend(entry, key, values)
            by_lang[entry["lang"]].append(entry)

    # XXX this code is clearly out of date.  There is no longer a "conjugation"
    # field.  FIX OR REMOVE.
    # Do some post-processing on the words.  For example, we may distribute
    # conjugation information to all the words.
    ret = [entry for lang_entries in by_lang.values() for entry in lang_entries]

    for entry in ret:
        entry_word = entry["word"]
        if entry_word != word:
            if word.startswith("Unsupported titles/"):
                wxr.wtp.debug(
                    f"UNSUPPORTED TITLE: '{word}' -> '{entry_word}'",
                    sortid="20231101/3578page.py",
                )
            else:
                wxr.wtp.debug(
                    f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{entry_word}'",
                    sortid="20231101/3582page.py",
                )
            entry["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, entry)  # type:ignore[arg-type]
    return ret
def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Move tags that are not in ``valid_tags`` from "tags" to "raw_tags".

    ``data`` is modified in place.  Any value that is a non-empty list
    whose first element is a dict is processed recursively, element by
    element.  After processing, "tags" holds only valid tags (the key is
    removed if none remain) and "raw_tags" holds the pre-existing raw tags
    followed by the rejected ones.  A non-dict ``data`` is reported through
    ``wxr.wtp.error`` and otherwise ignored.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return
    accepted: list[str] = []
    # When "raw_tags" already exists this is the very list stored in
    # ``data``, so rejected tags are appended to it directly.
    rejected: list[str] = data.get("raw_tags", [])
    for key, value in data.items():
        if key == "tags":
            for tag in value:
                if tag in valid_tags:
                    accepted.append(tag)
                else:
                    rejected.append(tag)
        if isinstance(value, list) and len(value) > 0:
            if isinstance(value[0], dict):
                for child in value:
                    recursively_separate_raw_tags(wxr, child)
    if accepted:
        data["tags"] = accepted
    elif "tags" in data:
        del data["tags"]
    if rejected:
        data["raw_tags"] = rejected
def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect redirect targets from a soft-redirect template.

    Handles "zh-see" (first positional parameter) and "ja-see" /
    "ja-see-kango" (every integer-keyed parameter).  Each non-empty
    cleaned target is appended to ``redirect_pages``.

    Returns True if the template is one of these soft-redirect templates,
    False otherwise.
    """
    name = template_node.template_name
    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        target = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if target != "":
            redirect_pages.append(target)
        return True
    if name in ["ja-see", "ja-see-kango"]:
        # https://en.wiktionary.org/wiki/Template:ja-see
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, int):
                continue
            target = clean_node(wxr, None, value)
            if target != "":
                redirect_pages.append(target)
        return True
    return False
def process_zh_forms_templates(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    base_data: WordData,
) -> None:
    """Extract forms and the literal meaning from a zh-forms template.

    Only string-keyed parameters are examined:

    * ``s``, ``s2``, ... add a form tagged "Simplified Chinese";
    * ``t2``, ``t3``, ... (a digit is required) add a form tagged
      "Traditional Chinese";
    * ``alt`` is split on "," into forms; within each form, anything
      after a "-" is stored as raw tags;
    * ``lit`` sets ``base_data["literal_meaning"]`` when non-empty.

    Forms are appended to ``base_data["forms"]``; the key is removed again
    if the list ends up empty.
    """
    # https://en.wiktionary.org/wiki/Template:zh-forms
    forms = base_data.setdefault("forms", [])
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if re.fullmatch(r"s\d*", p_name):
            script_tag = "Simplified Chinese"
        elif re.fullmatch(r"t\d+", p_name):
            script_tag = "Traditional Chinese"
        else:
            script_tag = None

        if script_tag is not None:
            form_data: FormData = {
                "form": clean_node(wxr, None, p_value),
                "tags": [script_tag],
            }
            if len(form_data["form"]) > 0:
                forms.append(form_data)
        elif p_name == "alt":
            for form_text in clean_node(wxr, None, p_value).split(","):
                form, *qualifiers = form_text.split("-")
                form_data = {"form": form}
                if qualifiers:
                    # pronunciation data could be added after "-"
                    # see https://en.wiktionary.org/wiki/新婦
                    form_data["raw_tags"] = qualifiers
                if len(form) > 0:
                    forms.append(form_data)
        elif p_name == "lit":
            lit = clean_node(wxr, None, p_value)
            if lit != "":
                base_data["literal_meaning"] = lit
    if len(base_data["forms"]) == 0:
        del base_data["forms"]