Coverage for src/wiktextract/extractor/en/page.py: 73% (1974 statements)
coverage.py v7.9.2, created at 2025-07-04 08:12 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8import sys
9from collections import defaultdict
10from functools import partial
11from typing import (
12 TYPE_CHECKING,
13 Any,
14 Iterable,
15 Iterator,
16 Optional,
17 Set,
18 Union,
19 cast,
20)
22from mediawiki_langcodes import get_all_names, name_to_code
23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
24from wikitextprocessor.parser import (
25 LEVEL_KIND_FLAGS,
26 GeneralNode,
27 NodeKind,
28 TemplateNode,
29 WikiNode,
30)
32from ...clean import clean_template_args, clean_value
33from ...datautils import (
34 data_append,
35 data_extend,
36 ns_title_prefix_tuple,
37)
38from ...page import (
39 LEVEL_KINDS,
40 clean_node,
41 is_panel_template,
42 recursively_extract,
43)
44from ...tags import valid_tags
45from ...wxr_context import WiktextractContext
46from ...wxr_logging import logger
47from ..ruby import extract_ruby, parse_ruby
48from ..share import strip_nodes
49from .example import extract_example_list_item, extract_template_zh_x
50from .form_descriptions import (
51 classify_desc,
52 decode_tags,
53 distw,
54 parse_alt_or_inflection_of,
55 parse_sense_qualifier,
56 parse_word_head,
57)
58from .inflection import TableContext, parse_inflection_section
59from .info_templates import (
60 INFO_TEMPLATE_FUNCS,
61 parse_info_template_arguments,
62 parse_info_template_node,
63)
64from .linkages import extract_alt_form_section, parse_linkage_item_text
65from .parts_of_speech import PARTS_OF_SPEECH
66from .section_titles import (
67 COMPOUNDS_TITLE,
68 DESCENDANTS_TITLE,
69 ETYMOLOGY_TITLES,
70 IGNORED_TITLES,
71 INFLECTION_TITLES,
72 LINKAGE_TITLES,
73 POS_TITLES,
74 PRONUNCIATION_TITLE,
75 PROTO_ROOT_DERIVED_TITLES,
76 TRANSLATIONS_TITLE,
77)
78from .translations import parse_translation_item_text
79from .type_utils import (
80 DescendantData,
81 ExampleData,
82 FormData,
83 LinkageData,
84 SenseData,
85 SoundData,
86 TemplateData,
87 WordData,
88)
89from .unsupported_titles import unsupported_title_map
91# When determining whether a string is 'english', classify_desc
92# might return 'taxonomic' which is English text 99% of the time.
93ENGLISH_TEXTS = ("english", "taxonomic")
95# Matches head tag
96HEAD_TAG_RE = re.compile(
97 r"^(head|Han char|arabic-noun|arabic-noun-form|"
98 r"hangul-symbol|syllable-hangul)$|"
99 + r"^(latin|"
100 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
101 + r")-("
102 + "|".join(
103 [
104 "abbr",
105 "adj",
106 "adjective",
107 "adjective form",
108 "adjective-form",
109 "adv",
110 "adverb",
111 "affix",
112 "animal command",
113 "art",
114 "article",
115 "aux",
116 "bound pronoun",
117 "bound-pronoun",
118 "Buyla",
119 "card num",
120 "card-num",
121 "cardinal",
122 "chunom",
123 "classifier",
124 "clitic",
125 "cls",
126 "cmene",
127 "cmavo",
128 "colloq-verb",
129 "colverbform",
130 "combining form",
131 "combining-form",
132 "comparative",
133 "con",
134 "concord",
135 "conj",
136 "conjunction",
137 "conjug",
138 "cont",
139 "contr",
140 "converb",
141 "daybox",
142 "decl",
143 "decl noun",
144 "def",
145 "dem",
146 "det",
147 "determ",
148 "Deva",
149 "ending",
150 "entry",
151 "form",
152 "fuhivla",
153 "gerund",
154 "gismu",
155 "hanja",
156 "hantu",
157 "hanzi",
158 "head",
159 "ideophone",
160 "idiom",
161 "inf",
162 "indef",
163 "infixed pronoun",
164 "infixed-pronoun",
165 "infl",
166 "inflection",
167 "initialism",
168 "int",
169 "interfix",
170 "interj",
171 "interjection",
172 "jyut",
173 "latin",
174 "letter",
175 "locative",
176 "lujvo",
177 "monthbox",
178 "mutverb",
179 "name",
180 "nisba",
181 "nom",
182 "noun",
183 "noun form",
184 "noun-form",
185 "noun plural",
186 "noun-plural",
187 "nounprefix",
188 "num",
189 "number",
190 "numeral",
191 "ord",
192 "ordinal",
193 "par",
194 "part",
195 "part form",
196 "part-form",
197 "participle",
198 "particle",
199 "past",
200 "past neg",
201 "past-neg",
202 "past participle",
203 "past-participle",
204 "perfect participle",
205 "perfect-participle",
206 "personal pronoun",
207 "personal-pronoun",
208 "pref",
209 "prefix",
210 "phrase",
211 "pinyin",
212 "plural noun",
213 "plural-noun",
214 "pos",
215 "poss-noun",
216 "post",
217 "postp",
218 "postposition",
219 "PP",
220 "pp",
221 "ppron",
222 "pred",
223 "predicative",
224 "prep",
225 "prep phrase",
226 "prep-phrase",
227 "preposition",
228 "present participle",
229 "present-participle",
230 "pron",
231 "prondem",
232 "pronindef",
233 "pronoun",
234 "prop",
235 "proper noun",
236 "proper-noun",
237 "proper noun form",
238 "proper-noun form",
239 "proper noun-form",
240 "proper-noun-form",
241 "prov",
242 "proverb",
243 "prpn",
244 "prpr",
245 "punctuation mark",
246 "punctuation-mark",
247 "regnoun",
248 "rel",
249 "rom",
250 "romanji",
251 "root",
252 "sign",
253 "suff",
254 "suffix",
255 "syllable",
256 "symbol",
257 "verb",
258 "verb form",
259 "verb-form",
260 "verbal noun",
261 "verbal-noun",
262 "verbnec",
263 "vform",
264 ]
265 )
266 + r")(-|/|\+|$)"
267)
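# For example, this matches head-template names like "head", "Han char",
# "en-noun", "fi-verb-form" or "latin-adj", but not names such as "en-IPA"
# (its suffix is not in the part-of-speech list above) or "wikipedia".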
269# Head-templates causing problems (like newlines) that can be squashed into
270# an empty string in the template handler while saving their template
271# data for later.
272WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
274FLOATING_TABLE_TEMPLATES: set[str] = {
275 # az-suffix-form creates a style=floatright div that is otherwise
276 # deleted; if it is not pre-expanded, we can intercept the template
277 # so we add this set into do_not_pre_expand, and intercept the
278 # templates in parse_part_of_speech
279 "az-suffix-forms",
280 "az-inf-p",
281 "kk-suffix-forms",
282 "ky-suffix-forms",
283 "tr-inf-p",
284 "tr-suffix-forms",
285 "tt-suffix-forms",
286 "uz-suffix-forms",
287}
288# These two should contain template names that should always be
289# pre-expanded when *first* processing the tree, or not pre-expanded
290# so that the template are left in place with their identifying
291# name intact for later filtering.
293DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
294DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
296# Additional templates to be expanded in the pre-expand phase
297ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
298 "multitrans",
299 "multitrans-nowiki",
300 "trans-top",
301 "trans-top-also",
302 "trans-bottom",
303 "checktrans-top",
304 "checktrans-bottom",
305 "col1",
306 "col2",
307 "col3",
308 "col4",
309 "col5",
310 "col1-u",
311 "col2-u",
312 "col3-u",
313 "col4-u",
314 "col5-u",
315 "check deprecated lang param usage",
316 "deprecated code",
317 "ru-verb-alt-ё",
318 "ru-noun-alt-ё",
319 "ru-adj-alt-ё",
320 "ru-proper noun-alt-ё",
321 "ru-pos-alt-ё",
322 "ru-alt-ё",
323 "inflection of",
324 "no deprecated lang param usage",
325 "transclude", # these produce sense entries (or other lists)
326 "tcl",
327}
329# Inverse linkage for those that have them
330linkage_inverses: dict[str, str] = {
331 # XXX this is not currently used, move to post-processing
332 "synonyms": "synonyms",
333 "hypernyms": "hyponyms",
334 "hyponyms": "hypernyms",
335 "holonyms": "meronyms",
336 "meronyms": "holonyms",
337 "derived": "derived_from",
338 "coordinate_terms": "coordinate_terms",
339 "troponyms": "hypernyms",
340 "antonyms": "antonyms",
341 "instances": "instance_of",
342 "related": "related",
343}
345# Templates that are used to form panels on pages and that
346# should be ignored in various positions
347PANEL_TEMPLATES: set[str] = {
348 "Character info",
349 "CJKV",
350 "French personal pronouns",
351 "French possessive adjectives",
352 "French possessive pronouns",
353 "Han etym",
354 "Japanese demonstratives",
355 "Latn-script",
356 "LDL",
357 "MW1913Abbr",
358 "Number-encoding",
359 "Nuttall",
360 "Spanish possessive adjectives",
361 "Spanish possessive pronouns",
362 "USRegionDisputed",
363 "Webster 1913",
364 "ase-rfr",
365 "attention",
366 "attn",
367 "beer",
368 "broken ref",
369 "ca-compass",
370 "character info",
371 "character info/var",
372 "checksense",
373 "compass-fi",
374 "copyvio suspected",
375 "delete",
376 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
377 "etystub",
378 "examples",
379 "hu-corr",
380 "hu-suff-pron",
381 "interwiktionary",
382 "ja-kanjitab",
383 "ko-hanja-search",
384 "look",
385 "maintenance box",
386 "maintenance line",
387 "mediagenic terms",
388 "merge",
389 "missing template",
390 "morse links",
391 "move",
392 "multiple images",
393 "no inline",
394 "picdic",
395 "picdicimg",
396 "picdiclabel",
397 "polyominoes",
398 "predidential nomics",
399 "punctuation", # This actually gets pre-expanded
400 "reconstructed",
401 "request box",
402 "rf-sound example",
403 "rfaccents",
404 "rfap",
405 "rfaspect",
406 "rfc",
407 "rfc-auto",
408 "rfc-header",
409 "rfc-level",
410 "rfc-pron-n",
411 "rfc-sense",
412 "rfclarify",
413 "rfd",
414 "rfd-redundant",
415 "rfd-sense",
416 "rfdate",
417 "rfdatek",
418 "rfdef",
419 "rfe",
420 "rfe/dowork",
421 "rfex",
422 "rfexp",
423 "rfform",
424 "rfgender",
425 "rfi",
426 "rfinfl",
427 "rfm",
428 "rfm-sense",
429 "rfp",
430 "rfp-old",
431 "rfquote",
432 "rfquote-sense",
433 "rfquotek",
434 "rfref",
435 "rfscript",
436 "rft2",
437 "rftaxon",
438 "rftone",
439 "rftranslit",
440 "rfv",
441 "rfv-etym",
442 "rfv-pron",
443 "rfv-quote",
444 "rfv-sense",
445 "selfref",
446 "split",
447 "stroke order", # XXX consider capturing this?
448 "stub entry",
449 "t-needed",
450 "tbot entry",
451 "tea room",
452 "tea room sense",
453 # "ttbc", - XXX needed in at least on/Preposition/Translation page
454 "unblock",
455 "unsupportedpage",
456 "video frames",
457 "was wotd",
458 "wrongtitle",
459 "zh-forms",
460 "zh-hanzi-box",
461 "no entry",
462}
464# lookup table for the tags of Chinese dialectal synonyms
465zh_tag_lookup: dict[str, list[str]] = {
466 "Formal": ["formal"],
467 "Written-Standard-Chinese": ["Standard-Chinese"],
468 "historical or Internet slang": ["historical", "internet-slang"],
469 "now usually derogatory or offensive": ["offensive", "derogatory"],
470 "lofty": [],
471}
473# Template name prefixes used for language-specific panel templates (i.e.,
474# templates that create side boxes or notice boxes or that should generally
475# be ignored).
476PANEL_PREFIXES: set[str] = {
477 "list:compass points/",
478 "list:Gregorian calendar months/",
479 "RQ:",
480}
482# Templates used for wikipedia links.
483wikipedia_templates: set[str] = {
484 "wikipedia",
485 "slim-wikipedia",
486 "w",
487 "W",
488 "swp",
489 "wiki",
490 "Wikipedia",
491 "wtorw",
492}
493for x in PANEL_PREFIXES & wikipedia_templates:
494 print(
495 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
496 x
497 )
498 )
500# Mapping from a template name (without language prefix) for the main word
501# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
502# it could validly occur. This is used as just a sanity check to give
503# warnings about probably incorrect coding in Wiktionary.
504template_allowed_pos_map: dict[str, list[str]] = {
505 "abbr": ["abbrev"],
506 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
507 "plural noun": ["noun", "name"],
508 "plural-noun": ["noun", "name"],
509 "proper noun": ["noun", "name"],
510 "proper-noun": ["name", "noun"],
511 "prop": ["name", "noun"],
512 "verb": ["verb", "phrase"],
513 "gerund": ["verb"],
514 "particle": ["adv", "particle"],
515 "adj": ["adj", "adj_noun"],
516 "pron": ["pron", "noun"],
517 "name": ["name", "noun"],
518 "adv": ["adv", "intj", "conj", "particle"],
519 "phrase": ["phrase", "prep_phrase"],
520 "noun phrase": ["phrase"],
521 "ordinal": ["num"],
522 "number": ["num"],
523 "pos": ["affix", "name", "num"],
524 "suffix": ["suffix", "affix"],
525 "character": ["character"],
526 "letter": ["character"],
527 "kanji": ["character"],
528 "cont": ["abbrev"],
529 "interj": ["intj"],
530 "con": ["conj"],
531 "part": ["particle"],
532 "prep": ["prep", "postp"],
533 "postp": ["postp"],
534 "misspelling": ["noun", "adj", "verb", "adv"],
535 "part-form": ["verb"],
536}
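# For instance, a "...-noun" head template (e.g. "fi-noun") is only expected
# under noun-like POS sections ("noun", "abbrev", "pron", "name", "num",
# "adj_noun"); the loop below merely sanity-checks this table against
# PARTS_OF_SPEECH.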
537for k, v in template_allowed_pos_map.items():
538 for x in v:
539 if x not in PARTS_OF_SPEECH:
540 print(
541 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
542 "".format(x, k, v)
543 )
544 assert False
547# Templates ignored during etymology extraction, i.e., these will not be listed
548# in the extracted etymology templates.
549ignored_etymology_templates: list[str] = [
550 "...",
551 "IPAchar",
552 "ipachar",
553 "ISBN",
554 "isValidPageName",
555 "redlink category",
556 "deprecated code",
557 "check deprecated lang param usage",
558 "para",
559 "p",
560 "cite",
561 "Cite news",
562 "Cite newsgroup",
563 "cite paper",
564 "cite MLLM 1976",
565 "cite journal",
566 "cite news/documentation",
567 "cite paper/documentation",
568 "cite video game",
569 "cite video game/documentation",
570 "cite newsgroup",
571 "cite newsgroup/documentation",
572 "cite web/documentation",
573 "cite news",
574 "Cite book",
575 "Cite-book",
576 "cite book",
577 "cite web",
578 "cite-usenet",
579 "cite-video/documentation",
580 "Cite-journal",
581 "rfe",
582 "catlangname",
583 "cln",
584 "langname-lite",
585 "no deprecated lang param usage",
586 "mention",
587 "m",
588 "m-self",
589 "link",
590 "l",
591 "ll",
592 "l-self",
593]
594# Regexp for matching ignored etymology template names. This adds certain
595# prefixes to the names listed above.
596ignored_etymology_templates_re = re.compile(
597 r"^((cite-|R:|RQ:).*|"
598 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
599 + r")$"
600)
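# This matches any template name starting with "cite-", "R:" or "RQ:"
# (reference and quotation templates), plus the exact names listed above,
# e.g. "m", "cln" or "cite book".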
602# Regexp for matching ignored descendants template names. Right now we just
603# copy the ignored etymology templates
604ignored_descendants_templates_re = ignored_etymology_templates_re
606# Set of template names that are used to define usage examples. If the usage
607# example contains one of these templates, then its type is set to
608# "example"
609usex_templates: set[str] = {
610 "afex",
611 "affixusex",
612 "co", # {{collocation}} acts like a example template, specifically for
613 # pairs of combinations of words that are more common than you'd
614 # expect by chance; hlavní#Czech
615 "coi",
616 "collocation",
617 "el-example",
618 "el-x",
619 "example",
620 "examples",
621 "he-usex",
622 "he-x",
623 "hi-usex",
624 "hi-x",
625 "ja-usex-inline",
626 "ja-usex",
627 "ja-x",
628 "jbo-example",
629 "jbo-x",
630 "km-usex",
631 "km-x",
632 "ko-usex",
633 "ko-x",
634 "lo-usex",
635 "lo-x",
636 "ne-x",
637 "ne-usex",
638 "prefixusex",
639 "ryu-usex",
640 "ryu-x",
641 "shn-usex",
642 "shn-x",
643 "suffixusex",
644 "th-usex",
645 "th-x",
646 "ur-usex",
647 "ur-x",
648 "usex",
649 "usex-suffix",
650 "ux",
651 "uxi",
652}
654stop_head_at_these_templates: set[str] = {
655 "category",
656 "cat",
657 "topics",
658 "catlangname",
659 "c",
660 "C",
661 "top",
662 "cln",
663}
665# Set of template names that are used to define quotation examples. If the
666# usage example contains one of these templates, then its type is set to
667# "quotation".
668quotation_templates: set[str] = {
669 "collapse-quote",
670 "quote-av",
671 "quote-book",
672 "quote-GYLD",
673 "quote-hansard",
674 "quotei",
675 "quote-journal",
676 "quotelite",
677 "quote-mailing list",
678 "quote-meta",
679 "quote-newsgroup",
680 "quote-song",
681 "quote-text",
682 "quote",
683 "quote-us-patent",
684 "quote-video game",
685 "quote-web",
686 "quote-wikipedia",
687 "wikiquote",
688 "Wikiquote",
689}
691taxonomy_templates = {
692 # argument 1 should be the taxonomic name, e.g. "Lupus lupus"
693 "taxfmt",
694 "taxlink",
695 "taxlink2",
696 "taxlinknew",
697 "taxlook",
698}
700# Template name component to linkage section listing. Integer section means
701# default section, starting at that argument.
702# XXX not used anymore, except for the first elements: moved to
703# template_linkages
704# template_linkage_mappings: list[list[Union[str, int]]] = [
705# ["syn", "synonyms"],
706# ["synonyms", "synonyms"],
707# ["ant", "antonyms"],
708# ["antonyms", "antonyms"],
709# ["hyp", "hyponyms"],
710# ["hyponyms", "hyponyms"],
711# ["der", "derived"],
712# ["derived terms", "derived"],
713# ["coordinate terms", "coordinate_terms"],
714# ["rel", "related"],
715# ["col", 2],
716# ]
718# Template names; this was extracted from template_linkage_mappings,
719# because the code using template_linkage_mappings was actually not used
720# (but not removed).
721template_linkages_to_ignore_in_examples: set[str] = {
722 "syn",
723 "synonyms",
724 "ant",
725 "antonyms",
726 "hyp",
727 "hyponyms",
728 "der",
729 "derived terms",
730 "coordinate terms",
731 "cot",
732 "rel",
733 "col",
734 "inline alt forms",
735 "alti",
736 "comeronyms",
737 "holonyms",
738 "holo",
739 "hypernyms",
740 "hyper",
741 "meronyms,"
742 "mero",
743 "troponyms",
744 "perfectives",
745 "pf",
746 "imperfectives",
747 "impf",
748 "syndiff",
749 "synsee",
750}
752# Maps template name used in a word sense to a linkage field that it adds.
753sense_linkage_templates: dict[str, str] = {
754 "syn": "synonyms",
755 "synonyms": "synonyms",
756 "synsee": "synonyms",
757 "syndiff": "synonyms",
758 "hyp": "hyponyms",
759 "hyponyms": "hyponyms",
760 "ant": "antonyms",
761 "antonyms": "antonyms",
762 "alti": "related",
763 "inline alt forms": "related",
764 "coordinate terms": "coordinate_terms",
765 "cot": "coordinate_terms",
766 "comeronyms": "related",
767 "holonyms": "holonyms",
768 "holo": "holonyms",
769 "hypernyms": "hypernyms",
770 "hyper": "hypernyms",
771 "meronyms": "meronyms",
772 "mero": "meronyms",
773 "troponyms": "troponyms",
774 "perfectives": "related",
775 "pf": "related",
776 "imperfectives": "related",
777 "impf": "related",
778}
780sense_linkage_templates_tags: dict[str, list[str]] = {
781 "alti": ["alternative"],
782 "inline alt forms": ["alternative"],
783 "comeronyms": ["comeronym"],
784 "perfectives": ["perfective"],
785 "pf": ["perfective"],
786 "imperfectives": ["imperfective"],
787 "impf": ["imperfective"],
788}
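# parse_sense_linkage() below expects these templates to roughly follow the
# layout {{syn|<lang>|term1|term2|...|q1=qualifier|t1=translation}}:
# positional argument 1 is the language code, arguments 2+ are the linked
# terms, and q<n>/t<n> attach a qualifier or English gloss to the n'th term.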
791def decode_html_entities(v: Union[str, int]) -> str:
792 """Decodes HTML entities from a value, converting them to the respective
793 Unicode characters/strings."""
794 if isinstance(v, int):
795 # I changed this to return str(v) instead of v = str(v),
796 # but there might have been the intention to have more logic
797 # here. html.unescape would not do anything special with an integer,
798 # it needs html escape symbols (&xx;).
799 return str(v)
800 return html.unescape(v)
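# For example, decode_html_entities("p&amp;p") returns "p&p", and
# decode_html_entities(42) returns "42".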
803def parse_sense_linkage(
804 wxr: WiktextractContext,
805 data: SenseData,
806 name: str,
807 ht: TemplateArgs,
808 pos: str,
809) -> None:
810 """Parses a linkage (synonym, etc) specified in a word sense."""
811 assert isinstance(wxr, WiktextractContext)
812 assert isinstance(data, dict)
813 assert isinstance(name, str)
814 assert isinstance(ht, dict)
815 field = sense_linkage_templates[name]
816 field_tags = sense_linkage_templates_tags.get(name, [])
817 for i in range(2, 20):
818 w = ht.get(i) or ""
819 w = clean_node(wxr, data, w)
820 is_thesaurus = False
821 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
822 if w.startswith(alias):
823 is_thesaurus = True
824 w = w[len(alias) :]
825 if w != wxr.wtp.title:
826 from ...thesaurus import search_thesaurus
828 lang_code = clean_node(wxr, None, ht.get(1, ""))
829 for t_data in search_thesaurus(
830 wxr.thesaurus_db_conn, w, lang_code, pos, field
831 ):
832 l_data = {
833 "word": t_data.term,
834 "source": "Thesaurus:" + w,
835 }
836 if len(t_data.tags) > 0:
837 l_data["tags"] = t_data.tags
838 if len(t_data.raw_tags) > 0:
839 l_data["raw_tags"] = t_data.raw_tags
840 data_append(data, field, l_data)
841 break
842 if not w:
843 break
844 if is_thesaurus:
845 continue
846 tags: list[str] = []
847 topics: list[str] = []
848 english: Optional[str] = None
849 # Try to find qualifiers for this synonym
850 q = ht.get("q{}".format(i - 1))
851 if q:
852 cls = classify_desc(q)
853 if cls == "tags":
854 tagsets1, topics1 = decode_tags(q)
855 for ts in tagsets1:
856 tags.extend(ts)
857 topics.extend(topics1)
858 elif cls == "english":
859 if english:
860 english += "; " + q
861 else:
862 english = q
863 # Try to find English translation for this synonym
864 t = ht.get("t{}".format(i - 1))
865 if t:
866 if english:
867 english += "; " + t
868 else:
869 english = t
871 # See if the linkage contains a parenthesized alt
872 alt = None
873 m = re.search(r"\(([^)]+)\)$", w)
874 if m:
875 w = w[: m.start()].strip()
876 alt = m.group(1)
878 dt = {"word": w}
879 if field_tags:
880 data_extend(dt, "tags", field_tags)
881 if tags:
882 data_extend(dt, "tags", tags)
883 if topics:
884 data_extend(dt, "topics", topics)
885 if english:
886 dt["english"] = english
887 if alt:
888 dt["alt"] = alt
889 data_append(data, field, dt)
892EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
893example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
894captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
897def synch_splits_with_args(
898 line: str, targs: TemplateArgs
899) -> Optional[list[str]]:
900 """If it looks like there's something weird with how a line of example
901 text has been split, this function will do the splitting after counting
902 occurrences of the splitting regex inside the two main template arguments
903 containing the string data for the original language example and the
904 English translations.
905 """
906 # Previously, we split without capturing groups, but here we want to
907 # keep the original splitting hyphen regex intact.
908 fparts = captured_splitters_re.split(line)
909 new_parts = []
910 # ["First", " – ", "second", " – ", "third..."] from OL argument
911 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
912 new_parts.append("".join(fparts[:first]))
913 # Translation argument
914 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
915 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
916 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
917 new_parts.append("".join(fparts[first + 1 : second]))
919 if all(new_parts): # no empty strings from the above spaghetti
920 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
921 return new_parts
922 else:
923 return None
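# Illustrative example: with line = "ei saa ― peittää ― Do not cover" and
# targs = {2: "ei saa ― peittää", 3: "Do not cover"}, the original-language
# argument contains one splitter, so this returns
# ["ei saa ― peittää", "Do not cover"] instead of three parts.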
926QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
927QUALIFIERS_RE = re.compile(QUALIFIERS)
928# (...): ... or (...(...)...): ...
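# For example, QUALIFIERS_RE matches the leading "(colloquial, dated): " in
# "(colloquial, dated): to walk", capturing "colloquial, dated" as group 1.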
931def parse_language(
932 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
933) -> list[WordData]:
934 """Iterates over the text of the page, returning words (parts-of-speech)
935 defined on the page one at a time. (Individual word senses for the
936 same part-of-speech are typically encoded in the same entry.)"""
937 # imported here to avoid circular import
938 from .pronunciation import parse_pronunciation
940 assert isinstance(wxr, WiktextractContext)
941 assert isinstance(langnode, WikiNode)
942 assert isinstance(language, str)
943 assert isinstance(lang_code, str)
944 # print("parse_language", language)
946 is_reconstruction = False
947 word: str = wxr.wtp.title # type: ignore[assignment]
948 unsupported_prefix = "Unsupported titles/"
949 if word.startswith(unsupported_prefix):
950 w = word[len(unsupported_prefix) :]
951 if w in unsupported_title_map:
952 word = unsupported_title_map[w]
953 else:
954 wxr.wtp.error(
955 "Unimplemented unsupported title: {}".format(word),
956 sortid="page/870",
957 )
958 word = w
959 elif word.startswith("Reconstruction:"):
960 word = word[word.find("/") + 1 :]
961 is_reconstruction = True
963 base_data: WordData = {
964 "word": word,
965 "lang": language,
966 "lang_code": lang_code,
967 }
968 if is_reconstruction:
969 data_append(base_data, "tags", "reconstruction")
970 sense_data: SenseData = {}
971 pos_data: WordData = {} # For a current part-of-speech
972 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
973 etym_data: WordData = {} # For one etymology
974 pos_datas: list[SenseData] = []
975 level_four_datas: list[WordData] = []
976 etym_datas: list[WordData] = []
977 page_datas: list[WordData] = []
978 have_etym = False
979 inside_level_four = False # This is for checking if the etymology section
980 # or article has a Pronunciation section, for Chinese mostly; because
981 # Chinese articles can have three level three sections (two etymology
982 # sections and pronunciation sections) one after another, we need a kludge
983 # to better keep track of whether we're in a normal "etym" or inside a
984 # "level four" (which is what we've turned the level three Pron sections
985 # into in fix_subtitle_hierarchy()); all other sections are demoted by
986 # a step.
987 stack: list[str] = [] # names of items on the "stack"
989 def merge_base(data: WordData, base: WordData) -> None:
990 for k, v in base.items():
991 # Copy the value to ensure that we don't share lists or
992 # dicts between structures (even nested ones).
993 v = copy.deepcopy(v)
994 if k not in data:
995 # The list was copied above, so this will not create shared ref
996 data[k] = v # type: ignore[literal-required]
997 continue
998 if data[k] == v: # type: ignore[literal-required]
999 continue
1000 if (
1001 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1002 or isinstance(
1003 v,
1004 (list, tuple), # Should this be "and"?
1005 )
1006 ):
1007 data[k] = list(data[k]) + list(v) # type: ignore
1008 elif data[k] != v: # type: ignore[literal-required]
1009 wxr.wtp.warning(
1010 "conflicting values for {} in merge_base: "
1011 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1012 sortid="page/904",
1013 )
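    # For example, merging base {"word": "sana", "tags": ["rare"]} into
    # data {"tags": ["obsolete"]} yields {"tags": ["obsolete", "rare"],
    # "word": "sana"}: missing keys are deep-copied over and list values
    # from both sides are concatenated.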
1015 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1016 """Remove unnecessary keys from dict values
1017 in a list comprehension..."""
1018 if key in pron:
1019 pron.pop(key) # type: ignore
1020 return pron
1022 # If the result has sounds, eliminate sounds that have a prefix that
1023 # does not match "word" or one of "forms"
1024 if "sounds" in data and "word" in data:
1025 accepted = [data["word"]]
1026 accepted.extend(f["form"] for f in data.get("forms", dict()))
1027 data["sounds"] = list(
1028 s
1029 for s in data["sounds"]
1030 if "form" not in s or s["form"] in accepted
1031 )
1032 # If the result has sounds, eliminate sounds that have a pos that
1033 # does not match "pos"
1034 if "sounds" in data and "pos" in data:
1035 data["sounds"] = list(
1036 complementary_pop(s, "pos")
1037 for s in data["sounds"]
1038 # "pos" is not a field of SoundData, correctly, so we're
1039 # removing it here. It's a kludge on a kludge on a kludge.
1040 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1041 )
1043 def push_sense() -> bool:
1044 """Starts collecting data for a new word sense. This returns True
1045 if a sense was added."""
1046 nonlocal sense_data
1047 tags = sense_data.get("tags", ())
1048 if (
1049 not sense_data.get("glosses")
1050 and "translation-hub" not in tags
1051 and "no-gloss" not in tags
1052 ):
1053 return False
1055 if (
1056 (
1057 "participle" in sense_data.get("tags", ())
1058 or "infinitive" in sense_data.get("tags", ())
1059 )
1060 and "alt_of" not in sense_data
1061 and "form_of" not in sense_data
1062 and "etymology_text" in etym_data
1063 and etym_data["etymology_text"] != ""
1064 ):
1065 etym = etym_data["etymology_text"]
1066 etym = etym.split(". ")[0]
1067 ret = parse_alt_or_inflection_of(wxr, etym, set())
1068 if ret is not None:
1069 tags, lst = ret
1070 assert isinstance(lst, (list, tuple))
1071 if "form-of" in tags:
1072 data_extend(sense_data, "form_of", lst)
1073 data_extend(sense_data, "tags", tags)
1074 elif "alt-of" in tags:
1075 data_extend(sense_data, "alt_of", lst)
1076 data_extend(sense_data, "tags", tags)
1078 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
1079 "tags", ()
1080 ):
1081 data_append(sense_data, "tags", "no-gloss")
1083 pos_datas.append(sense_data)
1084 sense_data = {}
1085 return True
1087 def push_pos() -> None:
1088 """Starts collecting data for a new part-of-speech."""
1089 nonlocal pos_data
1090 nonlocal pos_datas
1091 push_sense()
1092 if wxr.wtp.subsection:
1093 data: WordData = {"senses": pos_datas}
1094 merge_base(data, pos_data)
1095 level_four_datas.append(data)
1096 pos_data = {}
1097 pos_datas = []
1098 wxr.wtp.start_subsection(None)
1100 def push_level_four_section() -> None:
1101 """Starts collecting data for a new level four sections, which
1102 is usually virtual and empty, unless the article has Chinese
1103 'Pronunciation' sections that are etymology-section-like but
1104 under etymology, and at the same level in the source. We modify
1105 the source to demote Pronunciation sections like that to level
1106 4, and other sections one step lower."""
1107 nonlocal level_four_data
1108 nonlocal level_four_datas
1109 nonlocal etym_datas
1110 push_pos()
1111 # print(f"======\n{etym_data=}")
1112 # print(f"======\n{etym_datas=}")
1113 # print(f"======\n{level_four_data=}")
1114 # print(f"======\n{level_four_datas=}")
1115 for data in level_four_datas:
1116 merge_base(data, level_four_data)
1117 etym_datas.append(data)
1118 for data in etym_datas:
1119 merge_base(data, etym_data)
1120 page_datas.append(data)
1121 level_four_data = {}
1122 level_four_datas = []
1123 etym_datas = []
1125 def push_etym() -> None:
1126 """Starts collecting data for a new etymology."""
1127 nonlocal etym_data
1128 nonlocal etym_datas
1129 nonlocal have_etym
1130 nonlocal inside_level_four
1131 have_etym = True
1132 push_level_four_section()
1133 inside_level_four = False
1134 etym_data = {}
1136 def select_data() -> WordData:
1137 """Selects where to store data (pos or etym) based on whether we
1138 are inside a pos (part-of-speech)."""
1139 # print(f"{wxr.wtp.subsection=}")
1140 # print(f"{stack=}")
1141 if wxr.wtp.subsection is not None:
1142 return pos_data
1143 if stack[-1] == language:
1144 return base_data
1145 if inside_level_four is False:
1146 return etym_data
1147 return level_four_data
1149 term_label_templates: list[TemplateData] = []
1151 def head_post_template_fn(
1152 name: str, ht: TemplateArgs, expansion: str
1153 ) -> Optional[str]:
1154 """Handles special templates in the head section of a word. Head
1155 section is the text after part-of-speech subtitle and before word
1156 sense list. Typically it generates the bold line for the word, but
1157 may also contain other useful information that often ends up in
1158 side boxes. We want to capture some of that additional information."""
1159 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1160 if is_panel_template(wxr, name):
1161 # Completely ignore these templates (not even recorded in
1162 # head_templates)
1163 return ""
1164 if name == "head":
1165 # XXX are these also captured in forms? Should this special case
1166 # be removed?
1167 t = ht.get(2, "")
1168 if t == "pinyin":
1169 data_append(pos_data, "tags", "Pinyin")
1170 elif t == "romanization":
1171 data_append(pos_data, "tags", "romanization")
1172 if (
1173 HEAD_TAG_RE.search(name) is not None
1174 or name in WORD_LEVEL_HEAD_TEMPLATES
1175 ):
1176 args_ht = clean_template_args(wxr, ht)
1177 cleaned_expansion = clean_node(wxr, None, expansion)
1178 dt: TemplateData = {
1179 "name": name,
1180 "args": args_ht,
1181 "expansion": cleaned_expansion,
1182 }
1183 data_append(pos_data, "head_templates", dt)
1184 if name in WORD_LEVEL_HEAD_TEMPLATES:
1185 term_label_templates.append(dt)
1186 # Squash these, their tags are applied to the whole word,
1187 # and some cause problems like "term-label"
1188 return ""
1190 # The following are both captured in head_templates and parsed
1191 # separately
1193 if name in wikipedia_templates:
1194 # Note: various places expect to have content from wikipedia
1195 # templates, so cannot convert this to empty
1196 parse_wikipedia_template(wxr, pos_data, ht)
1197 return None
1199 if name == "number box":
1200 # XXX extract numeric value?
1201 return ""
1202 if name == "enum":
1203 # XXX extract?
1204 return ""
1205 if name == "cardinalbox":
1206 # XXX extract similar to enum?
1207 # XXX this can also occur in top-level under language
1208 return ""
1209 if name == "Han simplified forms":
1210 # XXX extract?
1211 return ""
1212 # if name == "ja-kanji forms":
1213 # # XXX extract?
1214 # return ""
1215 # if name == "vi-readings":
1216 # # XXX extract?
1217 # return ""
1218 # if name == "ja-kanji":
1219 # # XXX extract?
1220 # return ""
1221 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
1222 # XXX extract?
1223 return ""
1225 return None
1227 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1228 """Parses the subsection for a part-of-speech under a language on
1229 a page."""
1230 assert isinstance(posnode, WikiNode)
1231 assert isinstance(pos, str)
1232 # print("parse_part_of_speech", pos)
1233 pos_data["pos"] = pos
1234 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1235 lists: list[list[WikiNode]] = [[]] # list of lists
1236 first_para = True
1237 first_head_tmplt = True
1238 collecting_head = True
1239 start_of_paragraph = True
1241 # XXX extract templates from posnode with recursively_extract
1242 # that break stuff, like ja-kanji or az-suffix-form.
1243 # Do the extraction with a list of template names, combined from
1244 # different lists, then separate out them into different lists
1245 # that are handled at different points of the POS section.
1246 # First, extract az-suffix-form, put it in `inflection`,
1247 # and parse `inflection`'s content when appropriate later.
1248 # The contents of az-suffix-form (and ja-kanji) that generate
1249 # divs with "floatright" in their style gets deleted by
1250 # clean_value, so templates that slip through from here won't
1251 # break anything.
1252 # XXX bookmark
1253 # print("===================")
1254 # print(posnode.children)
1256 floaters, poschildren = recursively_extract(
1257 posnode.children,
1258 lambda x: (
1259 isinstance(x, WikiNode)
1260 and (
1261 (
1262 x.kind == NodeKind.TEMPLATE
1263 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
1264 )
1265 or (
1266 x.kind == NodeKind.LINK
1267 # Need to check for stringiness because some links are
1268 # broken; for example, if a template is missing an
1269 # argument, a link might look like `[[{{{1}}}...]]`
1270 and isinstance(x.largs[0][0], str)
1271 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1272 )
1273 )
1274 ),
1275 )
1276 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1277 tempnode.largs = [["Inflection"]]
1278 tempnode.children = floaters
1279 parse_inflection(tempnode, "Floating Div", pos)
1280 # print(poschildren)
1281 # XXX new above
1283 if not poschildren:
1284 if not floaters:
1285 wxr.wtp.debug(
1286 "PoS section without contents",
1287 sortid="en/page/1051/20230612",
1288 )
1289 else:
1290 wxr.wtp.debug(
1291 "PoS section without contents except for a floating table",
1292 sortid="en/page/1056/20230612",
1293 )
1294 return
1296 for node in poschildren:
1297 if isinstance(node, str):
1298 for m in re.finditer(r"\n+|[^\n]+", node):
1299 p = m.group(0)
1300 if p.startswith("\n\n") and pre:
1301 first_para = False
1302 start_of_paragraph = True
1303 break
1304 if p and collecting_head:
1305 pre[-1].append(p)
1306 continue
1307 assert isinstance(node, WikiNode)
1308 kind = node.kind
1309 if kind == NodeKind.LIST:
1310 lists[-1].append(node)
1311 collecting_head = False
1312 start_of_paragraph = True
1313 continue
1314 elif kind in LEVEL_KINDS:
1315 # Stop parsing section if encountering any kind of
1316 # level header (like ===Noun=== or ====Further Reading====).
1317 # At a quick glance, this should be the default behavior,
1318 # but if some kinds of source articles have sub-sub-sections
1319 # that should be parsed XXX it should be handled by changing
1320 # this break.
1321 break
1322 elif collecting_head and kind == NodeKind.LINK:
1323 # We might collect relevant links as they are often pictures
1324 # relating to the word
1325 if len(node.largs[0]) >= 1 and isinstance(
1326 node.largs[0][0], str
1327 ):
1328 if node.largs[0][0].startswith(
1329 ns_title_prefix_tuple(wxr, "Category")
1330 ):
1331 # [[Category:...]]
1332 # We're at the end of the file, probably, so stop
1333 # here. Otherwise the head will get garbage.
1334 break
1335 if node.largs[0][0].startswith(
1336 ns_title_prefix_tuple(wxr, "File")
1337 ):
1338 # Skips file links
1339 continue
1340 start_of_paragraph = False
1341 pre[-1].extend(node.largs[-1])
1342 elif kind == NodeKind.HTML:
1343 if node.sarg == "br":
1344 if pre[-1]:
1345 pre.append([]) # Switch to next head
1346 lists.append([]) # Lists parallels pre
1347 collecting_head = True
1348 start_of_paragraph = True
1349 elif collecting_head and node.sarg not in (
1350 "gallery",
1351 "ref",
1352 "cite",
1353 "caption",
1354 ):
1355 start_of_paragraph = False
1356 pre[-1].append(node)
1357 else:
1358 start_of_paragraph = False
1359 elif isinstance(node, TemplateNode):
1360 # XXX Insert code here that disambiguates between
1361 # templates that generate word heads and templates
1362 # that don't.
1363 # There's head_tag_re that seems like a regex meant
1364 # to identify head templates. Too bad it's None.
1366 # ignore {{category}}, {{cat}}... etc.
1367 if node.template_name in stop_head_at_these_templates:
1368 # we've reached a template that should be at the end,
1369 continue
1371 # skip these templates; panel_templates is already used
1372 # to skip certain templates else, but it also applies to
1373 # head parsing quite well.
1374 # node.largs[0][0] should always be str, but can't type-check
1375 # that.
1376 if is_panel_template(wxr, node.template_name):
1377 continue
1378 # skip these templates
1379 # if node.largs[0][0] in skip_these_templates_in_head:
1380 # first_head_tmplt = False # no first_head_tmplt at all
1381 # start_of_paragraph = False
1382 # continue
1384 if first_head_tmplt and pre[-1]:
1385 first_head_tmplt = False
1386 start_of_paragraph = False
1387 pre[-1].append(node)
1388 elif pre[-1] and start_of_paragraph:
1389 pre.append([]) # Switch to the next head
1390 lists.append([]) # lists parallel pre
1391 collecting_head = True
1392 start_of_paragraph = False
1393 pre[-1].append(node)
1394 else:
1395 pre[-1].append(node)
1396 elif first_para:
1397 start_of_paragraph = False
1398 if collecting_head:
1399 pre[-1].append(node)
1400 # XXX use template_fn in clean_node to check that the head macro
1401 # is compatible with the current part-of-speech and generate warning
1402 # if not. Use template_allowed_pos_map.
1404 # Clean up empty pairs, and fix messes with extra newlines that
1405 # separate templates that are followed by lists wiktextract issue #314
1407 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1408 cleaned_lists: list[list[WikiNode]] = []
1409 pairless_pre_index = None
1411 for pre1, ls in zip(pre, lists):
1412 if pre1 and not ls:
1413 pairless_pre_index = len(cleaned_pre)
1414 if not pre1 and not ls:
1415 # skip [] + []
1416 continue
1417 if not ls and all(
1418 (isinstance(x, str) and not x.strip()) for x in pre1
1419 ):
1420 # skip ["\n", " "] + []
1421 continue
1422 if ls and not pre1:
1423 if pairless_pre_index is not None:
1424 cleaned_lists[pairless_pre_index] = ls
1425 pairless_pre_index = None
1426 continue
1427 cleaned_pre.append(pre1)
1428 cleaned_lists.append(ls)
1430 pre = cleaned_pre
1431 lists = cleaned_lists
1433 there_are_many_heads = len(pre) > 1
1434 header_tags: list[str] = []
1435 header_topics: list[str] = []
1436 previous_head_had_list = False
1438 if not any(g for g in lists):
1439 process_gloss_without_list(
1440 poschildren, pos, pos_data, header_tags, header_topics
1441 )
1442 else:
1443 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1444 # if len(ls) == 0:
1445 # # don't have gloss list
1446 # # XXX add code here to filter out 'garbage', like text
1447 # # that isn't a head template or head.
1448 # continue
1450 if all(not sl for sl in lists[i:]):
1451 if i == 0:
1452 if isinstance(node, str):
1453 wxr.wtp.debug(
1454 "first head without list of senses,"
1455 "string: '{}[...]', {}/{}".format(
1456 node[:20], word, language
1457 ),
1458 sortid="page/1689/20221215",
1459 )
1460 if isinstance(node, WikiNode):
1461 if node.largs and node.largs[0][0] in [
1462 "Han char",
1463 ]:
1464 # just ignore these templates
1465 pass
1466 else:
1467 wxr.wtp.debug(
1468 "first head without "
1469 "list of senses, "
1470 "template node "
1471 "{}, {}/{}".format(
1472 node.largs, word, language
1473 ),
1474 sortid="page/1694/20221215",
1475 )
1476 else:
1477 wxr.wtp.debug(
1478 "first head without list of senses, "
1479 "{}/{}".format(word, language),
1480 sortid="page/1700/20221215",
1481 )
1482 # no break here so that the first head always
1483 # gets processed.
1484 else:
1485 if isinstance(node, str):
1486 wxr.wtp.debug(
1487 "later head without list of senses,"
1488 "string: '{}[...]', {}/{}".format(
1489 node[:20], word, language
1490 ),
1491 sortid="page/1708/20221215",
1492 )
1493 if isinstance(node, WikiNode):
1494 wxr.wtp.debug(
1495 "later head without list of senses,"
1496 "template node "
1497 "{}, {}/{}".format(
1498 node.sarg if node.sarg else node.largs,
1499 word,
1500 language,
1501 ),
1502 sortid="page/1713/20221215",
1503 )
1504 else:
1505 wxr.wtp.debug(
1506 "later head without list of senses, "
1507 "{}/{}".format(word, language),
1508 sortid="page/1719/20221215",
1509 )
1510 break
1511 head_group = i + 1 if there_are_many_heads else None
1512 # print("parse_part_of_speech: {}: {}: pre={}"
1513 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1515 if previous_head_had_list:
1516 # We use a boolean flag here because we want to be able to
1517 # let the header_tags data pass through after the loop
1518 # is over without accidentally emptying it, if there are
1519 # no pos_datas and we need a dummy data.
1520 header_tags.clear()
1521 header_topics.clear()
1523 process_gloss_header(
1524 pre1, pos, head_group, pos_data, header_tags, header_topics
1525 )
1526 for ln in ls:
1527 # Parse each list associated with this head.
1528 for node in ln.children:
1529 # Parse nodes in l.children recursively.
1530 # The recursion function uses push_sense() to
1531 # add stuff into pos_data, and returns True or
1532 # False if something is added, which bubbles upward.
1533 # If the bubble is "True", then higher levels of
1534 # the recursion will not push_sense(), because
1535 # the data is already pushed into a sub-gloss
1536 # downstream, unless the higher level has examples
1537 # that need to be put somewhere.
1538 common_data: SenseData = {
1539 "tags": list(header_tags),
1540 "topics": list(header_topics),
1541 }
1542 if head_group:
1543 common_data["head_nr"] = head_group
1544 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1546 if len(ls) > 0:
1547 previous_head_had_list = True
1548 else:
1549 previous_head_had_list = False
1551 # If there are no senses extracted, add a dummy sense. We want to
1552 # keep tags extracted from the head for the dummy sense.
1553 push_sense() # Make sure unfinished data pushed, and start clean sense
1554 if len(pos_datas) == 0:
1555 data_extend(sense_data, "tags", header_tags)
1556 data_extend(sense_data, "topics", header_topics)
1557 data_append(sense_data, "tags", "no-gloss")
1558 push_sense()
1560 def process_gloss_header(
1561 header_nodes: list[Union[WikiNode, str]],
1562 pos_type: str,
1563 header_group: Optional[int],
1564 pos_data: WordData,
1565 header_tags: list[str],
1566 header_topics: list[str],
1567 ) -> None:
1568 ruby = []
1569 links: list[str] = []
1571 # process template parse nodes here
1572 new_nodes = []
1573 info_template_data = []
1574 for node in header_nodes:
1575 # print(f"{node=}")
1576 info_data, info_out = parse_info_template_node(wxr, node, "head")
1577 if info_data or info_out:
1578 if info_data:
1579 info_template_data.append(info_data)
1580 if info_out: # including just the original node
1581 new_nodes.append(info_out)
1582 else:
1583 new_nodes.append(node)
1584 header_nodes = new_nodes
1586 if info_template_data:
1587 if "info_templates" not in pos_data: 1587 ↛ 1590line 1587 didn't jump to line 1590 because the condition on line 1587 was always true
1588 pos_data["info_templates"] = info_template_data
1589 else:
1590 pos_data["info_templates"].extend(info_template_data)
1592 if not word.isalnum():
1593 # `-` is kosher, add more of these if needed.
1594 if word.replace("-", "").isalnum():
1595 pass
1596 else:
1597 # if the word contains non-letter or -number characters, it
1598 # might have something that messes with split-at-semi-comma; we
1599 # collect links so that we can skip splitting them.
1600 exp = wxr.wtp.parse(
1601 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1602 )
1603 link_nodes, _ = recursively_extract(
1604 exp.children,
1605 lambda x: isinstance(x, WikiNode)
1606 and x.kind == NodeKind.LINK,
1607 )
1608 for ln in link_nodes:
1609 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1610 if not ltext.isalnum():
1611 links.append(ltext)
1612 if word not in links:
1613 links.append(word)
1614 if lang_code == "ja":
1615 exp = wxr.wtp.parse(
1616 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1617 )
1618 rub, _ = recursively_extract(
1619 exp.children,
1620 lambda x: isinstance(x, WikiNode)
1621 and x.kind == NodeKind.HTML
1622 and x.sarg == "ruby",
1623 )
1624 if rub is not None:
1625 for r in rub:
1626 if TYPE_CHECKING:
1627 # we know the lambda above in recursively_extract
1628 # returns only WikiNodes in rub
1629 assert isinstance(r, WikiNode)
1630 rt = parse_ruby(wxr, r)
1631 if rt is not None:
1632 ruby.append(rt)
1633 header_text = clean_node(
1634 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1635 )
1637 term_label_tags: list[str] = []
1638 term_label_topics: list[str] = []
1639 if len(term_label_templates) > 0:
1640 # parse term label templates; if there are other similar kinds
1641 # of templates in headers that you want to squash and apply as
1642 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1643 for templ_data in term_label_templates:
1644 # print(templ_data)
1645 expan = templ_data.get("expansion", "").strip("().,; ")
1646 if not expan:
1647 continue
1648 tlb_tagsets, tlb_topics = decode_tags(expan)
1649 for tlb_tags in tlb_tagsets:
1650 if len(tlb_tags) > 0 and not any(
1651 t.startswith("error-") for t in tlb_tags
1652 ):
1653 term_label_tags.extend(tlb_tags)
1654 term_label_topics.extend(tlb_topics)
1655 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1657 header_text = re.sub(r"\s+", " ", header_text)
1658 # print(f"{header_text=}")
1659 parse_word_head(
1660 wxr,
1661 pos_type,
1662 header_text,
1663 pos_data,
1664 is_reconstruction,
1665 header_group,
1666 ruby=ruby,
1667 links=links,
1668 )
1669 if "tags" in pos_data:
1670 # pos_data can get "tags" data from some source; type-checkers
1671 # doesn't like it, so let's ignore it.
1672 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1673 del pos_data["tags"] # type: ignore[typeddict-item]
1674 if len(term_label_tags) > 0:
1675 header_tags.extend(term_label_tags)
1676 if len(term_label_topics) > 0:
1677 header_topics.extend(term_label_topics)
1679 def process_gloss_without_list(
1680 nodes: list[Union[WikiNode, str]],
1681 pos_type: str,
1682 pos_data: WordData,
1683 header_tags: list[str],
1684 header_topics: list[str],
1685 ) -> None:
1686 # gloss text might not be inside a list
1687 header_nodes: list[Union[str, WikiNode]] = []
1688 gloss_nodes: list[Union[str, WikiNode]] = []
1689 for node in strip_nodes(nodes):
1690 if isinstance(node, WikiNode):
1691 if isinstance(node, TemplateNode):
1692 if node.template_name in (
1693 "zh-see",
1694 "ja-see",
1695 "ja-see-kango",
1696 ):
1697 continue # soft redirect
1698 elif (
1699 node.template_name == "head"
1700 or node.template_name.startswith(f"{lang_code}-")
1701 ):
1702 header_nodes.append(node)
1703 continue
1704 elif node.kind in LEVEL_KINDS: # following nodes are not gloss
1705 break
1706 gloss_nodes.append(node)
1708 if len(header_nodes) > 0:
1709 process_gloss_header(
1710 header_nodes,
1711 pos_type,
1712 None,
1713 pos_data,
1714 header_tags,
1715 header_topics,
1716 )
1717 if len(gloss_nodes) > 0:
1718 process_gloss_contents(
1719 gloss_nodes,
1720 pos_type,
1721 {"tags": list(header_tags), "topics": list(header_topics)},
1722 )
1724 def parse_sense_node(
1725 node: Union[str, WikiNode], # never receives str
1726 sense_base: SenseData,
1727 pos: str,
1728 ) -> bool:
1729 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1730 Uses push_sense() to attempt adding data to pos_data in the scope
1731 of parse_language() when it reaches deep in the recursion. push_sense()
1732 returns True if it succeeds, and that is bubbled up the stack; if
1733 a sense was added downstream, the higher levels (whose shared data
1734 was already added by a subsense) do not push_sense(), unless it
1735 has examples that need to be put somewhere.
1736 """
1737 assert isinstance(sense_base, dict) # Added to every sense deeper in
1738 if not isinstance(node, WikiNode):
1739 # This doesn't seem to ever happen in practice.
1740 wxr.wtp.debug(
1741 "{}: parse_sense_node called with"
1742 "something that isn't a WikiNode".format(pos),
1743 sortid="page/1287/20230119",
1744 )
1745 return False
1747 if node.kind != NodeKind.LIST_ITEM:
1748 wxr.wtp.debug(
1749 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1750 )
1751 return False
1753 if node.sarg == ":":
1754 # Skip example entries at the highest level, ones without
1755 # a sense ("...#") above them.
1756 # If node.sarg is exactly and only ":", then it's at
1757 # the highest level; lower levels would have more
1758 # "indentation", like "#:" or "##:"
1759 return False
1761 # If a recursion call succeeds in push_sense(), bubble it up with
1762 # `added`.
1763 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1764 added = False
1766 gloss_template_args: set[str] = set()
1768 # For LISTs and LIST_ITEMS, their argument is something like
1769 # "##" or "##:", and using that we can rudimentally determine
1770 # list 'depth' if need be, and also what kind of list or
1771 # entry it is; # is for normal glosses, : for examples (indent)
1772 # and * is used for quotations on wiktionary.
1773 current_depth = node.sarg
1775 children = node.children
1777 # subentries, (presumably) a list
1778 # of subglosses below this. The list's
1779 # argument ends with #, and its depth should
1780 # be bigger than parent node.
1781 subentries = [
1782 x
1783 for x in children
1784 if isinstance(x, WikiNode)
1785 and x.kind == NodeKind.LIST
1786 and x.sarg == current_depth + "#"
1787 ]
1789 # sublists of examples and quotations. .sarg
1790 # does not end with "#".
1791 others = [
1792 x
1793 for x in children
1794 if isinstance(x, WikiNode)
1795 and x.kind == NodeKind.LIST
1796 and x.sarg != current_depth + "#"
1797 ]
1799 # the actual contents of this particular node.
1800 # can be a gloss (or a template that expands into
1801 # many glosses which we can't easily pre-expand)
1802 # or could be an "outer gloss" with more specific
1803 # subglosses, or could be a qualifier for the subglosses.
1804 contents = [
1805 x
1806 for x in children
1807 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1808 ]
1809 # If this entry has sublists of entries, we should combine
1810 # gloss information from both the "outer" and sublist content.
1811 # Sometimes the outer gloss
1812 # is more non-gloss or tags, sometimes it is a coarse sense
1813 # and the inner glosses are more specific. The outer one
1814 # does not seem to have qualifiers.
1816 # If we have one sublist with one element, treat it
1817 # specially as it may be a Wiktionary error; raise
1818 # that nested element to the same level.
1819 # XXX If need be, this block can be easily removed in
1820 # the current recursive logic, and the result is one sense entry
1821 # with both glosses in the glosses list, as you would
1822 # expect. If the higher entry has examples, there will
1823 # be a higher entry with some duplicated data.
1824 if len(subentries) == 1:
1825 slc = subentries[0].children
1826 if len(slc) == 1:
1827 # copy current node and modify it so it doesn't
1828 # loop infinitely.
1829 cropped_node = copy.copy(node)
1830 cropped_node.children = [
1831 x
1832 for x in children
1833 if not (
1834 isinstance(x, WikiNode)
1835 and x.kind == NodeKind.LIST
1836 and x.sarg == current_depth + "#"
1837 )
1838 ]
1839 added |= parse_sense_node(cropped_node, sense_base, pos)
1840 nonlocal sense_data # kludge: without this reset,
1841 # raw_glosses data gets duplicated;
1842 # if the top-level (cropped_node)
1843 # does not push_sense() properly or
1844 # parse_sense_node() returns early,
1845 # sense_data is not reset. This happens
1846 # for example when you have a no-gloss
1847 # string like "(intransitive)":
1848 # no gloss, push_sense() returns early
1849 # and sense_data has duplicate data with
1850 # sense_base
1851 sense_data = {}
1852 added |= parse_sense_node(slc[0], sense_base, pos)
1853 return added
1855 return process_gloss_contents(
1856 contents,
1857 pos,
1858 sense_base,
1859 subentries,
1860 others,
1861 gloss_template_args,
1862 added,
1863 )
1865 def process_gloss_contents(
1866 contents: list[Union[str, WikiNode]],
1867 pos: str,
1868 sense_base: SenseData,
1869 subentries: list[WikiNode] = [],
1870 others: list[WikiNode] = [],
1871 gloss_template_args: Set[str] = set(),
1872 added: bool = False,
1873 ) -> bool:
1874 def sense_template_fn(
1875 name: str, ht: TemplateArgs, is_gloss: bool = False
1876 ) -> Optional[str]:
1877 # print(f"sense_template_fn: {name}, {ht}")
1878 if name in wikipedia_templates:
1879 # parse_wikipedia_template(wxr, pos_data, ht)
1880 return None
1881 if is_panel_template(wxr, name):
1882 return ""
1883 if name in INFO_TEMPLATE_FUNCS:
1884 info_data, info_exp = parse_info_template_arguments(
1885 wxr, name, ht, "sense"
1886 )
1887 if info_data or info_exp: 1887 ↛ 1893line 1887 didn't jump to line 1893 because the condition on line 1887 was always true
1888 if info_data: 1888 ↛ 1890line 1888 didn't jump to line 1890 because the condition on line 1888 was always true
1889 data_append(sense_base, "info_templates", info_data)
1890 if info_exp and isinstance(info_exp, str): 1890 ↛ 1892line 1890 didn't jump to line 1892 because the condition on line 1890 was always true
1891 return info_exp
1892 return ""
1893 if name in ("defdate",):
1894 return ""
1895 if name == "senseid":
1896 langid = clean_node(wxr, None, ht.get(1, ()))
1897 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1898 if re.match(r"Q\d+$", arg):
1899 data_append(sense_base, "wikidata", arg)
1900 data_append(sense_base, "senseid", langid + ":" + arg)
1901 if name in sense_linkage_templates:
1902 # print(f"SENSE_TEMPLATE_FN: {name}")
1903 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1904 return ""
1905 if name == "†" or name == "zh-obsolete":
1906 data_append(sense_base, "tags", "obsolete")
1907 return ""
1908 if name in {
1909 "ux",
1910 "uxi",
1911 "usex",
1912 "afex",
1913 "prefixusex",
1914 "ko-usex",
1915 "ko-x",
1916 "hi-x",
1917 "ja-usex-inline",
1918 "ja-x",
1919 "quotei",
1920 "he-x",
1921 "hi-x",
1922 "km-x",
1923 "ne-x",
1924 "shn-x",
1925 "th-x",
1926 "ur-x",
1927 }:
1928 # Usage examples are captured separately below. We don't
1929 # want to expand them into glosses even when unusual coding
1930 # is used in the entry.
1931 # These templates may slip through inside another item, but
1932 # currently we're separating out example entries (..#:)
1933 # well enough that there seems to be very little contamination.
1934 if is_gloss:
1935 wxr.wtp.warning(
1936 "Example template is used for gloss text",
1937 sortid="extractor.en.page.sense_template_fn/1415",
1938 )
1939 else:
1940 return ""
1941 if name == "w": 1941 ↛ 1942line 1941 didn't jump to line 1942 because the condition on line 1941 was never true
1942 if ht.get(2) == "Wp":
1943 return ""
1944 for k, v in ht.items():
1945 v = v.strip()
1946 if v and "<" not in v:
1947 gloss_template_args.add(v)
1948 return None
1950 def extract_link_texts(item: GeneralNode) -> None:
1951 """Recursively extracts link texts from the gloss source. This
1952 information is used to select whether to remove final "." from
1953 form_of/alt_of (e.g., ihm/Hunsrik)."""
1954 if isinstance(item, (list, tuple)):
1955 for x in item:
1956 extract_link_texts(x)
1957 return
1958 if isinstance(item, str):
1959 # There seem to be HTML sections that may further contain
1960 # unparsed links.
1961 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1961 ↛ 1962line 1961 didn't jump to line 1962 because the loop on line 1961 never started
1962 print("ITER:", m.group(0))
1963 v = m.group(1).split("|")[-1].strip()
1964 if v:
1965 gloss_template_args.add(v)
1966 return
1967 if not isinstance(item, WikiNode): 1967 ↛ 1968line 1967 didn't jump to line 1968 because the condition on line 1967 was never true
1968 return
1969 if item.kind == NodeKind.LINK:
1970 v = item.largs[-1]
1971 if ( 1971 ↛ 1977line 1971 didn't jump to line 1977 because the condition on line 1971 was always true
1972 isinstance(v, list)
1973 and len(v) == 1
1974 and isinstance(v[0], str)
1975 ):
1976 gloss_template_args.add(v[0].strip())
1977 for x in item.children:
1978 extract_link_texts(x)
1980 extract_link_texts(contents)
1982 # get the raw text of non-list contents of this node, and other stuff
1983 # like tag and category data added to sense_base
1984 # cast = no-op type-setter for the type-checker
1985 partial_template_fn = cast(
1986 TemplateFnCallable,
1987 partial(sense_template_fn, is_gloss=True),
1988 )
1989 rawgloss = clean_node(
1990 wxr,
1991 sense_base,
1992 contents,
1993 template_fn=partial_template_fn,
1994 collect_links=True,
1995 )
1997 if not rawgloss: 1997 ↛ 1998line 1997 didn't jump to line 1998 because the condition on line 1997 was never true
1998 return False
2000 # remove manually typed ordered list text at the start ("1. ")
2001 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2003 # get stuff like synonyms and categories from "others",
2004 # maybe examples and quotations
2005 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2007 # The gloss could contain templates that produce more list items.
2008 # This happens commonly with, e.g., {{inflection of|...}}. Split
2009 # it into parts. However, e.g. Interlingua generates multiple glosses
2010 # in HTML directly without Wikitext markup, so we must also split
2011 # by just newlines.
2012 subglosses = rawgloss.splitlines()
2014 if len(subglosses) == 0: 2014 ↛ 2015line 2014 didn't jump to line 2015 because the condition on line 2014 was never true
2015 return False
2017 if any(s.startswith("#") for s in subglosses):
2018 subtree = wxr.wtp.parse(rawgloss)
2019 # from wikitextprocessor.parser import print_tree
2020 # print("SUBTREE GENERATED BY TEMPLATE:")
2021 # print_tree(subtree)
2022 new_subentries = [
2023 x
2024 for x in subtree.children
2025 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2026 ]
2028 new_others = [
2029 x
2030 for x in subtree.children
2031 if isinstance(x, WikiNode)
2032 and x.kind == NodeKind.LIST
2033 and not x.sarg.endswith("#")
2034 ]
2036 new_contents = [
2037 clean_node(wxr, [], x)
2038 for x in subtree.children
2039 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2040 ]
2042 subentries = subentries or new_subentries
2043 others = others or new_others
2044 subglosses = new_contents
2045 rawgloss = "".join(subglosses)
2046 # Generate no gloss for translation hub pages, but add the
2047 # "translation-hub" tag for them
2048 if rawgloss == "(This entry is a translation hub.)": 2048 ↛ 2049line 2048 didn't jump to line 2049 because the condition on line 2048 was never true
2049 data_append(sense_data, "tags", "translation-hub")
2050 return push_sense()
2052 # Remove certain substrings specific to outer glosses
2053 strip_ends = [", particularly:"]
2054 for x in strip_ends:
2055 if rawgloss.endswith(x):
2056 rawgloss = rawgloss[: -len(x)].strip()
2057 break
2059 # A single gloss, or possibly an outer gloss.
2060 # Check if the possible outer gloss starts with
2061 # parenthesized tags/topics
2063 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2064 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2065 m = QUALIFIERS_RE.match(rawgloss)
2066 # (...): ... or (...(...)...): ...
2067 if m:
2068 q = m.group(1)
2069 rawgloss = rawgloss[m.end() :].strip()
2070 parse_sense_qualifier(wxr, q, sense_base)
2071 if rawgloss == "A pejorative:": 2071 ↛ 2072line 2071 didn't jump to line 2072 because the condition on line 2071 was never true
2072 data_append(sense_base, "tags", "pejorative")
2073 rawgloss = ""
2074 elif rawgloss == "Short forms.": 2074 ↛ 2075line 2074 didn't jump to line 2075 because the condition on line 2074 was never true
2075 data_append(sense_base, "tags", "abbreviation")
2076 rawgloss = ""
2077 elif rawgloss == "Technical or specialized senses.": 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true
2078 rawgloss = ""
2079 elif rawgloss.startswith("inflection of "):
2080 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2081 if parsed is not None: 2081 ↛ 2090line 2081 didn't jump to line 2090 because the condition on line 2081 was always true
2082 tags, origins = parsed
2083 if origins is not None: 2083 ↛ 2085line 2083 didn't jump to line 2085 because the condition on line 2083 was always true
2084 data_extend(sense_base, "form_of", origins)
2085 if tags is not None: 2085 ↛ 2088line 2085 didn't jump to line 2088 because the condition on line 2085 was always true
2086 data_extend(sense_base, "tags", tags)
2087 else:
2088 data_append(sense_base, "tags", "form-of")
2089 else:
2090 data_append(sense_base, "tags", "form-of")
2091 if rawgloss: 2091 ↛ 2122line 2091 didn't jump to line 2122 because the condition on line 2091 was always true
2092 # Code duplicating a lot of clean-up operations from later in
2093 # this block. We want to clean up the "supergloss" as much as
2094 # possible, in almost the same way as a normal gloss.
2095 supergloss = rawgloss
2097 if supergloss.startswith("; "): 2097 ↛ 2098line 2097 didn't jump to line 2098 because the condition on line 2097 was never true
2098 supergloss = supergloss[1:].strip()
2100 if supergloss.startswith(("^†", "†")):
2101 data_append(sense_base, "tags", "obsolete")
2102 supergloss = supergloss[2:].strip()
2103 elif supergloss.startswith("^‡"): 2103 ↛ 2104line 2103 didn't jump to line 2104 because the condition on line 2103 was never true
2104 data_extend(sense_base, "tags", ["obsolete", "historical"])
2105 supergloss = supergloss[2:].strip()
2107 # remove [14th century...] style brackets at the end
2108 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2110 if supergloss.startswith((",", ":")):
2111 supergloss = supergloss[1:]
2112 supergloss = supergloss.strip()
2113 if supergloss.startswith("N. of "): 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true
2114 supergloss = "Name of " + supergloss[6:]
2115 supergloss = supergloss[2:]
2116 data_append(sense_base, "glosses", supergloss)
2117 if supergloss in ("A person:",):
2118 data_append(sense_base, "tags", "g-person")
2120 # The main recursive call (except for the exceptions at the
2121 # start of this function).
2122 for sublist in subentries:
2123 if not ( 2123 ↛ 2126line 2123 didn't jump to line 2126 because the condition on line 2123 was never true
2124 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2125 ):
2126 wxr.wtp.debug(
2127 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2128 f"with items that are not LISTs",
2129 sortid="page/1511/20230119",
2130 )
2131 continue
2132 for item in sublist.children:
2133 if not ( 2133 ↛ 2137line 2133 didn't jump to line 2137 because the condition on line 2133 was never true
2134 isinstance(item, WikiNode)
2135 and item.kind == NodeKind.LIST_ITEM
2136 ):
2137 continue
2138 # copy sense_base to prevent cross-contamination between
2139 # subglosses, and between subglosses and superglosses
2140 sense_base2 = copy.deepcopy(sense_base)
2141 if parse_sense_node(item, sense_base2, pos): 2141 ↛ 2132line 2141 didn't jump to line 2132 because the condition on line 2141 was always true
2142 added = True
2144 # Capture examples.
2145 # This is called after the recursive calls above so that
2146 # sense_base is not contaminated with meta-data from
2147 # example entries for *this* gloss.
2148 examples = []
2149 if wxr.config.capture_examples: 2149 ↛ 2153line 2149 didn't jump to line 2153 because the condition on line 2149 was always true
2150 examples = extract_examples(others, sense_base)
2152 # push_sense() succeeded somewhere down-river, so skip this level
2153 if added:
2154 if examples:
2155 # this higher-up gloss has examples that we do not want to skip
2156 wxr.wtp.debug(
2157 "'{}[...]' gloss has examples we want to keep, "
2158 "but there are subglosses.".format(repr(rawgloss[:30])),
2159 sortid="page/1498/20230118",
2160 )
2161 else:
2162 return True
2164 # Some entries, e.g., "iacebam", have weird sentences in quotes
2165 # after the gloss, but these sentences don't seem to be intended
2166 # as glosses. Skip them.
2167 indexed_subglosses = list(
2168 (i, gl)
2169 for i, gl in enumerate(subglosses)
2170 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2171 )
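# A minimal sketch of what the filter above drops (assumed lines, in the
# spirit of the "iacebam" example):
#   (intransitive) "I was lying down."   -> matches the quoted-sentence
#                                           regex and is skipped
#   to lie, to be recumbent              -> kept in indexed_subglosses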
2173 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2173 ↛ 2174line 2173 didn't jump to line 2174 because the condition on line 2173 was never true
2174 gl = indexed_subglosses[0][1].strip()
2175 if gl.endswith(":"):
2176 gl = gl[:-1].strip()
2177 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2178 if parsed is not None:
2179 infl_tags, infl_dts = parsed
2180 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2181 # Interpret others as a particular form under
2182 # "inflection of"
2183 data_extend(sense_base, "tags", infl_tags)
2184 data_extend(sense_base, "form_of", infl_dts)
2185 indexed_subglosses = indexed_subglosses[1:]
2186 elif not infl_dts:
2187 data_extend(sense_base, "tags", infl_tags)
2188 indexed_subglosses = indexed_subglosses[1:]
2190 # Create senses for remaining subglosses
2191 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2192 gloss = gloss.strip()
2193 if not gloss and len(indexed_subglosses) > 1: 2193 ↛ 2194line 2193 didn't jump to line 2194 because the condition on line 2193 was never true
2194 continue
2195 # Push a new sense (if the last one is not empty)
2196 if push_sense(): 2196 ↛ 2197line 2196 didn't jump to line 2197 because the condition on line 2196 was never true
2197 added = True
2198 # if gloss not in sense_data.get("raw_glosses", ()):
2199 # data_append(sense_data, "raw_glosses", gloss)
2200 if i == 0 and examples:
2201 # In a multi-line gloss, associate examples
2202 # with only one of them.
2203 # XXX or you could use gloss_i == len(indexed_subglosses)
2204 # to associate examples with the *last* one.
2205 data_extend(sense_data, "examples", examples)
2206 if gloss.startswith("; ") and gloss_i > 0: 2206 ↛ 2207line 2206 didn't jump to line 2207 because the condition on line 2206 was never true
2207 gloss = gloss[1:].strip()
2208 # If the gloss starts with †, mark as obsolete
2209 if gloss.startswith("^†"): 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true
2210 data_append(sense_data, "tags", "obsolete")
2211 gloss = gloss[2:].strip()
2212 elif gloss.startswith("^‡"): 2212 ↛ 2213line 2212 didn't jump to line 2213 because the condition on line 2212 was never true
2213 data_extend(sense_data, "tags", ["obsolete", "historical"])
2214 gloss = gloss[2:].strip()
2215 # Copy data for all senses to this sense
2216 for k, v in sense_base.items():
2217 if isinstance(v, (list, tuple)):
2218 if k != "tags":
2219 # Tags handled below (countable/uncountable special)
2220 data_extend(sense_data, k, v)
2221 else:
2222 assert k not in ("tags", "categories", "topics")
2223 sense_data[k] = v # type:ignore[literal-required]
2224 # Parse the gloss for this particular sense
2225 m = QUALIFIERS_RE.match(gloss)
2226 # (...): ... or (...(...)...): ...
2227 if m:
2228 parse_sense_qualifier(wxr, m.group(1), sense_data)
2229 gloss = gloss[m.end() :].strip()
2231 # Remove common suffix "[from 14th c.]" and similar
2232 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2234 # Check to make sure we don't have unhandled list items in gloss
2235 ofs = max(gloss.find("#"), gloss.find("* "))
2236 if ofs > 10 and "(#)" not in gloss:
2237 wxr.wtp.debug(
2238 "gloss may contain unhandled list items: {}".format(gloss),
2239 sortid="page/1412",
2240 )
2241 elif "\n" in gloss: 2241 ↛ 2242line 2241 didn't jump to line 2242 because the condition on line 2241 was never true
2242 wxr.wtp.debug(
2243 "gloss contains newline: {}".format(gloss),
2244 sortid="page/1416",
2245 )
2247 # Kludge, some glosses have a comma after initial qualifiers in
2248 # parentheses
2249 if gloss.startswith((",", ":")):
2250 gloss = gloss[1:]
2251 gloss = gloss.strip()
2252 if gloss.endswith(":"):
2253 gloss = gloss[:-1].strip()
2254 if gloss.startswith("N. of "): 2254 ↛ 2255line 2254 didn't jump to line 2255 because the condition on line 2254 was never true
2255 gloss = "Name of " + gloss[6:]
2256 if gloss.startswith("†"): 2256 ↛ 2257line 2256 didn't jump to line 2257 because the condition on line 2256 was never true
2257 data_append(sense_data, "tags", "obsolete")
2258 gloss = gloss[1:]
2259 elif gloss.startswith("^†"): 2259 ↛ 2260line 2259 didn't jump to line 2260 because the condition on line 2259 was never true
2260 data_append(sense_data, "tags", "obsolete")
2261 gloss = gloss[2:]
2263 # Copy tags from sense_base if any. This will not copy
2264 # countable/uncountable if either was specified in the sense,
2265 # as sometimes both are specified in word head but only one
2266 # in individual senses.
2267 countability_tags = []
2268 base_tags = sense_base.get("tags", ())
2269 sense_tags = sense_data.get("tags", ())
2270 for tag in base_tags:
2271 if tag in ("countable", "uncountable"):
2272 if tag not in countability_tags: 2272 ↛ 2274line 2272 didn't jump to line 2274 because the condition on line 2272 was always true
2273 countability_tags.append(tag)
2274 continue
2275 if tag not in sense_tags:
2276 data_append(sense_data, "tags", tag)
2277 if countability_tags:
2278 if ( 2278 ↛ 2287line 2278 didn't jump to line 2287 because the condition on line 2278 was always true
2279 "countable" not in sense_tags
2280 and "uncountable" not in sense_tags
2281 ):
2282 data_extend(sense_data, "tags", countability_tags)
2284 # If outer gloss specifies a form-of ("inflection of", see
2285 # aquamarine/German), try to parse the inner glosses as
2286 # tags for an inflected form.
2287 if "form-of" in sense_base.get("tags", ()):
2288 parsed = parse_alt_or_inflection_of(
2289 wxr, gloss, gloss_template_args
2290 )
2291 if parsed is not None: 2291 ↛ 2297line 2291 didn't jump to line 2297 because the condition on line 2291 was always true
2292 infl_tags, infl_dts = parsed
2293 if not infl_dts and infl_tags: 2293 ↛ 2297line 2293 didn't jump to line 2297 because the condition on line 2293 was always true
2294 # Interpret as a particular form under "inflection of"
2295 data_extend(sense_data, "tags", infl_tags)
2297 if not gloss: 2297 ↛ 2298line 2297 didn't jump to line 2298 because the condition on line 2297 was never true
2298 data_append(sense_data, "tags", "empty-gloss")
2299 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2300 if ( 2300 ↛ 2311line 2300 didn't jump to line 2311 because the condition on line 2300 was always true
2301 gloss_i == 0
2302 and len(sense_data.get("glosses", tuple())) >= 1
2303 ):
2304 # If we added a "high-level gloss" from rawgloss, but this
2305 # is that same gloss_i, add this instead of the raw_gloss
2306 # from before if they're different: the rawgloss was not
2307 # cleaned exactly the same as this later gloss
2308 sense_data["glosses"][-1] = gloss
2309 else:
2310 # Add the gloss for the sense.
2311 data_append(sense_data, "glosses", gloss)
2313 # Kludge: there are cases (e.g., etc./Swedish) where there are
2314 # two abbreviations in the same sense, both generated by the
2315 # {{abbreviation of|...}} template. Handle these with some magic.
2316 position = 0
2317 split_glosses = []
2318 for m in re.finditer(r"Abbreviation of ", gloss):
2319 if m.start() != position: 2319 ↛ 2318line 2319 didn't jump to line 2318 because the condition on line 2319 was always true
2320 split_glosses.append(gloss[position : m.start()])
2321 position = m.start()
2322 split_glosses.append(gloss[position:])
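# A minimal sketch of the split above, assuming a gloss built from two
# {{abbreviation of|...}} templates (hypothetical targets foo and bar):
#   "Abbreviation of foo. Abbreviation of bar."
# becomes ["Abbreviation of foo. ", "Abbreviation of bar."], and each
# part is then checked separately for alt-of/form-of below.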
2323 for gloss in split_glosses:
2324 # Check if this gloss describes an alt-of or inflection-of
2325 if (
2326 lang_code != "en"
2327 and " " not in gloss
2328 and distw([word], gloss) < 0.3
2329 ):
2330 # Don't try to parse gloss if it is one word
2331 # that is close to the word itself for non-English words
2332 # (probable translations of a tag/form name)
2333 continue
2334 parsed = parse_alt_or_inflection_of(
2335 wxr, gloss, gloss_template_args
2336 )
2337 if parsed is None:
2338 continue
2339 tags, dts = parsed
2340 if not dts and tags:
2341 data_extend(sense_data, "tags", tags)
2342 continue
2343 for dt in dts: # type:ignore[union-attr]
2344 ftags = list(tag for tag in tags if tag != "form-of")
2345 if "alt-of" in tags:
2346 data_extend(sense_data, "tags", ftags)
2347 data_append(sense_data, "alt_of", dt)
2348 elif "compound-of" in tags: 2348 ↛ 2349line 2348 didn't jump to line 2349 because the condition on line 2348 was never true
2349 data_extend(sense_data, "tags", ftags)
2350 data_append(sense_data, "compound_of", dt)
2351 elif "synonym-of" in tags: 2351 ↛ 2352line 2351 didn't jump to line 2352 because the condition on line 2351 was never true
2352 data_extend(dt, "tags", ftags)
2353 data_append(sense_data, "synonyms", dt)
2354 elif tags and dt.get("word", "").startswith("of "): 2354 ↛ 2355line 2354 didn't jump to line 2355 because the condition on line 2354 was never true
2355 dt["word"] = dt["word"][3:]
2356 data_append(sense_data, "tags", "form-of")
2357 data_extend(sense_data, "tags", ftags)
2358 data_append(sense_data, "form_of", dt)
2359 elif "form-of" in tags: 2359 ↛ 2343line 2359 didn't jump to line 2343 because the condition on line 2359 was always true
2360 data_extend(sense_data, "tags", tags)
2361 data_append(sense_data, "form_of", dt)
2363 if len(sense_data) == 0:
2364 if len(sense_base.get("tags", [])) == 0: 2364 ↛ 2366line 2364 didn't jump to line 2366 because the condition on line 2364 was always true
2365 del sense_base["tags"]
2366 sense_data.update(sense_base)
2367 if push_sense(): 2367 ↛ 2371line 2367 didn't jump to line 2371 because the condition on line 2367 was always true
2368 # push_sense succeeded in adding a sense to pos_data
2369 added = True
2370 # print("PARSE_SENSE DONE:", pos_datas[-1])
2371 return added
2373 def parse_inflection(
2374 node: WikiNode, section: str, pos: Optional[str]
2375 ) -> None:
2376 """Parses inflection data (declension, conjugation) from the given
2377 page. This retrieves the actual inflection template
2378 parameters, which are very useful for applications that need
2379 to learn the inflection classes and generate inflected
2380 forms."""
2381 assert isinstance(node, WikiNode)
2382 assert isinstance(section, str)
2383 assert pos is None or isinstance(pos, str)
2384 # print("parse_inflection:", node)
2386 if pos is None: 2386 ↛ 2387line 2386 didn't jump to line 2387 because the condition on line 2386 was never true
2387 wxr.wtp.debug(
2388 "inflection table outside part-of-speech", sortid="page/1812"
2389 )
2390 return
2392 def inflection_template_fn(
2393 name: str, ht: TemplateArgs
2394 ) -> Optional[str]:
2395 # print("decl_conj_template_fn", name, ht)
2396 if is_panel_template(wxr, name): 2396 ↛ 2397line 2396 didn't jump to line 2397 because the condition on line 2396 was never true
2397 return ""
2398 if name in ("is-u-mutation",): 2398 ↛ 2401line 2398 didn't jump to line 2401 because the condition on line 2398 was never true
2399 # These are not to be captured as an exception to the
2400 # generic code below
2401 return None
2402 m = re.search(
2403 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2404 r"declension|inflection|mut|mutation)($|-)",
2405 name,
2406 )
2407 if m:
2408 args_ht = clean_template_args(wxr, ht)
2409 dt = {"name": name, "args": args_ht}
2410 data_append(pos_data, "inflection_templates", dt)
2412 return None
2414 # Convert the subtree back to Wikitext, then expand all and parse,
2415 # capturing templates in the process
2416 text = wxr.wtp.node_to_wikitext(node.children)
2418 # Split text into separate sections for each top-level template
2419 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2420 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2421 # The (?:...) creates a non-capturing regex group; if it were capturing,
2422 # like the group around it, it would create elements in brace_matches,
2423 # including None if it doesn't match.
2424 # 20250114: Added {| and |} into the regex because tables were being
2425 # cut into pieces by this code. Issue #973, introduction of two-part
2426 # book-end templates similar to trans-top and trans-bottom.
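# A minimal sketch of the resulting split, assuming a section with one
# template followed by one bare table (parameters are made up):
#   "{{fi-decl-valo|kis|s|a|a}}\n{| class=...\n|}"
# re.split() keeps the captured delimiters, so brace_matches is roughly
#   ["", "{{", "fi-decl-valo|kis|s|a|a", "}}", "", "\n{|",
#    " class=...", "\n|}", ""]
# and the loop below regroups these pieces into template_sections.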
2427 template_sections = []
2428 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2429 # Because there is the possibility of triple curly braces
2430 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2431 # count nesting depth using pairs of two brackets, but
2432 # instead use singular braces ("{ }").
2433 # Because template delimiters should be balanced, regardless
2434 # of whether {{ or {{{ is used, and because we only care
2435 # about the outer-most delimiters (the highest level template)
2436 # we can just count the single braces when those single
2437 # braces are part of a group.
2438 table_nesting = 0
2439 # However, a stray table ({| ... |}) should always
2440 # be its own section, and should prevent templates from cutting it
2441 # into sections.
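# A minimal sketch of the grouping below, assuming two top-level
# templates in the section text (names and arguments are illustrative):
#   "{{fi-conj-valita|vali|d}} notes {{qualifier|rare}}"
# nesting returns to zero after each "}}" run, so template_sections
# ends up with two groups, one per outer template, and the stray
# " notes " text stays attached to the preceding template's group.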
2443 # print(f"Parse inflection: {text=}")
2444 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2445 if len(brace_matches) > 1:
2446 tsection: list[str] = []
2447 after_templates = False # kludge to keep any text
2448 # before first template
2449 # with the first template;
2450 # otherwise, text
2451 # goes with preceding template
2452 for m in brace_matches:
2453 if m.startswith("\n; ") and after_templates: 2453 ↛ 2454line 2453 didn't jump to line 2454 because the condition on line 2453 was never true
2454 after_templates = False
2455 template_sections.append(tsection)
2456 tsection = []
2457 tsection.append(m)
2458 elif m.startswith("{{") or m.endswith("{|"):
2459 if (
2460 template_nesting == 0
2461 and after_templates
2462 and table_nesting == 0
2463 ):
2464 template_sections.append(tsection)
2465 tsection = []
2466 # start new section
2467 after_templates = True
2468 if m.startswith("{{"):
2469 template_nesting += 1
2470 else:
2471 # m.endswith("{|")
2472 table_nesting += 1
2473 tsection.append(m)
2474 elif m.startswith("}}") or m.endswith("|}"):
2475 if m.startswith("}}"):
2476 template_nesting -= 1
2477 if template_nesting < 0: 2477 ↛ 2478line 2477 didn't jump to line 2478 because the condition on line 2477 was never true
2478 wxr.wtp.error(
2479 "Negatively nested braces, "
2480 "couldn't split inflection templates, "
2481 "{}/{} section {}".format(
2482 word, language, section
2483 ),
2484 sortid="page/1871",
2485 )
2486 template_sections = [] # use whole text
2487 break
2488 else:
2489 table_nesting -= 1
2490 if table_nesting < 0: 2490 ↛ 2491line 2490 didn't jump to line 2491 because the condition on line 2490 was never true
2491 wxr.wtp.error(
2492 "Negatively nested table braces, "
2493 "couldn't split inflection section, "
2494 "{}/{} section {}".format(
2495 word, language, section
2496 ),
2497 sortid="page/20250114",
2498 )
2499 template_sections = [] # use whole text
2500 break
2501 tsection.append(m)
2502 else:
2503 tsection.append(m)
2504 if tsection: # dangling tsection 2504 ↛ 2512line 2504 didn't jump to line 2512 because the condition on line 2504 was always true
2505 template_sections.append(tsection)
2506 # Why do it this way around? The parser has a preference
2507 # to associate bits outside of tables with the preceding
2508 # table (the `after_templates` variable), so a new tsection begins
2509 # at {{ and everything before it belongs to the previous
2510 # template.
2512 texts = []
2513 if not template_sections:
2514 texts = [text]
2515 else:
2516 for tsection in template_sections:
2517 texts.append("".join(tsection))
2518 if template_nesting != 0: 2518 ↛ 2519line 2518 didn't jump to line 2519 because the condition on line 2518 was never true
2519 wxr.wtp.error(
2520 "Template nesting error: "
2521 "template_nesting = {} "
2522 "couldn't split inflection templates, "
2523 "{}/{} section {}".format(
2524 template_nesting, word, language, section
2525 ),
2526 sortid="page/1896",
2527 )
2528 texts = [text]
2529 for text in texts:
2530 tree = wxr.wtp.parse(
2531 text, expand_all=True, template_fn=inflection_template_fn
2532 )
2534 if not text.strip():
2535 continue
2537 # Parse inflection tables from the section. The data is stored
2538 # under "forms".
2539 if wxr.config.capture_inflections: 2539 ↛ 2529line 2539 didn't jump to line 2529 because the condition on line 2539 was always true
2540 tablecontext = None
2541 m = re.search(r"{{([^}{|]+)\|?", text)
2542 if m:
2543 template_name = m.group(1)
2544 tablecontext = TableContext(template_name)
2546 parse_inflection_section(
2547 wxr,
2548 pos_data,
2549 word,
2550 language,
2551 pos,
2552 section,
2553 tree,
2554 tablecontext=tablecontext,
2555 )
2557 def get_subpage_section(
2558 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2559 ) -> Optional[Union[WikiNode, str]]:
2560 """Loads a subpage of the given page, and finds the section
2561 for the given language, part-of-speech, and section title. This
2562 is used for finding translations and other sections on subpages."""
2563 assert isinstance(language, str)
2564 assert isinstance(title, str)
2565 assert isinstance(subtitle, str)
2566 assert isinstance(seqs, (list, tuple))
2567 for seq in seqs:
2568 for x in seq:
2569 assert isinstance(x, str)
2570 subpage_title = word + "/" + subtitle
2571 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2572 if subpage_content is None:
2573 wxr.wtp.error(
2574 "/translations not found despite "
2575 "{{see translation subpage|...}}",
2576 sortid="page/1934",
2577 )
2578 return None
2580 def recurse(
2581 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2582 ) -> Optional[Union[str, WikiNode]]:
2583 # print(f"seq: {seq}")
2584 if not seq:
2585 return node
2586 if not isinstance(node, WikiNode):
2587 return None
2588 # print(f"node.kind: {node.kind}")
2589 if node.kind in LEVEL_KINDS:
2590 t = clean_node(wxr, None, node.largs[0])
2591 # print(f"t: {t} == seq[0]: {seq[0]}?")
2592 if t.lower() == seq[0].lower():
2593 seq = seq[1:]
2594 if not seq:
2595 return node
2596 for n in node.children:
2597 ret = recurse(n, seq)
2598 if ret is not None:
2599 return ret
2600 return None
2602 tree = wxr.wtp.parse(
2603 subpage_content,
2604 pre_expand=True,
2605 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2606 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2607 )
2608 assert tree.kind == NodeKind.ROOT
2609 for seq in seqs:
2610 ret = recurse(tree, seq)
2611 if ret is None:
2612 wxr.wtp.debug(
2613 "Failed to find subpage section {}/{} seq {}".format(
2614 title, subtitle, seq
2615 ),
2616 sortid="page/1963",
2617 )
2618 return ret
2620 def parse_linkage(
2621 data: WordData, field: str, linkagenode: WikiNode
2622 ) -> None:
2623 assert isinstance(data, dict)
2624 assert isinstance(field, str)
2625 assert isinstance(linkagenode, WikiNode)
2626 # if field == "synonyms":
2627 # print("field", field)
2628 # print("data", data)
2629 # print("children:")
2630 # print(linkagenode.children)
2631 if not wxr.config.capture_linkages: 2631 ↛ 2632line 2631 didn't jump to line 2632 because the condition on line 2631 was never true
2632 return
2633 have_panel_template = False
2634 toplevel_text = []
2635 next_navframe_sense = None # Used for "(sense):" before NavFrame
2637 def parse_linkage_item(
2638 contents: list[Union[str, WikiNode]],
2639 field: str,
2640 sense: Optional[str] = None,
2641 ):
2642 assert isinstance(contents, (list, tuple))
2643 assert isinstance(field, str)
2644 assert sense is None or isinstance(sense, str)
2646 # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
2647 # .format(field, sense, contents))
2649 parts: list[str] = []
2650 ruby: list[tuple[str, str]] = []
2651 urls: list[str] = []
2652 # data about link text; this is used to skip splitting on
2653 # linkage text items that contain stuff like commas; for
2654 # example "Hunde, die bellen, beißen nicht" in article
2655 # beißen is split into "Hunde", "die bellen" etc.
2656 # We take that link text and use it, eventually,
2657 # in split_at_comma_semi to skip splitting on those
2658 # commas.
2659 links_that_should_not_be_split: list[str] = []
2661 def item_recurse(
2662 contents: list[Union[str, WikiNode]], italic=False
2663 ) -> None:
2664 assert isinstance(contents, (list, tuple))
2665 nonlocal sense
2666 nonlocal ruby
2667 nonlocal parts
2668 # print("ITEM_RECURSE:", contents)
2669 for node in contents:
2670 if isinstance(node, str):
2671 parts.append(node)
2672 continue
2673 kind = node.kind
2674 # print("ITEM_RECURSE KIND:", kind,
2675 # node.sarg if node.sarg else node.largs)
2676 if kind == NodeKind.LIST:
2677 if parts: 2677 ↛ 2692line 2677 didn't jump to line 2692 because the condition on line 2677 was always true
2678 sense1: Optional[str]
2679 sense1 = clean_node(wxr, None, parts)
2680 if sense1.endswith(":"):
2681 sense1 = sense1[:-1].strip()
2682 if sense1.startswith("(") and sense1.endswith(")"): 2682 ↛ 2683line 2682 didn't jump to line 2683 because the condition on line 2682 was never true
2683 sense1 = sense1[1:-1].strip()
2684 if sense1.lower() == TRANSLATIONS_TITLE: 2684 ↛ 2685line 2684 didn't jump to line 2685 because the condition on line 2684 was never true
2685 sense1 = None
2686 # print("linkage item_recurse LIST sense1:", sense1)
2687 parse_linkage_recurse(
2688 node.children, field, sense=sense1 or sense
2689 )
2690 parts = []
2691 else:
2692 parse_linkage_recurse(node.children, field, sense)
2693 elif kind in ( 2693 ↛ 2698line 2693 didn't jump to line 2698 because the condition on line 2693 was never true
2694 NodeKind.TABLE,
2695 NodeKind.TABLE_ROW,
2696 NodeKind.TABLE_CELL,
2697 ):
2698 parse_linkage_recurse(node.children, field, sense)
2699 elif kind in ( 2699 ↛ 2703line 2699 didn't jump to line 2703 because the condition on line 2699 was never true
2700 NodeKind.TABLE_HEADER_CELL,
2701 NodeKind.TABLE_CAPTION,
2702 ):
2703 continue
2704 elif kind == NodeKind.HTML: 2704 ↛ 2705line 2704 didn't jump to line 2705 because the condition on line 2704 was never true
2705 classes = (node.attrs.get("class") or "").split()
2706 if node.sarg in ("gallery", "ref", "cite", "caption"):
2707 continue
2708 elif node.sarg == "ruby":
2709 rb = parse_ruby(wxr, node)
2710 if rb:
2711 ruby.append(rb)
2712 parts.append(rb[0])
2713 continue
2714 elif node.sarg == "math":
2715 parts.append(clean_node(wxr, None, node))
2716 continue
2717 elif "interProject" in classes:
2718 continue # These do not seem to be displayed
2719 if "NavFrame" in classes:
2720 parse_linkage_recurse(node.children, field, sense)
2721 else:
2722 item_recurse(node.children, italic=italic)
2723 elif kind == NodeKind.ITALIC:
2724 item_recurse(node.children, italic=True)
2725 elif kind == NodeKind.LINK:
2726 ignore = False
2727 if isinstance(node.largs[0][0], str): 2727 ↛ 2669line 2727 didn't jump to line 2669 because the condition on line 2727 was always true
2728 v1 = node.largs[0][0].strip().lower()
2729 if v1.startswith( 2729 ↛ 2733line 2729 didn't jump to line 2733 because the condition on line 2729 was never true
2730 ns_title_prefix_tuple(wxr, "Category", True)
2731 + ns_title_prefix_tuple(wxr, "File", True)
2732 ):
2733 ignore = True
2734 if not ignore: 2734 ↛ 2669line 2734 didn't jump to line 2669 because the condition on line 2734 was always true
2735 v = node.largs[-1]
2736 if (
2737 len(node.largs) == 1
2738 and len(v) > 0
2739 and isinstance(v[0], str)
2740 and v[0][0] == ":"
2741 ):
2742 v = [v[0][1:]] + list(v[1:]) # type:ignore
2743 if isinstance(v[0], str) and not v[0].isalnum():
2744 links_that_should_not_be_split.append(
2745 "".join(v[0])
2746 ) # type: ignore
2747 item_recurse(v, italic=italic)
2748 elif kind == NodeKind.URL:
2749 if len(node.largs) < 2 and node.largs:
2750 # Naked url captured
2751 urls.extend(node.largs[-1]) # type:ignore[arg-type]
2752 continue
2753 if len(node.largs) == 2: 2753 ↛ 2758line 2753 didn't jump to line 2758 because the condition on line 2753 was always true
2754 # Url from link with text
2755 urls.append(node.largs[0][-1]) # type:ignore[arg-type]
2756 # print(f"{node.largs=!r}")
2757 # print("linkage recurse URL {}".format(node))
2758 item_recurse(node.largs[-1], italic=italic)
2759 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 2759 ↛ 2762line 2759 didn't jump to line 2762 because the condition on line 2759 was always true
2760 item_recurse(node.children, italic=italic)
2761 else:
2762 wxr.wtp.debug(
2763 "linkage item_recurse unhandled {}: {}".format(
2764 node.kind, node
2765 ),
2766 sortid="page/2073",
2767 )
2769 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
2770 # .format(contents))
2772 item_recurse(contents)
2773 item = clean_node(wxr, None, parts)
2774 # print("LINKAGE ITEM CONTENTS:", parts)
2775 # print("CLEANED ITEM: {!r}".format(item))
2776 # print(f"URLS {urls=!r}")
2778 return parse_linkage_item_text(
2779 wxr,
2780 word,
2781 data,
2782 field,
2783 item,
2784 sense,
2785 ruby,
2786 pos_datas,
2787 is_reconstruction,
2788 urls or None,
2789 links_that_should_not_be_split or None,
2790 )
2792 def parse_linkage_recurse(
2793 contents: list[Union[WikiNode, str]],
2794 field: str,
2795 sense: Optional[str],
2796 ) -> None:
2797 assert isinstance(contents, (list, tuple))
2798 assert sense is None or isinstance(sense, str)
2799 nonlocal next_navframe_sense
2800 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
2801 for node in contents:
2802 if isinstance(node, str):
2803 # Ignore top-level text, generally comments before the
2804 # linkages list. However, if no linkages are found, then
2805 # use this for linkages (not all words use bullet points
2806 # for linkages).
2807 toplevel_text.append(node)
2808 continue
2809 assert isinstance(node, WikiNode)
2810 kind = node.kind
2811 # print("PARSE_LINKAGE_RECURSE CHILD", kind)
2812 if kind == NodeKind.LIST:
2813 parse_linkage_recurse(node.children, field, sense)
2814 elif kind == NodeKind.LIST_ITEM:
2815 v = parse_linkage_item(node.children, field, sense)
2816 if v: 2816 ↛ 2820line 2816 didn't jump to line 2820 because the condition on line 2816 was never true
2817 # parse_linkage_item() can return a value that should
2818 # be used as the sense for the follow-on linkages,
2819 # which are typically provided in a table (see 滿)
2820 next_navframe_sense = v
2821 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
2822 parse_linkage_recurse(node.children, field, sense)
2823 elif kind == NodeKind.TABLE_CELL:
2824 parse_linkage_item(node.children, field, sense)
2825 elif kind in (
2826 NodeKind.TABLE_CAPTION,
2827 NodeKind.TABLE_HEADER_CELL,
2828 NodeKind.PREFORMATTED,
2829 NodeKind.BOLD,
2830 ):
2831 continue
2832 elif kind == NodeKind.HTML: 2832 ↛ 2834line 2832 didn't jump to line 2834 because the condition on line 2832 was never true
2833 # Recurse to process inside the HTML for most tags
2834 if node.sarg in ("gallery", "ref", "cite", "caption"):
2835 continue
2836 classes = (node.attrs.get("class") or "").split()
2837 if node.sarg == "li":
2838 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
2839 v = parse_linkage_item(node.children, field, sense)
2840 if v:
2841 next_navframe_sense = v
2842 elif "qualifier-content" in classes:
2843 sense1 = clean_node(wxr, None, node.children)
2844 if sense1.endswith(":"):
2845 sense1 = sense1[:-1].strip()
2846 if sense and sense1:
2847 wxr.wtp.debug(
2848 "linkage qualifier-content on multiple "
2849 "levels: {!r} and {!r}".format(sense, sense1),
2850 sortid="page/2170",
2851 )
2852 parse_linkage_recurse(node.children, field, sense1)
2853 elif "NavFrame" in classes:
2854 # NavFrame uses previously assigned next_navframe_sense
2855 # (from a "(sense):" item) and clears it afterwards
2856 parse_linkage_recurse(
2857 node.children, field, sense or next_navframe_sense
2858 )
2859 next_navframe_sense = None
2860 else:
2861 parse_linkage_recurse(node.children, field, sense)
2862 elif kind in LEVEL_KINDS: 2862 ↛ 2864line 2862 didn't jump to line 2864 because the condition on line 2862 was never true
2863 # Just recurse to any possible subsections
2864 parse_linkage_recurse(node.children, field, sense)
2865 elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
2866 # Skip these on top level; at least sometimes bold is
2867 # used for indicating a subtitle
2868 continue
2869 elif kind == NodeKind.LINK: 2869 ↛ 2875line 2869 didn't jump to line 2875 because the condition on line 2869 was always true
2870 # Recurse into the last argument
2871 # Apparently ":/" is used as a link to "/", so strip
2872 # initial value
2873 parse_linkage_recurse(node.largs[-1], field, sense)
2874 else:
2875 wxr.wtp.debug(
2876 "parse_linkage_recurse unhandled {}: {}".format(
2877 kind, node
2878 ),
2879 sortid="page/2196",
2880 )
2882 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
2883 nonlocal have_panel_template
2884 if is_panel_template(wxr, name):
2885 have_panel_template = True
2886 return ""
2887 return None
2889 def parse_zh_synonyms(
2890 parsed: list[Union[WikiNode, str]],
2891 data: list[LinkageData],
2892 hdrs: list[str],
2893 root_word: str,
2894 ) -> None:
2895 """Parses Chinese dialectal synonyms tables"""
2896 for item in parsed:
2897 if isinstance(item, WikiNode):
2898 if item.kind == NodeKind.TABLE_ROW: 2898 ↛ 2899line 2898 didn't jump to line 2899 because the condition on line 2898 was never true
2899 cleaned = clean_node(wxr, None, item.children)
2900 # print("cleaned:", repr(cleaned))
2901 if any(
2902 [
2903 "Variety" in cleaned,
2904 "Location" in cleaned,
2905 "Words" in cleaned,
2906 ]
2907 ):
2908 pass
2909 else:
2910 split = cleaned.split("\n")
2911 new_hdrs = split[:-1]
2912 if len(new_hdrs) == 2:
2913 hdrs = [new_hdrs[0]]
2914 new_hdrs.pop(0)
2915 combined_hdrs = [x.strip() for x in hdrs + new_hdrs]
2916 tags = []
2917 words = split[-1].split(",")
2918 for hdr in combined_hdrs:
2919 hdr = hdr.replace("(", ",")
2920 hdr = hdr.replace(")", "")
2921 hdr = hdr.replace("N.", "Northern,")
2922 hdr = hdr.replace("S.", "Southern,")
2923 new = hdr.split(",")
2924 for tag in sorted(new):
2925 tag = tag.strip()
2926 tag = tag.replace(" ", "-")
2927 if tag in valid_tags:
2928 tags.append(tag)
2929 else:
2930 if tag in zh_tag_lookup:
2931 tags.extend(zh_tag_lookup[tag])
2932 else:
2933 print(
2934 f"MISSING ZH SYNONYM TAG for "
2935 f"root {root_word}, word "
2936 f"{words}: {tag}"
2937 )
2938 sys.stdout.flush()
2940 for word in words:
2941 data.append(
2942 {"word": word.strip(), "tags": tags}
2943 )
2944 elif item.kind == NodeKind.HTML: 2944 ↛ 2945line 2944 didn't jump to line 2945 because the condition on line 2944 was never true
2945 cleaned = clean_node(wxr, None, item.children)
2946 if "Synonyms of" in cleaned:
2947 cleaned = cleaned.replace("Synonyms of ", "")
2948 root_word = cleaned
2949 parse_zh_synonyms(item.children, data, hdrs, root_word)
2950 else:
2951 parse_zh_synonyms(item.children, data, hdrs, root_word)
2953 def parse_zh_synonyms_list(
2954 parsed: list[Union[WikiNode, str]],
2955 data: list[LinkageData],
2956 hdrs: list[str],
2957 root_word: str,
2958 ) -> None:
2959 """Parses Chinese dialectal synonyms tables (list format)"""
2960 for item in parsed:
2961 if isinstance(item, WikiNode):
2962 if item.kind == NodeKind.LIST_ITEM:
2963 cleaned = clean_node(wxr, None, item.children)
2964 # print("cleaned:", repr(cleaned))
2965 if any(
2966 [
2967 "Variety" in cleaned,
2968 "Location" in cleaned,
2969 "Words" in cleaned,
2970 ]
2971 ):
2972 pass
2973 else:
2974 cleaned = cleaned.replace("(", ",")
2975 cleaned = cleaned.replace(")", "")
2976 split = cleaned.split(",")
2977 # skip empty words / titles
2978 if split[0] == "":
2979 continue
2980 words = split[0].split("/")
2981 new_hdrs = [x.strip() for x in split[1:]]
2982 tags = []
2983 roman = None
2984 for tag in sorted(new_hdrs):
2985 if tag in valid_tags:
2986 tags.append(tag)
2987 elif tag in zh_tag_lookup:
2988 tags.extend(zh_tag_lookup[tag])
2989 elif (
2990 classify_desc(tag) == "romanization"
2991 and roman is None
2992 ):
2993 roman = tag
2994 else:
2995 print(
2996 f"MISSING ZH SYNONYM TAG "
2997 f"(possibly pinyin) - root "
2998 f"{root_word}, word {words}: {tag}"
2999 )
3000 sys.stdout.flush()
3002 for word in words:
3003 dt: LinkageData = {"word": word.strip()}
3004 if tags:
3005 dt["tags"] = tags
3006 if roman is not None:
3007 dt["roman"] = roman
3008 data.append(dt)
3009 elif item.kind == NodeKind.HTML:
3010 cleaned = clean_node(wxr, None, item.children)
3011 if cleaned.find("Synonyms of") >= 0:
3012 cleaned = cleaned.replace("Synonyms of ", "")
3013 root_word = cleaned
3014 parse_zh_synonyms_list(
3015 item.children, data, hdrs, root_word
3016 )
3017 else:
3018 parse_zh_synonyms_list(
3019 item.children, data, hdrs, root_word
3020 )
3022 def contains_kind(
3023 children: list[Union[WikiNode, str]], nodekind: NodeKind
3024 ) -> bool:
3025 assert isinstance(children, list)
3026 for item in children:
3027 if not isinstance(item, WikiNode):
3028 continue
3029 if item.kind == nodekind: 3029 ↛ 3030line 3029 didn't jump to line 3030 because the condition on line 3029 was never true
3030 return True
3031 elif contains_kind(item.children, nodekind): 3031 ↛ 3032line 3031 didn't jump to line 3032 because the condition on line 3031 was never true
3032 return True
3033 return False
3035 # Main body of parse_linkage()
3036 text = wxr.wtp.node_to_wikitext(linkagenode.children)
3037 parsed = wxr.wtp.parse(
3038 text, expand_all=True, template_fn=linkage_template_fn1
3039 )
3040 if field == "synonyms" and lang_code == "zh":
3041 synonyms: list[LinkageData] = []
3042 if contains_kind(parsed.children, NodeKind.LIST): 3042 ↛ 3043line 3042 didn't jump to line 3043 because the condition on line 3042 was never true
3043 parse_zh_synonyms_list(parsed.children, synonyms, [], "")
3044 else:
3045 parse_zh_synonyms(parsed.children, synonyms, [], "")
3046 # print(json.dumps(synonyms, indent=4, ensure_ascii=False))
3047 data_extend(data, "synonyms", synonyms)
3048 parse_linkage_recurse(parsed.children, field, None)
3049 if not data.get(field) and not have_panel_template:
3050 text = "".join(toplevel_text).strip()
3051 if "\n" not in text and "," in text and text.count(",") > 3:
3052 if not text.startswith("See "): 3052 ↛ exitline 3052 didn't return from function 'parse_linkage' because the condition on line 3052 was always true
3053 parse_linkage_item([text], field, None)
3055 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
3056 """Parses translations for a word. This may also pull in translations
3057 from separate translation subpages."""
3058 assert isinstance(data, dict)
3059 assert isinstance(xlatnode, WikiNode)
3060 # print("===== PARSE_TRANSLATIONS {} {} {}"
3061 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
3062 # print("parse_translations xlatnode={}".format(xlatnode))
3063 if not wxr.config.capture_translations: 3063 ↛ 3064line 3063 didn't jump to line 3064 because the condition on line 3063 was never true
3064 return
3065 sense_parts: list[Union[WikiNode, str]] = []
3066 sense: Optional[str] = None
3068 def parse_translation_item(
3069 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
3070 ) -> None:
3071 nonlocal sense
3072 assert isinstance(contents, list)
3073 assert lang is None or isinstance(lang, str)
3074 # print("PARSE_TRANSLATION_ITEM:", contents)
3076 langcode: Optional[str] = None
3077 if sense is None:
3078 sense = clean_node(wxr, data, sense_parts).strip()
3079 # print("sense <- clean_node: ", sense)
3080 idx = sense.find("See also translations at")
3081 if idx > 0: 3081 ↛ 3082line 3081 didn't jump to line 3082 because the condition on line 3081 was never true
3082 wxr.wtp.debug(
3083 "Skipping translation see also: {}".format(sense),
3084 sortid="page/2361",
3085 )
3086 sense = sense[:idx].strip()
3087 if sense.endswith(":"): 3087 ↛ 3088line 3087 didn't jump to line 3088 because the condition on line 3087 was never true
3088 sense = sense[:-1].strip()
3089 if sense.endswith("—"): 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true
3090 sense = sense[:-1].strip()
3091 translations_from_template: list[str] = []
3093 def translation_item_template_fn(
3094 name: str, ht: TemplateArgs
3095 ) -> Optional[str]:
3096 nonlocal langcode
3097 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
3098 if is_panel_template(wxr, name):
3099 return ""
3100 if name in ("t+check", "t-check", "t-needed"):
3101 # We ignore these templates. They seem to have outright
3102 # garbage in some entries, and widely varying formatting in
3103 # others. These should be transitory and unreliable
3104 # anyway.
3105 return "__IGNORE__"
3106 if name in ("t", "t+", "t-simple", "tt", "tt+"):
3107 code = ht.get(1)
3108 if code: 3108 ↛ 3118line 3108 didn't jump to line 3118 because the condition on line 3108 was always true
3109 if langcode and code != langcode:
3110 wxr.wtp.debug(
3111 "inconsistent language codes {} vs "
3112 "{} in translation item: {!r} {}".format(
3113 langcode, code, name, ht
3114 ),
3115 sortid="page/2386",
3116 )
3117 langcode = code
3118 tr = ht.get(2)
3119 if tr:
3120 tr = clean_node(wxr, None, [tr])
3121 translations_from_template.append(tr)
3122 return None
3123 if name == "t-egy":
3124 langcode = "egy"
3125 return None
3126 if name == "ttbc":
3127 code = ht.get(1)
3128 if code: 3128 ↛ 3130line 3128 didn't jump to line 3130 because the condition on line 3128 was always true
3129 langcode = code
3130 return None
3131 if name == "trans-see": 3131 ↛ 3132line 3131 didn't jump to line 3132 because the condition on line 3131 was never true
3132 wxr.wtp.error(
3133 "UNIMPLEMENTED trans-see template", sortid="page/2405"
3134 )
3135 return ""
3136 if name.endswith("-top"): 3136 ↛ 3137line 3136 didn't jump to line 3137 because the condition on line 3136 was never true
3137 return ""
3138 if name.endswith("-bottom"): 3138 ↛ 3139line 3138 didn't jump to line 3139 because the condition on line 3138 was never true
3139 return ""
3140 if name.endswith("-mid"): 3140 ↛ 3141line 3140 didn't jump to line 3141 because the condition on line 3140 was never true
3141 return ""
3142 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
3143 # .format(name),
3144 # sortid="page/2414")
3145 return None
3147 sublists = list(
3148 x
3149 for x in contents
3150 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
3151 )
3152 contents = list(
3153 x
3154 for x in contents
3155 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
3156 )
3158 item = clean_node(
3159 wxr, data, contents, template_fn=translation_item_template_fn
3160 )
3161 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
3163 # Parse the translation item.
3164 if item: 3164 ↛ exitline 3164 didn't return from function 'parse_translation_item' because the condition on line 3164 was always true
3165 lang = parse_translation_item_text(
3166 wxr,
3167 word,
3168 data,
3169 item,
3170 sense,
3171 lang,
3172 langcode,
3173 translations_from_template,
3174 is_reconstruction,
3175 )
3177 # Handle sublists. They are frequently used for different
3178 # scripts for the language and different variants of the
3179 # language. We will include the lower-level header as a
3180 # tag in those cases.
3181 for listnode in sublists:
3182 assert listnode.kind == NodeKind.LIST
3183 for node in listnode.children:
3184 if not isinstance(node, WikiNode): 3184 ↛ 3185line 3184 didn't jump to line 3185 because the condition on line 3184 was never true
3185 continue
3186 if node.kind == NodeKind.LIST_ITEM: 3186 ↛ 3183line 3186 didn't jump to line 3183 because the condition on line 3186 was always true
3187 parse_translation_item(node.children, lang=lang)
3189 def parse_translation_template(node: WikiNode) -> None:
3190 assert isinstance(node, WikiNode)
3192 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3193 nonlocal sense_parts
3194 nonlocal sense
3195 if is_panel_template(wxr, name):
3196 return ""
3197 if name == "see also":
3198 # XXX capture
3199 # XXX for example, "/" has top-level list containing
3200 # see also items. So we should also parse those.
3201 return ""
3202 if name == "trans-see":
3203 # XXX capture
3204 return ""
3205 if name == "see translation subpage": 3205 ↛ 3206line 3205 didn't jump to line 3206 because the condition on line 3205 was never true
3206 sense_parts = []
3207 sense = None
3208 sub = ht.get(1, "")
3209 if sub:
3210 m = re.match(
3211 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
3212 )
3213 else:
3214 m = None
3215 etym = ""
3216 etym_numbered = ""
3217 pos = ""
3218 if m:
3219 etym_numbered = m.group(1)
3220 etym = m.group(2)
3221 pos = m.group(3)
3222 if not sub:
3223 wxr.wtp.debug(
3224 "no part-of-speech in "
3225 "{{see translation subpage|...}}, "
3226 "defaulting to just wxr.wtp.section "
3227 "(= language)",
3228 sortid="page/2468",
3229 )
3230 # seq sent to get_subpage_section without sub and pos
3231 seq = [
3232 language,
3233 TRANSLATIONS_TITLE,
3234 ]
3235 elif (
3236 m
3237 and etym.lower().strip() in ETYMOLOGY_TITLES
3238 and pos.lower() in POS_TITLES
3239 ):
3240 seq = [
3241 language,
3242 etym_numbered,
3243 pos,
3244 TRANSLATIONS_TITLE,
3245 ]
3246 elif sub.lower() in POS_TITLES:
3247 # seq with sub but not pos
3248 seq = [
3249 language,
3250 sub,
3251 TRANSLATIONS_TITLE,
3252 ]
3253 else:
3254 # seq with sub and pos
3255 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3256 if pos.lower() not in POS_TITLES:
3257 wxr.wtp.debug(
3258 "unhandled see translation subpage: "
3259 "language={} sub={} "
3260 "wxr.wtp.subsection={}".format(
3261 language, sub, wxr.wtp.subsection
3262 ),
3263 sortid="page/2478",
3264 )
3265 seq = [language, sub, pos, TRANSLATIONS_TITLE]
3266 subnode = get_subpage_section(
3267 wxr.wtp.title or "MISSING_TITLE",
3268 TRANSLATIONS_TITLE,
3269 [seq],
3270 )
3271 if subnode is None or not isinstance(subnode, WikiNode):
3272 # Failed to find the normal subpage section
3273 # seq with sub and pos
3274 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3275 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
3276 seqs: list[list[str] | tuple[str, ...]] = [
3277 [TRANSLATIONS_TITLE],
3278 [language, pos],
3279 ]
3280 subnode = get_subpage_section(
3281 wxr.wtp.title or "MISSING_TITLE",
3282 TRANSLATIONS_TITLE,
3283 seqs,
3284 )
3285 if subnode is not None and isinstance(
3286 subnode, WikiNode
3287 ):
3288 parse_translations(data, subnode)
3289 return ""
3290 if name in (
3291 "c",
3292 "C",
3293 "categorize",
3294 "cat",
3295 "catlangname",
3296 "topics",
3297 "top",
3298 "qualifier",
3299 "cln",
3300 ):
3301 # These are expanded in the default way
3302 return None
3303 if name in (
3304 "trans-top",
3305 "trans-top-see",
3306 ):
3307 # XXX capture id from trans-top? Capture sense here
3308 # instead of trying to parse it from expanded content?
3309 if ht.get(1):
3310 sense_parts = []
3311 sense = ht.get(1)
3312 else:
3313 sense_parts = []
3314 sense = None
3315 return None
3316 if name in (
3317 "trans-bottom",
3318 "trans-mid",
3319 "checktrans-mid",
3320 "checktrans-bottom",
3321 ):
3322 return None
3323 if name == "checktrans-top":
3324 sense_parts = []
3325 sense = None
3326 return ""
3327 if name == "trans-top-also":
3328 # XXX capture?
3329 sense_parts = []
3330 sense = None
3331 return ""
3332 wxr.wtp.error(
3333 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3334 name, ht
3335 ),
3336 sortid="page/2517",
3337 )
3338 return ""
3340 wxr.wtp.expand(
3341 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3342 )
3344 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3345 nonlocal sense
3346 nonlocal sense_parts
3347 for node in xlatnode.children:
3348 # print(node)
3349 if isinstance(node, str):
3350 if sense:
3351 if not node.isspace():
3352 wxr.wtp.debug(
3353 "skipping string in the middle of "
3354 "translations: {}".format(node),
3355 sortid="page/2530",
3356 )
3357 continue
3358 # Add a part to the sense
3359 sense_parts.append(node)
3360 sense = None
3361 continue
3362 assert isinstance(node, WikiNode)
3363 kind = node.kind
3364 if kind == NodeKind.LIST:
3365 for item in node.children:
3366 if not isinstance(item, WikiNode): 3366 ↛ 3367line 3366 didn't jump to line 3367 because the condition on line 3366 was never true
3367 continue
3368 if item.kind != NodeKind.LIST_ITEM: 3368 ↛ 3369line 3368 didn't jump to line 3369 because the condition on line 3368 was never true
3369 continue
3370 if item.sarg == ":": 3370 ↛ 3371line 3370 didn't jump to line 3371 because the condition on line 3370 was never true
3371 continue
3372 parse_translation_item(item.children)
3373 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3373 ↛ 3377line 3373 didn't jump to line 3377 because the condition on line 3373 was never true
3374 # Silently skip list items that are just indented; these
3375 # are used for text between translations, such as indicating
3376 # translations that need to be checked.
3377 pass
3378 elif kind == NodeKind.TEMPLATE:
3379 parse_translation_template(node)
3380 elif kind in ( 3380 ↛ 3385line 3380 didn't jump to line 3385 because the condition on line 3380 was never true
3381 NodeKind.TABLE,
3382 NodeKind.TABLE_ROW,
3383 NodeKind.TABLE_CELL,
3384 ):
3385 parse_translation_recurse(node)
3386 elif kind == NodeKind.HTML:
3387 if node.attrs.get("class") == "NavFrame": 3387 ↛ 3393line 3387 didn't jump to line 3393 because the condition on line 3387 was never true
3388 # Reset ``sense_parts`` (and force recomputing
3389 # by clearing ``sense``) as each NavFrame specifies
3390 # its own sense. This helps eliminate garbage coming
3391 # from text at the beginning of the translations
3392 # section.
3393 sense_parts = []
3394 sense = None
3395 # for item in node.children:
3396 # if not isinstance(item, WikiNode):
3397 # continue
3398 # parse_translation_recurse(item)
3399 parse_translation_recurse(node)
3400 elif kind in LEVEL_KINDS: 3400 ↛ 3402line 3400 didn't jump to line 3402 because the condition on line 3400 was never true
3401 # Sub-levels will be recursed elsewhere
3402 pass
3403 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3404 parse_translation_recurse(node)
3405 elif kind == NodeKind.PREFORMATTED: 3405 ↛ 3406line 3405 didn't jump to line 3406 because the condition on line 3405 was never true
3406 print("parse_translation_recurse: PREFORMATTED:", node)
3407 elif kind == NodeKind.LINK: 3407 ↛ 3461line 3407 didn't jump to line 3461 because the condition on line 3407 was always true
3408 arg0 = node.largs[0]
3409 # Kludge: I've seen occasional normal links to translation
3410 # subpages from main pages (e.g., language/English/Noun
3411 # in July 2021) instead of the normal
3412 # {{see translation subpage|...}} template. This should
3413 # handle them. Note: must be careful not to read other
3414 # links, particularly things like in "human being":
3415 # "a human being -- see [[man/translations]]" (group title)
3416 if ( 3416 ↛ 3424line 3416 didn't jump to line 3424 because the condition on line 3416 was never true
3417 isinstance(arg0, (list, tuple))
3418 and arg0
3419 and isinstance(arg0[0], str)
3420 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3421 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3422 == wxr.wtp.title
3423 ):
3424 wxr.wtp.debug(
3425 "translations subpage link found on main "
3426 "page instead "
3427 "of normal {{see translation subpage|...}}",
3428 sortid="page/2595",
3429 )
3430 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3431 if sub.lower() in POS_TITLES:
3432 seq = [
3433 language,
3434 sub,
3435 TRANSLATIONS_TITLE,
3436 ]
3437 subnode = get_subpage_section(
3438 wxr.wtp.title,
3439 TRANSLATIONS_TITLE,
3440 [seq],
3441 )
3442 if subnode is not None and isinstance(
3443 subnode, WikiNode
3444 ):
3445 parse_translations(data, subnode)
3446 else:
3447 wxr.wtp.error(
3448 "/translations link outside part-of-speech"
3449 )
3451 if (
3452 len(arg0) >= 1
3453 and isinstance(arg0[0], str)
3454 and not arg0[0].lower().startswith("category:")
3455 ):
3456 for x in node.largs[-1]:
3457 if isinstance(x, str): 3457 ↛ 3460line 3457 didn't jump to line 3460 because the condition on line 3457 was always true
3458 sense_parts.append(x)
3459 else:
3460 parse_translation_recurse(x)
3461 elif not sense:
3462 sense_parts.append(node)
3463 else:
3464 wxr.wtp.debug(
3465 "skipping text between translation items/senses: "
3466 "{}".format(node),
3467 sortid="page/2621",
3468 )
3470 # Main code of parse_translation(). We want ``sense`` to be assigned
3471 # regardless of recursion levels, and thus the code is structured
3472 # to define it at this level and recurse in parse_translation_recurse().
3473 parse_translation_recurse(xlatnode)
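# --- Editorial sketch (not part of page.py) ------------------------------
# Standalone approximation of how the argument of
# {{see translation subpage|...}} is split into an etymology part and a
# part-of-speech part and turned into the section path ("seq") handed to
# get_subpage_section() above.  The helper name and the small title sets
# are illustrative stand-ins for ETYMOLOGY_TITLES and POS_TITLES.
_DEMO_ETYM_TITLES = {"etymology"}
_DEMO_POS_TITLES = {"noun", "verb", "adjective"}

def demo_subpage_seq(language: str, sub: str, subsection: str) -> list[str]:
    if not sub:
        # No argument: default to just the language-level Translations.
        return [language, "Translations"]
    m = re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub)
    if (
        m
        and m.group(2).lower().strip() in _DEMO_ETYM_TITLES
        and m.group(3).lower() in _DEMO_POS_TITLES
    ):
        # e.g. sub == "Etymology 2: Noun"
        return [language, m.group(1), m.group(3), "Translations"]
    if sub.lower() in _DEMO_POS_TITLES:
        return [language, sub, "Translations"]
    # Otherwise the current subsection supplies the part of speech.
    return [language, sub, subsection, "Translations"]

# demo_subpage_seq("English", "Etymology 2: Noun", "Noun")
#     -> ["English", "Etymology 2", "Noun", "Translations"]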
3475 def parse_etymology(data: WordData, node: WikiNode) -> None:
3476 """Parses an etymology section."""
3477 assert isinstance(data, dict)
3478 assert isinstance(node, WikiNode)
3480 templates: list[TemplateData] = []
3482 # Counter for preventing the capture of etymology templates
3483 # when we are inside templates that we want to ignore (i.e.,
3484 # not capture).
3485 ignore_count = 0
3487 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3488 nonlocal ignore_count
3489 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3490 return ""
3491 if re.match(ignored_etymology_templates_re, name):
3492 ignore_count += 1
3493 return None
3495 # CONTINUE_HERE
3497 def etym_post_template_fn(
3498 name: str, ht: TemplateArgs, expansion: str
3499 ) -> None:
3500 nonlocal ignore_count
3501 if name in wikipedia_templates:
3502 parse_wikipedia_template(wxr, data, ht)
3503 return None
3504 if re.match(ignored_etymology_templates_re, name):
3505 ignore_count -= 1
3506 return None
3507 if ignore_count == 0: 3507 ↛ 3513line 3507 didn't jump to line 3513 because the condition on line 3507 was always true
3508 ht = clean_template_args(wxr, ht)
3509 expansion = clean_node(wxr, None, expansion)
3510 templates.append(
3511 {"name": name, "args": ht, "expansion": expansion}
3512 )
3513 return None
3515 # Remove any subsections
3516 contents = list(
3517 x
3518 for x in node.children
3519 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3520 )
3521 # Convert to text, also capturing templates using post_template_fn
3522 text = clean_node(
3523 wxr,
3524 None,
3525 contents,
3526 template_fn=etym_template_fn,
3527 post_template_fn=etym_post_template_fn,
3528 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3529 # Save the collected information.
3530 if len(text) > 0:
3531 data["etymology_text"] = text
3532 if len(templates) > 0:
3533 # Some etymology templates, like Template:root, do not generate
3534 # text, so they should be added here. Elsewhere, we check
3535 # for Template:root and add some text to the expansion to please
3536 # the validation.
3537 data["etymology_templates"] = templates
3539 for child_node in node.find_child_recursively( 3539 ↛ exitline 3539 didn't return from function 'parse_etymology' because the loop on line 3539 didn't complete
3540 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3541 ):
3542 if child_node.kind in LEVEL_KIND_FLAGS:
3543 break
3544 elif isinstance( 3544 ↛ 3547line 3544 didn't jump to line 3547 because the condition on line 3544 was never true
3545 child_node, TemplateNode
3546 ) and child_node.template_name in ["zh-x", "zh-q"]:
3547 if "etymology_examples" not in data:
3548 data["etymology_examples"] = []
3549 data["etymology_examples"].extend(
3550 extract_template_zh_x(
3551 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3552 )
3553 )
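# --- Editorial sketch (not part of page.py) ------------------------------
# Rough shape of the fields filled in by parse_etymology() above.  The
# template name, arguments and text are invented for illustration; only the
# key structure ("name"/"args"/"expansion" per template) comes from the code.
_demo_etymology_result = {
    "etymology_text": "From Middle English example.",
    "etymology_templates": [
        {
            "name": "inh",                          # template that was expanded
            "args": {"1": "en", "2": "enm", "3": "example"},
            "expansion": "Middle English example",  # cleaned expansion text
        },
    ],
}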
3555 def parse_descendants(
3556 data: WordData, node: WikiNode, is_proto_root_derived_section=False
3557 ) -> None:
3558 """Parses a Descendants section. Also used on Derived terms and
3559 Extensions sections when we are dealing with a root of a reconstructed
3560 language (i.e. is_proto_root_derived_section == True), as they use the
3561 same structure. In the latter case, the Wiktionary convention is not to
3562 title the section as descendants since the immediate offspring of the
3563 roots are morphologically derived terms within the same proto-language.
3564 Still, since the rest of the section lists true descendants, we use the
3565 same function. Entries in the descendants list that are technically
3566 derived terms will have a field "tags": ["derived"]."""
3567 assert isinstance(data, dict)
3568 assert isinstance(node, WikiNode)
3569 assert isinstance(is_proto_root_derived_section, bool)
3571 descendants = []
3573 # Most templates that are not in a LIST should be ignored as they only
3574 # add formatting, like "desc-top", "der-top3", etc. Any template in
3575 # unignored_non_list_templates actually contains relevant descendant
3576 # info. E.g. "CJKV" is often the only line at all in descendants
3577 # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would
3578 # be skipped if we didn't handle it specially as it is not part of a
3579 # LIST, and additionally is in panel_templates. There are probably more
3580 # such templates that should be added to this...
3581 unignored_non_list_templates: list[str] = ["CJKV"]
3583 def process_list_item_children(
3584 sarg: str, children: list[Union[str, WikiNode]]
3585 ) -> None:
3586 assert isinstance(sarg, str)
3587 assert isinstance(children, list)
3588 # The descendants section is a hierarchical bulleted list. sarg is
3589 # usually some number of "*" characters indicating the level of
3590 # indentation of the line, e.g. "***" indicates the line will be
3591 # thrice-indented. A bare ";" is used to indicate a subtitle-like
3592 # line with no indentation. ":" at the end of one or more "*"s is
3593 # used to indicate that the bullet will not be displayed.
3594 item_data: DescendantData = {"depth": sarg.count("*")}
3595 templates: list[TemplateData] = []
3596 is_derived = False
3598 # Counter for preventing the capture of templates when we are inside
3599 # templates that we want to ignore (i.e., not capture).
3600 ignore_count = 0
3602 def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3603 nonlocal ignore_count
3604 if ( 3604 ↛ 3608line 3604 didn't jump to line 3608 because the condition on line 3604 was never true
3605 is_panel_template(wxr, name)
3606 and name not in unignored_non_list_templates
3607 ):
3608 return ""
3609 if re.match(ignored_descendants_templates_re, name):
3610 ignore_count += 1
3611 return None
3613 def desc_post_template_fn(
3614 name: str, ht: TemplateArgs, expansion: str
3615 ) -> None:
3616 nonlocal ignore_count
3617 if name in wikipedia_templates: 3617 ↛ 3618line 3617 didn't jump to line 3618 because the condition on line 3617 was never true
3618 parse_wikipedia_template(wxr, data, ht)
3619 return None
3620 if re.match(ignored_descendants_templates_re, name):
3621 ignore_count -= 1
3622 return None
3623 if ignore_count == 0: 3623 ↛ 3639line 3623 didn't jump to line 3639 because the condition on line 3623 was always true
3624 ht = clean_template_args(wxr, ht)
3625 nonlocal is_derived
3626 # If we're in a proto-root Derived terms or Extensions
3627 # section, and the current list item has a link template
3628 # to a term in the same proto-language, then we tag this
3629 # descendant entry with "derived"
3630 is_derived = (
3631 is_proto_root_derived_section
3632 and (name == "l" or name == "link")
3633 and ("1" in ht and ht["1"] == lang_code)
3634 )
3635 expansion = clean_node(wxr, None, expansion)
3636 templates.append(
3637 {"name": name, "args": ht, "expansion": expansion}
3638 )
3639 return None
3641 text = clean_node(
3642 wxr,
3643 None,
3644 children,
3645 template_fn=desc_template_fn,
3646 post_template_fn=desc_post_template_fn,
3647 )
3648 item_data["templates"] = templates
3649 item_data["text"] = text
3650 if is_derived: 3650 ↛ 3651line 3650 didn't jump to line 3651 because the condition on line 3650 was never true
3651 item_data["tags"] = ["derived"]
3652 descendants.append(item_data)
3654 def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]:
3655 for i, child in enumerate(node.children):
3656 if isinstance(child, WikiNode):
3657 yield (i, child)
3659 def get_sublist_index(list_item: WikiNode) -> Optional[int]:
3660 for i, child in node_children(list_item):
3661 if child.kind == NodeKind.LIST:
3662 return i
3663 return None
3665 def get_descendants(node: WikiNode) -> None:
3666 """Appends the data for every list item in every list in node
3667 to descendants."""
3668 for _, c in node_children(node):
3669 if (
3670 c.kind == NodeKind.TEMPLATE
3671 and c.largs
3672 and len(c.largs[0]) == 1
3673 and isinstance(c.largs[0][0], str)
3674 and c.largs[0][0] in unignored_non_list_templates
3675 ):
3676 # Some Descendants sections have no wikitext list. Rather,
3677 # the list is entirely generated by a single template (see
3678 # e.g. the use of {{CJKV}} in Chinese entries).
3679 process_list_item_children("", [c])
3680 elif c.kind == NodeKind.HTML: 3680 ↛ 3686line 3680 didn't jump to line 3686 because the condition on line 3680 was never true
3681 # The Descendants sections for many languages feature
3682 # templates that generate html to add styling (e.g. using
3683 # multiple columns) to the list, so that the actual wikitext
3684 # list items are found within a <div>. We look within the
3685 # children of the html node for the actual list items.
3686 get_descendants(c)
3687 elif c.kind == NodeKind.LIST:
3688 get_descendants(c)
3689 elif c.kind == NodeKind.LIST_ITEM:
3690 # If a LIST_ITEM has subitems in a sublist, usually its
3691 # last child is a LIST. However, sometimes after the LIST
3692 # there is one or more trailing LIST_ITEMs, like "\n" or
3693 # a reference template. If there is a sublist, we discard
3694 # everything after it.
3695 i = get_sublist_index(c)
3696 if i is not None:
3697 process_list_item_children(c.sarg, c.children[:i])
3698 get_descendants(c.children[i]) # type: ignore[arg-type]
3699 else:
3700 process_list_item_children(c.sarg, c.children)
3702 # parse_descendants() actual work starts here
3703 get_descendants(node)
3705 # On e.g. a PIE page there may be both Derived terms and Extensions
3706 # sections, in which case this function will be called multiple times,
3707 # so we have to check whether "descendants" already exists first.
3708 if "descendants" in data: 3708 ↛ 3709line 3708 didn't jump to line 3709 because the condition on line 3708 was never true
3709 data["descendants"].extend(descendants)
3710 else:
3711 data["descendants"] = descendants
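# --- Editorial sketch (not part of page.py) ------------------------------
# Rough shape of one DescendantData item collected above.  "depth" is the
# count of "*" in the list item's sarg, and "tags": ["derived"] is only
# added in proto-root sections.  The word and template values are invented.
_demo_descendant_item = {
    "depth": 2,                                   # a "**"-indented line
    "text": "Danish: ord",
    "templates": [
        {
            "name": "desc",
            "args": {"1": "da", "2": "ord"},
            "expansion": "Danish: ord",
        },
    ],
}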
3713 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3714 """This recurses into a subtree in the parse tree for a page."""
3715 nonlocal etym_data
3716 nonlocal pos_data
3717 nonlocal inside_level_four
3719 redirect_list: list[str] = [] # for `zh-see` template
3721 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3722 """This is called for otherwise unprocessed parts of the page.
3723 We still expand them so that e.g. Category links get captured."""
3724 if name in wikipedia_templates:
3725 data = select_data()
3726 parse_wikipedia_template(wxr, data, ht)
3727 return None
3728 if is_panel_template(wxr, name):
3729 return ""
3730 return None
3732 for node in treenode.children:
3733 if not isinstance(node, WikiNode):
3734 # print(" X{}".format(repr(node)[:40]))
3735 continue
3736 if isinstance(node, TemplateNode):
3737 if process_soft_redirect_template(wxr, node, redirect_list):
3738 continue
3739 elif node.template_name == "zh-forms":
3740 process_zh_forms_templates(wxr, node, base_data)
3742 if node.kind not in LEVEL_KINDS:
3743 # XXX handle e.g. wikipedia links at the top of a language
3744 # XXX should at least capture "also" at top of page
3745 if node.kind in (
3746 NodeKind.HLINE,
3747 NodeKind.LIST,
3748 NodeKind.LIST_ITEM,
3749 ):
3750 continue
3751 # print(" UNEXPECTED: {}".format(node))
3752 # Clean the node to collect category links
3753 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3754 continue
3755 t = clean_node(
3756 wxr, etym_data, node.sarg if node.sarg else node.largs
3757 )
3758 t = t.lower()
3759 # XXX these counts were never implemented fully, and even this
3760 # gets discarded: Search STATISTICS_IMPLEMENTATION
3761 wxr.config.section_counts[t] += 1
3762 # print("PROCESS_CHILDREN: T:", repr(t))
3763 if t in IGNORED_TITLES:
3764 pass
3765 elif t.startswith(PRONUNCIATION_TITLE):
3766 # Chinese Pronunciation section kludge; we demote these to
3767 # be level 4 instead of 3 so that they're part of a larger
3768 # etymology hierarchy; usually the data here is empty and
3769 # acts as an intermediate level between POS and Etymology data
3770 inside_level_four = True
3771 if t.startswith(PRONUNCIATION_TITLE + " "):
3772 # Pronunciation 1, etc, are used in Chinese Glyphs,
3773 # and each of them may have senses under Definition
3774 push_level_four_section()
3775 wxr.wtp.start_subsection(None)
3776 if wxr.config.capture_pronunciation: 3776 ↛ 3858line 3776 didn't jump to line 3858 because the condition on line 3776 was always true
3777 data = select_data()
3778 parse_pronunciation(
3779 wxr,
3780 node,
3781 data,
3782 etym_data,
3783 have_etym,
3784 base_data,
3785 lang_code,
3786 )
3787 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3788 push_etym()
3789 wxr.wtp.start_subsection(None)
3790 if wxr.config.capture_etymologies: 3790 ↛ 3858line 3790 didn't jump to line 3858 because the condition on line 3790 was always true
3791 m = re.search(r"\s(\d+)$", t)
3792 if m:
3793 etym_data["etymology_number"] = int(m.group(1))
3794 parse_etymology(etym_data, node)
3795 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3796 data = select_data()
3797 parse_descendants(data, node)
3798 elif ( 3798 ↛ 3804line 3798 didn't jump to line 3804 because the condition on line 3798 was never true
3799 t in PROTO_ROOT_DERIVED_TITLES
3800 and pos == "root"
3801 and is_reconstruction
3802 and wxr.config.capture_descendants
3803 ):
3804 data = select_data()
3805 parse_descendants(data, node, True)
3806 elif t == TRANSLATIONS_TITLE:
3807 data = select_data()
3808 parse_translations(data, node)
3809 elif t in INFLECTION_TITLES:
3810 parse_inflection(node, t, pos)
3811 elif t == "alternative forms":
3812 extract_alt_form_section(wxr, select_data(), node)
3813 else:
3814 lst = t.split()
3815 while len(lst) > 1 and lst[-1].isdigit(): 3815 ↛ 3816line 3815 didn't jump to line 3816 because the condition on line 3815 was never true
3816 lst = lst[:-1]
3817 t_no_number = " ".join(lst).lower()
3818 if t_no_number in POS_TITLES:
3819 push_pos()
3820 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3821 pos = dt["pos"] or "MISSING_POS"
3822 wxr.wtp.start_subsection(t)
3823 if "debug" in dt:
3824 wxr.wtp.debug(
3825 "{} in section {}".format(dt["debug"], t),
3826 sortid="page/2755",
3827 )
3828 if "warning" in dt: 3828 ↛ 3829line 3828 didn't jump to line 3829 because the condition on line 3828 was never true
3829 wxr.wtp.warning(
3830 "{} in section {}".format(dt["warning"], t),
3831 sortid="page/2759",
3832 )
3833 if "error" in dt: 3833 ↛ 3834line 3833 didn't jump to line 3834 because the condition on line 3833 was never true
3834 wxr.wtp.error(
3835 "{} in section {}".format(dt["error"], t),
3836 sortid="page/2763",
3837 )
3838 # Parse word senses for the part-of-speech
3839 parse_part_of_speech(node, pos)
3840 if "tags" in dt:
3841 for pdata in pos_datas:
3842 data_extend(pdata, "tags", dt["tags"])
3843 elif t_no_number in LINKAGE_TITLES:
3844 # print(f"LINKAGE_TITLES NODE {node=}")
3845 rel = LINKAGE_TITLES[t_no_number]
3846 data = select_data()
3847 parse_linkage(data, rel, node)
3848 elif t_no_number == COMPOUNDS_TITLE:
3849 data = select_data()
3850 if wxr.config.capture_compounds: 3850 ↛ 3858line 3850 didn't jump to line 3858 because the condition on line 3850 was always true
3851 parse_linkage(data, "derived", node)
3853 # XXX parse interesting templates also from other sections. E.g.,
3854 # {{Letter|...}} in ===See also===
3855 # Also <gallery>
3857 # Recurse to children of this node, processing subtitles therein
3858 stack.append(t)
3859 process_children(node, pos)
3860 stack.pop()
3862 if len(redirect_list) > 0:
3863 if len(pos_data) > 0:
3864 pos_data["redirects"] = redirect_list
3865 if "pos" not in pos_data: 3865 ↛ 3866line 3865 didn't jump to line 3866 because the condition on line 3865 was never true
3866 pos_data["pos"] = "soft-redirect"
3867 else:
3868 new_page_data = copy.deepcopy(base_data)
3869 new_page_data["redirects"] = redirect_list
3870 if "pos" not in new_page_data: 3870 ↛ 3872line 3870 didn't jump to line 3872 because the condition on line 3870 was always true
3871 new_page_data["pos"] = "soft-redirect"
3872 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3873 page_datas.append(new_page_data)
3875 def extract_examples(
3876 others: list[WikiNode], sense_base: SenseData
3877 ) -> list[ExampleData]:
3878 """Parses through a list of definitions and quotes to find examples.
3879 Returns a list of example dicts to be added to sense data. Adds
3880 meta-data, mostly categories, into sense_base."""
3881 assert isinstance(others, list)
3882 examples: list[ExampleData] = []
3884 for sub in others:
3885 if not sub.sarg.endswith((":", "*")): 3885 ↛ 3886line 3885 didn't jump to line 3886 because the condition on line 3885 was never true
3886 continue
3887 for item in sub.children:
3888 if not isinstance(item, WikiNode): 3888 ↛ 3889line 3888 didn't jump to line 3889 because the condition on line 3888 was never true
3889 continue
3890 if item.kind != NodeKind.LIST_ITEM: 3890 ↛ 3891line 3890 didn't jump to line 3891 because the condition on line 3890 was never true
3891 continue
3892 usex_type = None
3893 example_template_args = []
3894 example_template_names = []
3895 taxons = set()
3897 # Chinese, Japanese and quotation templates are handled via
3898 # extract_example_list_item(); when it finds data, the rest is skipped.
3899 new_example_lists = extract_example_list_item(
3900 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3901 )
3902 if len(new_example_lists) > 0:
3903 examples.extend(new_example_lists)
3904 continue
3906 def usex_template_fn(
3907 name: str, ht: TemplateArgs
3908 ) -> Optional[str]:
3909 nonlocal usex_type
3910 if is_panel_template(wxr, name):
3911 return ""
3912 if name in usex_templates:
3913 usex_type = "example"
3914 example_template_args.append(ht)
3915 example_template_names.append(name)
3916 elif name in quotation_templates:
3917 usex_type = "quotation"
3918 elif name in taxonomy_templates: 3918 ↛ 3919line 3918 didn't jump to line 3919 because the condition on line 3918 was never true
3919 taxons.update(ht.get(1, "").split())
3920 for prefix in template_linkages_to_ignore_in_examples:
3921 if re.search(
3922 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3923 ):
3924 return ""
3925 return None
3927 # bookmark
3928 ruby: list[tuple[str, str]] = []
3929 contents = item.children
3930 if lang_code == "ja":
3931 # Capture ruby contents if this is a Japanese language
3932 # example.
3933 # print(contents)
3934 if ( 3934 ↛ 3939line 3934 didn't jump to line 3939 because the condition on line 3934 was never true
3935 contents
3936 and isinstance(contents, str)
3937 and re.match(r"\s*$", contents[0])
3938 ):
3939 contents = contents[1:]
3940 exp = wxr.wtp.parse(
3941 wxr.wtp.node_to_wikitext(contents),
3942 # post_template_fn=head_post_template_fn,
3943 expand_all=True,
3944 )
3945 rub, rest = extract_ruby(wxr, exp.children)
3946 if rub:
3947 for rtup in rub:
3948 ruby.append(rtup)
3949 contents = rest
3950 subtext = clean_node(
3951 wxr, sense_base, contents, template_fn=usex_template_fn
3952 )
3954 frozen_taxons = frozenset(taxons)
3955 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3957 # print(f"{subtext=}")
3958 subtext = re.sub(
3959 r"\s*\(please add an English "
3960 r"translation of this "
3961 r"(example|usage example|quote)\)",
3962 "",
3963 subtext,
3964 ).strip()
3965 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3966 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3967 # print("subtext:", repr(subtext))
3969 lines = subtext.splitlines()
3970 # print(lines)
3972 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3973 lines = list(
3974 x
3975 for x in lines
3976 if not re.match(
3977 r"(Synonyms: |Antonyms: |Hyponyms: |"
3978 r"Synonym: |Antonym: |Hyponym: |"
3979 r"Hypernyms: |Derived terms: |"
3980 r"Related terms: |"
3981 r"Hypernym: |Derived term: |"
3982 r"Coordinate terms:|"
3983 r"Related term: |"
3984 r"For more quotations using )",
3985 x,
3986 )
3987 )
3988 tr = ""
3989 ref = ""
3990 roman = ""
3991 # for line in lines:
3992 # print("LINE:", repr(line))
3993 # print(classify_desc(line))
3994 if len(lines) == 1 and lang_code != "en":
3995 parts = example_splitter_re.split(lines[0])
3996 if ( 3996 ↛ 4004line 3996 didn't jump to line 4004 because the condition on line 3996 was never true
3997 len(parts) > 2
3998 and len(example_template_args) == 1
3999 and any(
4000 ("―" in s) or ("—" in s)
4001 for s in example_template_args[0].values()
4002 )
4003 ):
4004 if nparts := synch_splits_with_args(
4005 lines[0], example_template_args[0]
4006 ):
4007 parts = nparts
4008 if ( 4008 ↛ 4013line 4008 didn't jump to line 4013 because the condition on line 4008 was never true
4009 len(example_template_args) == 1
4010 and "lit" in example_template_args[0]
4011 ):
4012 # ugly brute-force kludge in case there's a lit= arg
4013 literally = example_template_args[0].get("lit", "")
4014 if literally:
4015 literally = (
4016 " (literally, “"
4017 + clean_value(wxr, literally)
4018 + "”)"
4019 )
4020 else:
4021 literally = ""
4022 if ( 4022 ↛ 4061line 4022 didn't jump to line 4061 because the condition on line 4022 was never true
4023 len(example_template_args) == 1
4024 and len(parts) == 2
4025 and len(example_template_args[0])
4026 - (
4027 # horrible kludge to ignore these arguments
4028 # when calculating how many there are
4029 sum(
4030 s in example_template_args[0]
4031 for s in (
4032 "lit", # generates text, but we handle it
4033 "inline",
4034 "noenum",
4035 "nocat",
4036 "sort",
4037 )
4038 )
4039 )
4040 == 3
4041 and clean_value(
4042 wxr, example_template_args[0].get(2, "")
4043 )
4044 == parts[0].strip()
4045 and clean_value(
4046 wxr,
4047 (
4048 example_template_args[0].get(3)
4049 or example_template_args[0].get("translation")
4050 or example_template_args[0].get("t", "")
4051 )
4052 + literally, # in case there's a lit= argument
4053 )
4054 == parts[1].strip()
4055 ):
4056 # {{exampletemplate|ex|Foo bar baz|English translation}}
4057 # is a pretty reliable 'heuristic', so we use it here
4058 # before the others. To be extra sure the template
4059 # doesn't do anything weird, we compare the arguments
4060 # and the output to each other.
4061 lines = [parts[0].strip()]
4062 tr = parts[1].strip()
4063 elif (
4064 len(parts) == 2
4065 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4066 ):
4067 # These other branches just do some simple heuristics w/
4068 # the expanded output of the template (if applicable).
4069 lines = [parts[0].strip()]
4070 tr = parts[1].strip()
4071 elif ( 4071 ↛ 4077line 4071 didn't jump to line 4077 because the condition on line 4071 was never true
4072 len(parts) == 3
4073 and classify_desc2(parts[1])
4074 in ("romanization", "english")
4075 and classify_desc2(parts[2]) in ENGLISH_TEXTS
4076 ):
4077 lines = [parts[0].strip()]
4078 roman = parts[1].strip()
4079 tr = parts[2].strip()
4080 else:
4081 parts = re.split(r"\s+-\s+", lines[0])
4082 if ( 4082 ↛ 4086line 4082 didn't jump to line 4086 because the condition on line 4082 was never true
4083 len(parts) == 2
4084 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4085 ):
4086 lines = [parts[0].strip()]
4087 tr = parts[1].strip()
4088 elif len(lines) > 1:
4089 if any(
4090 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
4091 ) and not (len(example_template_names) == 1):
4092 refs: list[str] = []
4093 for i in range(len(lines)): 4093 ↛ 4099line 4093 didn't jump to line 4099 because the loop on line 4093 didn't complete
4094 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 4094 ↛ 4095line 4094 didn't jump to line 4095 because the condition on line 4094 was never true
4095 break
4096 refs.append(lines[i].strip())
4097 if re.search(r"[]\d:)]\s*$", lines[i]):
4098 break
4099 ref = " ".join(refs)
4100 lines = lines[i + 1 :]
4101 if (
4102 lang_code != "en"
4103 and len(lines) >= 2
4104 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
4105 ):
4106 i = len(lines) - 1
4107 while ( 4107 ↛ 4112line 4107 didn't jump to line 4112 because the condition on line 4107 was never true
4108 i > 1
4109 and classify_desc2(lines[i - 1])
4110 in ENGLISH_TEXTS
4111 ):
4112 i -= 1
4113 tr = "\n".join(lines[i:])
4114 lines = lines[:i]
4115 if len(lines) >= 2:
4116 if classify_desc2(lines[-1]) == "romanization":
4117 roman = lines[-1].strip()
4118 lines = lines[:-1]
4120 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
4121 ref = lines[0]
4122 lines = lines[1:]
4123 elif lang_code != "en" and len(lines) == 2:
4124 cls1 = classify_desc2(lines[0])
4125 cls2 = classify_desc2(lines[1])
4126 if cls2 in ENGLISH_TEXTS and cls1 != "english":
4127 tr = lines[1]
4128 lines = [lines[0]]
4129 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 4129 ↛ 4130line 4129 didn't jump to line 4130 because the condition on line 4129 was never true
4130 tr = lines[0]
4131 lines = [lines[1]]
4132 elif ( 4132 ↛ 4139line 4132 didn't jump to line 4139 because the condition on line 4132 was never true
4133 re.match(r"^[#*]*:+", lines[1])
4134 and classify_desc2(
4135 re.sub(r"^[#*:]+\s*", "", lines[1])
4136 )
4137 in ENGLISH_TEXTS
4138 ):
4139 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
4140 lines = [lines[0]]
4141 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
4142 # Both were classified as English, but
4143 # presumably one is not. Assume first is
4144 # non-English, as that seems more common.
4145 tr = lines[1]
4146 lines = [lines[0]]
4147 elif (
4148 usex_type != "quotation"
4149 and lang_code != "en"
4150 and len(lines) == 3
4151 ):
4152 cls1 = classify_desc2(lines[0])
4153 cls2 = classify_desc2(lines[1])
4154 cls3 = classify_desc2(lines[2])
4155 if (
4156 cls3 == "english"
4157 and cls2 in ("english", "romanization")
4158 and cls1 != "english"
4159 ):
4160 tr = lines[2].strip()
4161 roman = lines[1].strip()
4162 lines = [lines[0].strip()]
4163 elif ( 4163 ↛ 4171line 4163 didn't jump to line 4171 because the condition on line 4163 was never true
4164 usex_type == "quotation"
4165 and lang_code != "en"
4166 and len(lines) > 2
4167 ):
4168 # for x in lines:
4169 # print(" LINE: {}: {}"
4170 # .format(classify_desc2(x), x))
4171 if re.match(r"^[#*]*:+\s*$", lines[1]):
4172 ref = lines[0]
4173 lines = lines[2:]
4174 cls1 = classify_desc2(lines[-1])
4175 if cls1 == "english":
4176 i = len(lines) - 1
4177 while (
4178 i > 1
4179 and classify_desc2(lines[i - 1])
4180 in ENGLISH_TEXTS  # membership test; "==" against the tuple was never true
4181 ):
4182 i -= 1
4183 tr = "\n".join(lines[i:])
4184 lines = lines[:i]
4186 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
4187 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
4188 tr = re.sub(r"^[#*:]+\s*", "", tr)
4189 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
4190 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
4191 ref = re.sub(r"^[#*:]+\s*", "", ref)
4192 ref = re.sub(
4193 r", (volume |number |page )?“?"
4194 r"\(please specify ([^)]|\(s\))*\)”?|"
4195 ", text here$",
4196 "",
4197 ref,
4198 )
4199 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
4200 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
4201 subtext = "\n".join(x for x in lines if x)
4202 if not tr and lang_code != "en":
4203 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
4204 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 4204 ↛ 4205line 4204 didn't jump to line 4205 because the condition on line 4204 was never true
4205 tr = m.group(2)
4206 subtext = subtext[: m.start()] + m.group(1)
4207 elif lines:
4208 parts = re.split(r"\s*[―—]+\s*", lines[0])
4209 if ( 4209 ↛ 4213line 4209 didn't jump to line 4213 because the condition on line 4209 was never true
4210 len(parts) == 2
4211 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4212 ):
4213 subtext = parts[0].strip()
4214 tr = parts[1].strip()
4215 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
4216 subtext = re.sub(
4217 r"(please add an English translation of "
4218 r"this (quote|usage example))",
4219 "",
4220 subtext,
4221 )
4222 subtext = re.sub(
4223 r"\s*→New International Version " "translation$",
4224 "",
4225 subtext,
4226 ) # e.g. pis/Tok Pisin (Bible)
4227 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
4228 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
4229 note = None
4230 m = re.match(r"^\(([^)]*)\):\s+", subtext)
4231 if ( 4231 ↛ 4239line 4231 didn't jump to line 4239 because the condition on line 4231 was never true
4232 m is not None
4233 and lang_code != "en"
4234 and (
4235 m.group(1).startswith("with ")
4236 or classify_desc2(m.group(1)) == "english"
4237 )
4238 ):
4239 note = m.group(1)
4240 subtext = subtext[m.end() :]
4241 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
4242 ref = re.sub(r",\s*→ISBN", "", ref)
4243 ref = ref.strip()
4244 if ref.endswith(":") or ref.endswith(","):
4245 ref = ref[:-1].strip()
4246 ref = re.sub(r"\s+,\s+", ", ", ref)
4247 ref = re.sub(r"\s+", " ", ref)
4248 if ref and not subtext: 4248 ↛ 4249line 4248 didn't jump to line 4249 because the condition on line 4248 was never true
4249 subtext = ref
4250 ref = ""
4251 if subtext:
4252 dt: ExampleData = {"text": subtext}
4253 if ref:
4254 dt["ref"] = ref
4255 if tr:
4256 dt["english"] = tr
4257 if usex_type:
4258 dt["type"] = usex_type
4259 if note: 4259 ↛ 4260line 4259 didn't jump to line 4260 because the condition on line 4259 was never true
4260 dt["note"] = note
4261 if roman:
4262 dt["roman"] = roman
4263 if ruby:
4264 dt["ruby"] = ruby
4265 examples.append(dt)
4267 return examples
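# --- Editorial sketch (not part of page.py) ------------------------------
# Rough shape of one ExampleData dict appended by extract_examples() above;
# optional keys ("ref", "roman", "ruby", "note") appear only when the
# corresponding pieces were found.  The values are invented.
_demo_example_data = {
    "text": "Koira haukkuu.",
    "english": "The dog is barking.",
    "type": "example",
}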
4269 # Main code of parse_language()
4270 # Process the section
4271 stack.append(language)
4272 process_children(langnode, None)
4273 stack.pop()
4275 # Finalize word entries
4276 push_etym()
4277 ret = []
4278 for data in page_datas:
4279 merge_base(data, base_data)
4280 ret.append(data)
4282 # Copy all tags to word senses
4283 for data in ret:
4284 if "senses" not in data: 4284 ↛ 4285line 4284 didn't jump to line 4285 because the condition on line 4284 was never true
4285 continue
4286 # WordData should not have a 'tags' field, but if it does, it's
4287 # deleted and its contents are moved into each sense;
4288 # that's why the type ignores.
4289 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4290 if "tags" in data: 4290 ↛ 4291line 4290 didn't jump to line 4291 because the condition on line 4290 was never true
4291 del data["tags"] # type: ignore[typeddict-item]
4292 for sense in data["senses"]:
4293 data_extend(sense, "tags", tags)
4295 return ret
4298def parse_wikipedia_template(
4299 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
4300) -> None:
4301 """Helper function for parsing {{wikipedia|...}} and related templates."""
4302 assert isinstance(wxr, WiktextractContext)
4303 assert isinstance(data, dict)
4304 assert isinstance(ht, dict)
4305 langid = clean_node(wxr, data, ht.get("lang", ()))
4306 pagename = (
4307 clean_node(wxr, data, ht.get(1, ()))
4308 or wxr.wtp.title
4309 or "MISSING_PAGE_TITLE"
4310 )
4311 if langid:
4312 data_append(data, "wikipedia", langid + ":" + pagename)
4313 else:
4314 data_append(data, "wikipedia", pagename)
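# --- Editorial sketch (not part of page.py) ------------------------------
# Standalone mirror of the string built by parse_wikipedia_template() above;
# the helper name is an assumption for this sketch only.
def demo_wikipedia_entry(langid: str, pagename: str, page_title: str) -> str:
    # pagename falls back to the page title when the template has no
    # positional argument; a lang= prefix becomes "xx:Pagename".
    pagename = pagename or page_title
    return f"{langid}:{pagename}" if langid else pagename

# demo_wikipedia_entry("fi", "Kissa", "kissa") -> "fi:Kissa"
# demo_wikipedia_entry("", "", "cat")          -> "cat"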
4317def parse_top_template(
4318 wxr: WiktextractContext, node: WikiNode, data: WordData
4319) -> None:
4320 """Parses a template that occurs on the top-level in a page, before any
4321 language subtitles."""
4322 assert isinstance(wxr, WiktextractContext)
4323 assert isinstance(node, WikiNode)
4324 assert isinstance(data, dict)
4326 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
4327 if name in wikipedia_templates:
4328 parse_wikipedia_template(wxr, data, ht)
4329 return None
4330 if is_panel_template(wxr, name):
4331 return ""
4332 if name in ("reconstruction",): 4332 ↛ 4333line 4332 didn't jump to line 4333 because the condition on line 4332 was never true
4333 return ""
4334 if name.lower() == "also" or name.lower().startswith("also/"):
4335 # XXX shows related words that might really have been the intended
4336 # word, capture them
4337 return ""
4338 if name == "see also": 4338 ↛ 4340line 4338 didn't jump to line 4340 because the condition on line 4338 was never true
4339 # XXX capture
4340 return ""
4341 if name == "cardinalbox": 4341 ↛ 4343line 4341 didn't jump to line 4343 because the condition on line 4341 was never true
4342 # XXX capture
4343 return ""
4344 if name == "character info": 4344 ↛ 4346line 4344 didn't jump to line 4346 because the condition on line 4344 was never true
4345 # XXX capture
4346 return ""
4347 if name == "commonscat": 4347 ↛ 4349line 4347 didn't jump to line 4349 because the condition on line 4347 was never true
4348 # XXX capture link to Wikimedia commons
4349 return ""
4350 if name == "wrongtitle": 4350 ↛ 4353line 4350 didn't jump to line 4353 because the condition on line 4350 was never true
4351 # XXX this should be captured to replace page title with the
4352 # correct title. E.g. ⿰亻革家
4353 return ""
4354 if name == "wikidata": 4354 ↛ 4355line 4354 didn't jump to line 4355 because the condition on line 4354 was never true
4355 arg = clean_node(wxr, data, ht.get(1, ()))
4356 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
4357 data_append(data, "wikidata", arg)
4358 return ""
4359 wxr.wtp.debug(
4360 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
4361 sortid="page/2870",
4362 )
4363 return ""
4365 clean_node(wxr, None, [node], template_fn=top_template_fn)
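# --- Editorial sketch (not part of page.py) ------------------------------
# The only top-level template that contributes data directly in
# top_template_fn above is {{wikidata|...}}; the acceptance test it applies
# is mirrored here (helper name is an assumption for this sketch).
def demo_accept_wikidata(arg: str) -> bool:
    return arg.startswith("Q") or arg.startswith("Lexeme:L")

# demo_accept_wikidata("Q146")          -> True
# demo_accept_wikidata("Lexeme:L1234")  -> True
# demo_accept_wikidata("see Q146")      -> False (ignored)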
4368def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
4369 """Fix subtitle hierarchy to be strict Language -> Etymology ->
4370 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
4371 that are next to each other."""
4373 # Wiktextract issue #620: a Chinese Glyph Origin section placed before an
4374 # Etymology section gets overwritten. In this case, let's just combine the two.
4376 # In Chinese entries, Pronunciation can be preceded at the
4377 # same level (3) by both the Etymology *and* Glyph Origin sections:
4378 # ===Glyph Origin===
4379 # ===Etymology===
4380 # ===Pronunciation===
4381 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
4382 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
4383 # are now level 6
4385 # Known lowercase PoS names are in part_of_speech_map
4386 # Known lowercase linkage section names are in linkage_map
4388 old = re.split(
4389 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
4390 )
4392 parts = []
4393 npar = 4 # Number of parentheses in above expression
4394 parts.append(old[0])
4395 prev_level = None
4396 level = None
4397 skip_level_title = False # When combining etymology sections
4398 for i in range(1, len(old), npar + 1):
4399 left = old[i]
4400 right = old[i + npar - 1]
4401 # remove Wikilinks in title
4402 title = re.sub(r"^\[\[", "", old[i + 1])
4403 title = re.sub(r"\]\]$", "", title)
4404 prev_level = level
4405 level = len(left)
4406 part = old[i + npar]
4407 if level != len(right): 4407 ↛ 4408line 4407 didn't jump to line 4408 because the condition on line 4407 was never true
4408 wxr.wtp.debug(
4409 "subtitle has unbalanced levels: "
4410 "{!r} has {} on the left and {} on the right".format(
4411 title, left, right
4412 ),
4413 sortid="page/2904",
4414 )
4415 lc = title.lower()
4416 if name_to_code(title, "en") != "":
4417 if level > 2: 4417 ↛ 4418line 4417 didn't jump to line 4418 because the condition on line 4417 was never true
4418 wxr.wtp.debug(
4419 "subtitle has language name {} at level {}".format(
4420 title, level
4421 ),
4422 sortid="page/2911",
4423 )
4424 level = 2
4425 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
4426 if level > 3: 4426 ↛ 4427line 4426 didn't jump to line 4427 because the condition on line 4426 was never true
4427 wxr.wtp.debug(
4428 "etymology section {} at level {}".format(title, level),
4429 sortid="page/2917",
4430 )
4431 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
4432 # sections cheek-to-cheek
4433 skip_level_title = True
4434 # Modify the title of the previous ("Glyph Origin") section, in
4435 # case we have a meaningful title like "Etymology 1"
4436 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
4437 level = 3
4438 elif lc.startswith(PRONUNCIATION_TITLE):
4439 # Pronunciation is now a level between POS and Etymology, so
4440 # we need to shift everything down by one
4441 level = 4
4442 elif lc in POS_TITLES:
4443 level = 5
4444 elif lc == TRANSLATIONS_TITLE:
4445 level = 6
4446 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
4447 level = 6
4448 elif lc in INFLECTION_TITLES:
4449 level = 6
4450 elif lc == DESCENDANTS_TITLE:
4451 level = 6
4452 elif title in PROTO_ROOT_DERIVED_TITLES: 4452 ↛ 4453line 4452 didn't jump to line 4453 because the condition on line 4452 was never true
4453 level = 6
4454 elif lc in IGNORED_TITLES:
4455 level = 6
4456 else:
4457 level = 6
4458 if skip_level_title:
4459 skip_level_title = False
4460 parts.append(part)
4461 else:
4462 parts.append("{}{}{}".format("=" * level, title, "=" * level))
4463 parts.append(part)
4464 # print("=" * level, title)
4465 # if level != len(left):
4466 # print(" FIXED LEVEL OF {} {} -> {}"
4467 # .format(title, len(left), level))
4469 text = "".join(parts)
4470 # print(text)
4471 return text
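# --- Editorial sketch (not part of page.py) ------------------------------
# Heading depths forced by fix_subtitle_hierarchy() above, regardless of the
# depth used in the original wikitext (hypothetical example page):
#   ==Finnish==              stays level 2   (language)
#   ===Etymology===          stays level 3
#   ===Pronunciation===   -> ====Pronunciation====       (level 4)
#   ===Noun===            -> =====Noun=====              (level 5)
#   ====Translations====  -> ======Translations======    (level 6)
# Linkage, inflection, Descendants and unrecognized titles also end up at
# level 6, so later parsing sees a strict Language -> Etymology ->
# Pronunciation -> POS -> everything-else nesting.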
4474def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4475 # Skip translation pages
4476 if word.endswith("/" + TRANSLATIONS_TITLE): 4476 ↛ 4477line 4476 didn't jump to line 4477 because the condition on line 4476 was never true
4477 return []
4479 if wxr.config.verbose: 4479 ↛ 4480line 4479 didn't jump to line 4480 because the condition on line 4479 was never true
4480 logger.info(f"Parsing page: {word}")
4482 wxr.config.word = word
4483 wxr.wtp.start_page(word)
4485 # Remove <noinclude> and similar tags from main pages. They
4486 # should not appear there, but at least net/Elfdala has one and it
4487 # is probably not the only one.
4488 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4489 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4490 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4492 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4493 # pages that have, for example, Translations section under Linkage, or
4494 # Translations section on the same level as Noun. Enforce a proper
4495 # hierarchy by manipulating the subtitle levels in certain cases.
4496 text = fix_subtitle_hierarchy(wxr, text)
4498 # Parse the page, pre-expanding those templates that are likely to
4499 # influence parsing
4500 tree = wxr.wtp.parse(
4501 text,
4502 pre_expand=True,
4503 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4504 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4505 )
4506 # from wikitextprocessor.parser import print_tree
4507 # print("PAGE PARSE:", print_tree(tree))
4509 top_data: WordData = {}
4511 # Iterate over top-level titles, which should be languages for normal
4512 # pages
4513 by_lang = defaultdict(list)
4514 for langnode in tree.children:
4515 if not isinstance(langnode, WikiNode):
4516 continue
4517 if langnode.kind == NodeKind.TEMPLATE:
4518 parse_top_template(wxr, langnode, top_data)
4519 continue
4520 if langnode.kind == NodeKind.LINK:
4521 # Some pages have links at top level, e.g., "trees" in Wiktionary
4522 continue
4523 if langnode.kind != NodeKind.LEVEL2: 4523 ↛ 4524line 4523 didn't jump to line 4524 because the condition on line 4523 was never true
4524 wxr.wtp.debug(
4525 f"unexpected top-level node: {langnode}", sortid="page/3014"
4526 )
4527 continue
4528 lang = clean_node(
4529 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4530 )
4531 lang_code = name_to_code(lang, "en")
4532 if lang_code == "": 4532 ↛ 4533line 4532 didn't jump to line 4533 because the condition on line 4532 was never true
4533 wxr.wtp.debug(
4534 f"unrecognized language name: {lang}", sortid="page/3019"
4535 )
4536 if (
4537 wxr.config.capture_language_codes
4538 and lang_code not in wxr.config.capture_language_codes
4539 ):
4540 continue
4541 wxr.wtp.start_section(lang)
4543 # Collect all words from the page.
4544 # print(f"{langnode=}")
4545 datas = parse_language(wxr, langnode, lang, lang_code)
4547 # Propagate fields resulting from top-level templates to this
4548 # part-of-speech.
4549 for data in datas:
4550 if "lang" not in data: 4550 ↛ 4551line 4550 didn't jump to line 4551 because the condition on line 4550 was never true
4551 wxr.wtp.debug(
4552 "internal error -- no lang in data: {}".format(data),
4553 sortid="page/3034",
4554 )
4555 continue
4556 for k, v in top_data.items():
4557 assert isinstance(v, (list, tuple))
4558 data_extend(data, k, v)
4559 by_lang[data["lang"]].append(data)
4561 # XXX this code is clearly out of date. There is no longer a "conjugation"
4562 # field. FIX OR REMOVE.
4563 # Do some post-processing on the words. For example, we may distribute
4564 # conjugation information to all the words.
4565 ret = []
4566 for lang, lang_datas in by_lang.items():
4567 ret.extend(lang_datas)
4569 for x in ret:
4570 if x["word"] != word:
4571 if word.startswith("Unsupported titles/"): 4571 ↛ 4577line 4571 didn't jump to line 4577 because the condition on line 4571 was always true
4572 wxr.wtp.debug(
4573 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4574 sortid="20231101/3578page.py",
4575 )
4576 else:
4577 wxr.wtp.debug(
4578 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4579 sortid="20231101/3582page.py",
4580 )
4581 x["original_title"] = word
4582 # validate tag data
4583 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4584 return ret
4587def recursively_separate_raw_tags(
4588 wxr: WiktextractContext, data: dict[str, Any]
4589) -> None:
4590 if not isinstance(data, dict): 4590 ↛ 4591line 4590 didn't jump to line 4591 because the condition on line 4590 was never true
4591 wxr.wtp.error(
4592 "'data' is not dict; most probably "
4593 "data has a list that contains at least one dict and "
4594 "at least one non-dict item",
4595 sortid="en/page-4016/20240419",
4596 )
4597 return
4598 new_tags: list[str] = []
4599 raw_tags: list[str] = data.get("raw_tags", [])
4600 for field, val in data.items():
4601 if field == "tags":
4602 for tag in val:
4603 if tag not in valid_tags:
4604 raw_tags.append(tag)
4605 else:
4606 new_tags.append(tag)
4607 if isinstance(val, list):
4608 if len(val) > 0 and isinstance(val[0], dict):
4609 for d in val:
4610 recursively_separate_raw_tags(wxr, d)
4611 if "tags" in data and not new_tags:
4612 del data["tags"]
4613 elif new_tags:
4614 data["tags"] = new_tags
4615 if raw_tags:
4616 data["raw_tags"] = raw_tags
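# --- Editorial sketch (not part of page.py) ------------------------------
# Effect of recursively_separate_raw_tags(): at every nesting level, tags
# missing from valid_tags are moved into "raw_tags".  Invented data:
#   before: {"senses": [{"tags": ["plural", "Hà Nội"]}]}
#   after:  {"senses": [{"tags": ["plural"], "raw_tags": ["Hà Nội"]}]}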
4619def process_soft_redirect_template(
4620 wxr: WiktextractContext,
4621 template_node: TemplateNode,
4622 redirect_pages: list[str],
4623) -> bool:
4624 # return `True` if the template is a soft redirect template
4625 if template_node.template_name == "zh-see":
4626 # https://en.wiktionary.org/wiki/Template:zh-see
4627 title = clean_node(
4628 wxr, None, template_node.template_parameters.get(1, "")
4629 )
4630 if title != "": 4630 ↛ 4632line 4630 didn't jump to line 4632 because the condition on line 4630 was always true
4631 redirect_pages.append(title)
4632 return True
4633 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4634 # https://en.wiktionary.org/wiki/Template:ja-see
4635 for key, value in template_node.template_parameters.items():
4636 if isinstance(key, int): 4636 ↛ 4635line 4636 didn't jump to line 4635 because the condition on line 4636 was always true
4637 title = clean_node(wxr, None, value)
4638 if title != "": 4638 ↛ 4635line 4638 didn't jump to line 4635 because the condition on line 4638 was always true
4639 redirect_pages.append(title)
4640 return True
4641 return False
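# --- Editorial sketch (not part of page.py) ------------------------------
# process_soft_redirect_template() only records redirect targets:
#   {{zh-see|媽}}       -> redirect_pages gets "媽", returns True
#   {{ja-see|猫|ねこ}}   -> redirect_pages gets "猫" and "ねこ", returns True
#   any other template  -> returns False (handled elsewhere)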
4644def process_zh_forms_templates(
4645 wxr: WiktextractContext,
4646 template_node: TemplateNode,
4647 base_data: WordData,
4648) -> None:
4649 # https://en.wiktionary.org/wiki/Template:zh-forms
4650 if "forms" not in base_data: 4650 ↛ 4652line 4650 didn't jump to line 4652 because the condition on line 4650 was always true
4651 base_data["forms"] = []
4652 for p_name, p_value in template_node.template_parameters.items():
4653 if not isinstance(p_name, str): 4653 ↛ 4654line 4653 didn't jump to line 4654 because the condition on line 4653 was never true
4654 continue
4655 if re.fullmatch(r"s\d*", p_name):
4656 form_data: FormData = {
4657 "form": clean_node(wxr, None, p_value),
4658 "tags": ["Simplified Chinese"],
4659 }
4660 if len(form_data["form"]) > 0: 4660 ↛ 4652line 4660 didn't jump to line 4652 because the condition on line 4660 was always true
4661 base_data["forms"].append(form_data)
4662 elif re.fullmatch(r"t\d+", p_name): 4662 ↛ 4663line 4662 didn't jump to line 4663 because the condition on line 4662 was never true
4663 form_data = {
4664 "form": clean_node(wxr, None, p_value),
4665 "tags": ["Traditional Chinese"],
4666 }
4667 if len(form_data["form"]) > 0:
4668 base_data["forms"].append(form_data)
4669 elif p_name == "alt": 4669 ↛ 4679line 4669 didn't jump to line 4679 because the condition on line 4669 was always true
4670 for form_text in clean_node(wxr, None, p_value).split(","):
4671 texts = form_text.split("-")
4672 form_data = {"form": texts[0]}
4673 if len(texts) > 1:
4674 # pronunciation data could be added after "-"
4675 # see https://en.wiktionary.org/wiki/新婦
4676 form_data["raw_tags"] = texts[1:]
4677 if len(form_data["form"]) > 0: 4677 ↛ 4670line 4677 didn't jump to line 4670 because the condition on line 4677 was always true
4678 base_data["forms"].append(form_data)
4679 elif p_name == "lit":
4680 lit = clean_node(wxr, None, p_value)
4681 if lit != "":
4682 base_data["literal_meaning"] = lit
4683 if len(base_data["forms"]) == 0:
4684 del base_data["forms"]
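# --- Editorial sketch (not part of page.py) ------------------------------
# Rough mapping performed by process_zh_forms_templates(); the argument
# names follow Template:zh-forms, the concrete word is invented:
#   {{zh-forms|s=头|alt=頭-colloquial|lit=head}} would yield approximately
#     base_data["forms"] == [
#         {"form": "头", "tags": ["Simplified Chinese"]},
#         {"form": "頭", "raw_tags": ["colloquial"]},
#     ]
#     base_data["literal_meaning"] == "head"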