Coverage for src/wiktextract/extractor/en/page.py: 76%
1990 statements
coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Iterator,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .example import extract_example_list_item, extract_template_zh_x
51from .form_descriptions import (
52 classify_desc,
53 decode_tags,
54 distw,
55 parse_alt_or_inflection_of,
56 parse_sense_qualifier,
57 parse_word_head,
58)
59from .inflection import TableContext, parse_inflection_section
60from .info_templates import (
61 INFO_TEMPLATE_FUNCS,
62 parse_info_template_arguments,
63 parse_info_template_node,
64)
65from .linkages import (
66 extract_alt_form_section,
67 extract_zh_dial_template,
68 parse_linkage_item_text,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 DescendantData,
87 ExampleData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
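# For example, HEAD_TAG_RE matches head-template names such as "head",
# "en-noun", "fi-verb-form" or "de-adj-comparative" (language code, hyphen,
# one of the suffixes above), but not unrelated names like "en-IPA".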
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280FLOATING_TABLE_TEMPLATES: set[str] = {
281 # az-suffix-form creates a style=floatright div that is otherwise
282 # deleted; if it is not pre-expanded, we can intercept the template
283 # so we add this set into do_not_pre_expand, and intercept the
284 # templates in parse_part_of_speech
285 "az-suffix-forms",
286 "az-inf-p",
287 "kk-suffix-forms",
288 "ky-suffix-forms",
289 "tr-inf-p",
290 "tr-suffix-forms",
291 "tt-suffix-forms",
292 "uz-suffix-forms",
293}
294# These two should contain template names that should always be
295# pre-expanded when *first* processing the tree, or not pre-expanded
296# so that the templates are left in place with their identifying
297# name intact for later filtering.
299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
302# Additional templates to be expanded in the pre-expand phase
303ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
304 "multitrans",
305 "multitrans-nowiki",
306 "trans-top",
307 "trans-top-also",
308 "trans-bottom",
309 "checktrans-top",
310 "checktrans-bottom",
311 "col1",
312 "col2",
313 "col3",
314 "col4",
315 "col5",
316 "col1-u",
317 "col2-u",
318 "col3-u",
319 "col4-u",
320 "col5-u",
321 "check deprecated lang param usage",
322 "deprecated code",
323 "ru-verb-alt-ё",
324 "ru-noun-alt-ё",
325 "ru-adj-alt-ё",
326 "ru-proper noun-alt-ё",
327 "ru-pos-alt-ё",
328 "ru-alt-ё",
329 "inflection of",
330 "no deprecated lang param usage",
331 "transclude", # these produce sense entries (or other lists)
332 "tcl",
333}
335# Inverse linkage for those that have them
336linkage_inverses: dict[str, str] = {
337 # XXX this is not currently used, move to post-processing
338 "synonyms": "synonyms",
339 "hypernyms": "hyponyms",
340 "hyponyms": "hypernyms",
341 "holonyms": "meronyms",
342 "meronyms": "holonyms",
343 "derived": "derived_from",
344 "coordinate_terms": "coordinate_terms",
345 "troponyms": "hypernyms",
346 "antonyms": "antonyms",
347 "instances": "instance_of",
348 "related": "related",
349}
351# Templates that are used to form panels on pages and that
352# should be ignored in various positions
353PANEL_TEMPLATES: set[str] = {
354 "Character info",
355 "CJKV",
356 "French personal pronouns",
357 "French possessive adjectives",
358 "French possessive pronouns",
359 "Han etym",
360 "Japanese demonstratives",
361 "Latn-script",
362 "LDL",
363 "MW1913Abbr",
364 "Number-encoding",
365 "Nuttall",
366 "Spanish possessive adjectives",
367 "Spanish possessive pronouns",
368 "USRegionDisputed",
369 "Webster 1913",
370 "ase-rfr",
371 "attention",
372 "attn",
373 "beer",
374 "broken ref",
375 "ca-compass",
376 "character info",
377 "character info/var",
378 "checksense",
379 "compass-fi",
380 "copyvio suspected",
381 "delete",
382 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
383 "etystub",
384 "examples",
385 "hu-corr",
386 "hu-suff-pron",
387 "interwiktionary",
388 "ja-kanjitab",
389 "ko-hanja-search",
390 "look",
391 "maintenance box",
392 "maintenance line",
393 "mediagenic terms",
394 "merge",
395 "missing template",
396 "morse links",
397 "move",
398 "multiple images",
399 "no inline",
400 "picdic",
401 "picdicimg",
402 "picdiclabel",
403 "polyominoes",
404 "predidential nomics",
405 "punctuation", # This actually gets pre-expanded
406 "reconstructed",
407 "request box",
408 "rf-sound example",
409 "rfaccents",
410 "rfap",
411 "rfaspect",
412 "rfc",
413 "rfc-auto",
414 "rfc-header",
415 "rfc-level",
416 "rfc-pron-n",
417 "rfc-sense",
418 "rfclarify",
419 "rfd",
420 "rfd-redundant",
421 "rfd-sense",
422 "rfdate",
423 "rfdatek",
424 "rfdef",
425 "rfe",
426 "rfe/dowork",
427 "rfex",
428 "rfexp",
429 "rfform",
430 "rfgender",
431 "rfi",
432 "rfinfl",
433 "rfm",
434 "rfm-sense",
435 "rfp",
436 "rfp-old",
437 "rfquote",
438 "rfquote-sense",
439 "rfquotek",
440 "rfref",
441 "rfscript",
442 "rft2",
443 "rftaxon",
444 "rftone",
445 "rftranslit",
446 "rfv",
447 "rfv-etym",
448 "rfv-pron",
449 "rfv-quote",
450 "rfv-sense",
451 "selfref",
452 "split",
453 "stroke order", # XXX consider capturing this?
454 "stub entry",
455 "t-needed",
456 "tbot entry",
457 "tea room",
458 "tea room sense",
459 # "ttbc", - XXX needed in at least on/Preposition/Translation page
460 "unblock",
461 "unsupportedpage",
462 "video frames",
463 "was wotd",
464 "wrongtitle",
465 "zh-forms",
466 "zh-hanzi-box",
467 "no entry",
468}
470# Template name prefixes used for language-specific panel templates (i.e.,
471# templates that create side boxes or notice boxes or that should generally
472# be ignored).
473PANEL_PREFIXES: set[str] = {
474 "list:compass points/",
475 "list:Gregorian calendar months/",
476 "RQ:",
477}
479# Templates used for wikipedia links.
480wikipedia_templates: set[str] = {
481 "wikipedia",
482 "slim-wikipedia",
483 "w",
484 "W",
485 "swp",
486 "wiki",
487 "Wikipedia",
488 "wtorw",
489}
490for x in PANEL_PREFIXES & wikipedia_templates: # coverage: 490 ↛ 491 (loop never started)
491 print(
492 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
493 x
494 )
495 )
497# Mapping from a template name (without language prefix) for the main word
498# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
499# it could validly occur. This is used as just a sanity check to give
500# warnings about probably incorrect coding in Wiktionary.
501template_allowed_pos_map: dict[str, list[str]] = {
502 "abbr": ["abbrev"],
503 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
504 "plural noun": ["noun", "name"],
505 "plural-noun": ["noun", "name"],
506 "proper noun": ["noun", "name"],
507 "proper-noun": ["name", "noun"],
508 "prop": ["name", "noun"],
509 "verb": ["verb", "phrase"],
510 "gerund": ["verb"],
511 "particle": ["adv", "particle"],
512 "adj": ["adj", "adj_noun"],
513 "pron": ["pron", "noun"],
514 "name": ["name", "noun"],
515 "adv": ["adv", "intj", "conj", "particle"],
516 "phrase": ["phrase", "prep_phrase"],
517 "noun phrase": ["phrase"],
518 "ordinal": ["num"],
519 "number": ["num"],
520 "pos": ["affix", "name", "num"],
521 "suffix": ["suffix", "affix"],
522 "character": ["character"],
523 "letter": ["character"],
524 "kanji": ["character"],
525 "cont": ["abbrev"],
526 "interj": ["intj"],
527 "con": ["conj"],
528 "part": ["particle"],
529 "prep": ["prep", "postp"],
530 "postp": ["postp"],
531 "misspelling": ["noun", "adj", "verb", "adv"],
532 "part-form": ["verb"],
533}
534for k, v in template_allowed_pos_map.items():
535 for x in v:
536 if x not in PARTS_OF_SPEECH: # coverage: 536 ↛ 537 (condition never true)
537 print(
538 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
539 "".format(x, k, v)
540 )
541 assert False
544# Templates ignored during etymology extraction, i.e., these will not be listed
545# in the extracted etymology templates.
546ignored_etymology_templates: list[str] = [
547 "...",
548 "IPAchar",
549 "ipachar",
550 "ISBN",
551 "isValidPageName",
552 "redlink category",
553 "deprecated code",
554 "check deprecated lang param usage",
555 "para",
556 "p",
557 "cite",
558 "Cite news",
559 "Cite newsgroup",
560 "cite paper",
561 "cite MLLM 1976",
562 "cite journal",
563 "cite news/documentation",
564 "cite paper/documentation",
565 "cite video game",
566 "cite video game/documentation",
567 "cite newsgroup",
568 "cite newsgroup/documentation",
569 "cite web/documentation",
570 "cite news",
571 "Cite book",
572 "Cite-book",
573 "cite book",
574 "cite web",
575 "cite-usenet",
576 "cite-video/documentation",
577 "Cite-journal",
578 "rfe",
579 "catlangname",
580 "cln",
581 "langname-lite",
582 "no deprecated lang param usage",
583 "mention",
584 "m",
585 "m-self",
586 "link",
587 "l",
588 "ll",
589 "l-self",
590]
591# Regexp for matching ignored etymology template names. This adds certain
592# prefixes to the names listed above.
593ignored_etymology_templates_re = re.compile(
594 r"^((cite-|R:|RQ:).*|"
595 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
596 + r")$"
597)
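# For example, this matches the listed names exactly ("m", "cln", "cite web")
# as well as anything beginning with "cite-", "R:" or "RQ:", such as
# "R:Webster 1913" or "cite-book".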
599# Regexp for matching ignored descendants template names. Right now we just
600# copy the ignored etymology templates
601ignored_descendants_templates_re = ignored_etymology_templates_re
603# Set of template names that are used to define usage examples. If the usage
604# example contains one of these templates, then its type is set to
605# "example"
606usex_templates: set[str] = {
607 "afex",
608 "affixusex",
609 "co", # {{collocation}} acts like an example template, specifically for
610 # pairs or combinations of words that are more common than you'd
611 # expect at random; hlavní#Czech
612 "coi",
613 "collocation",
614 "el-example",
615 "el-x",
616 "example",
617 "examples",
618 "he-usex",
619 "he-x",
620 "hi-usex",
621 "hi-x",
622 "ja-usex-inline",
623 "ja-usex",
624 "ja-x",
625 "jbo-example",
626 "jbo-x",
627 "km-usex",
628 "km-x",
629 "ko-usex",
630 "ko-x",
631 "lo-usex",
632 "lo-x",
633 "ne-x",
634 "ne-usex",
635 "prefixusex",
636 "ryu-usex",
637 "ryu-x",
638 "shn-usex",
639 "shn-x",
640 "suffixusex",
641 "th-usex",
642 "th-x",
643 "ur-usex",
644 "ur-x",
645 "usex",
646 "usex-suffix",
647 "ux",
648 "uxi",
649}
651stop_head_at_these_templates: set[str] = {
652 "category",
653 "cat",
654 "topics",
655 "catlangname",
656 "c",
657 "C",
658 "top",
659 "cln",
660}
662# Set of template names that are used to define quotation examples. If the
663# usage example contains one of these templates, then its type is set to
664# "quotation".
665quotation_templates: set[str] = {
666 "collapse-quote",
667 "quote-av",
668 "quote-book",
669 "quote-GYLD",
670 "quote-hansard",
671 "quotei",
672 "quote-journal",
673 "quotelite",
674 "quote-mailing list",
675 "quote-meta",
676 "quote-newsgroup",
677 "quote-song",
678 "quote-text",
679 "quote",
680 "quote-us-patent",
681 "quote-video game",
682 "quote-web",
683 "quote-wikipedia",
684 "wikiquote",
685 "Wikiquote",
686}
688taxonomy_templates = {
689 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
690 "taxfmt",
691 "taxlink",
692 "taxlink2",
693 "taxlinknew",
694 "taxlook",
695}
697# Template name component to linkage section listing. Integer section means
698# default section, starting at that argument.
699# XXX not used anymore, except for the first elements: moved to
700# template_linkages
701# template_linkage_mappings: list[list[Union[str, int]]] = [
702# ["syn", "synonyms"],
703# ["synonyms", "synonyms"],
704# ["ant", "antonyms"],
705# ["antonyms", "antonyms"],
706# ["hyp", "hyponyms"],
707# ["hyponyms", "hyponyms"],
708# ["der", "derived"],
709# ["derived terms", "derived"],
710# ["coordinate terms", "coordinate_terms"],
711# ["rel", "related"],
712# ["col", 2],
713# ]
715# Template names; this was extracted from template_linkage_mappings,
716# because the code using template_linkage_mappings was actually not used
717# (but not removed).
718template_linkages_to_ignore_in_examples: set[str] = {
719 "syn",
720 "synonyms",
721 "ant",
722 "antonyms",
723 "hyp",
724 "hyponyms",
725 "der",
726 "derived terms",
727 "coordinate terms",
728 "cot",
729 "rel",
730 "col",
731 "inline alt forms",
732 "alti",
733 "comeronyms",
734 "holonyms",
735 "holo",
736 "hypernyms",
737 "hyper",
738 "meronyms",
739 "mero",
740 "troponyms",
741 "perfectives",
742 "pf",
743 "imperfectives",
744 "impf",
745 "syndiff",
746 "synsee",
747 # not linkage nor example templates
748 "sense",
749 "s",
750 "color panel",
751 "colour panel",
752}
754# Maps template name used in a word sense to a linkage field that it adds.
755sense_linkage_templates: dict[str, str] = {
756 "syn": "synonyms",
757 "synonyms": "synonyms",
758 "synsee": "synonyms",
759 "syndiff": "synonyms",
760 "hyp": "hyponyms",
761 "hyponyms": "hyponyms",
762 "ant": "antonyms",
763 "antonyms": "antonyms",
764 "alti": "related",
765 "inline alt forms": "related",
766 "coordinate terms": "coordinate_terms",
767 "cot": "coordinate_terms",
768 "comeronyms": "related",
769 "holonyms": "holonyms",
770 "holo": "holonyms",
771 "hypernyms": "hypernyms",
772 "hyper": "hypernyms",
773 "meronyms": "meronyms",
774 "mero": "meronyms",
775 "troponyms": "troponyms",
776 "perfectives": "related",
777 "pf": "related",
778 "imperfectives": "related",
779 "impf": "related",
780}
782sense_linkage_templates_tags: dict[str, list[str]] = {
783 "alti": ["alternative"],
784 "inline alt forms": ["alternative"],
785 "comeronyms": ["comeronym"],
786 "perfectives": ["perfective"],
787 "pf": ["perfective"],
788 "imperfectives": ["imperfective"],
789 "impf": ["imperfective"],
790}
793def decode_html_entities(v: Union[str, int]) -> str:
794 """Decodes HTML entities from a value, converting them to the respective
795 Unicode characters/strings."""
796 if isinstance(v, int):
797 # I changed this to return str(v) instead of v = str(v),
798 # but there might have been the intention to have more logic
799 # here. html.unescape would not do anything special with an integer,
800 # it needs html escape symbols (&xx;).
801 return str(v)
802 return html.unescape(v)
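# For example: decode_html_entities("p&auml;iv&auml;") == "päivä", and
# decode_html_entities(123) == "123".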
805def parse_sense_linkage(
806 wxr: WiktextractContext,
807 data: SenseData,
808 name: str,
809 ht: TemplateArgs,
810 pos: str,
811) -> None:
812 """Parses a linkage (synonym, etc) specified in a word sense."""
813 assert isinstance(wxr, WiktextractContext)
814 assert isinstance(data, dict)
815 assert isinstance(name, str)
816 assert isinstance(ht, dict)
817 field = sense_linkage_templates[name]
818 field_tags = sense_linkage_templates_tags.get(name, [])
819 for i in range(2, 20):
820 w = ht.get(i) or ""
821 w = clean_node(wxr, data, w)
822 is_thesaurus = False
823 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
824 if w.startswith(alias): # coverage: 824 ↛ 825 (condition never true)
825 is_thesaurus = True
826 w = w[len(alias) :]
827 if w != wxr.wtp.title:
828 from ...thesaurus import search_thesaurus
830 lang_code = clean_node(wxr, None, ht.get(1, ""))
831 for t_data in search_thesaurus(
832 wxr.thesaurus_db_conn, w, lang_code, pos, field
833 ):
834 l_data = {
835 "word": t_data.term,
836 "source": "Thesaurus:" + w,
837 }
838 if len(t_data.tags) > 0:
839 l_data["tags"] = t_data.tags
840 if len(t_data.raw_tags) > 0:
841 l_data["raw_tags"] = t_data.raw_tags
842 data_append(data, field, l_data)
843 break
844 if not w:
845 break
846 if is_thesaurus: # coverage: 846 ↛ 847 (condition never true)
847 continue
848 tags: list[str] = []
849 topics: list[str] = []
850 english: Optional[str] = None
851 # Try to find qualifiers for this synonym
852 q = ht.get("q{}".format(i - 1))
853 if q:
854 cls = classify_desc(q)
855 if cls == "tags":
856 tagsets1, topics1 = decode_tags(q)
857 for ts in tagsets1:
858 tags.extend(ts)
859 topics.extend(topics1)
860 elif cls == "english": # coverage: 860 ↛ 866 (condition always true)
861 if english: # coverage: 861 ↛ 862 (condition never true)
862 english += "; " + q
863 else:
864 english = q
865 # Try to find English translation for this synonym
866 t = ht.get("t{}".format(i - 1))
867 if t: # coverage: 867 ↛ 868 (condition never true)
868 if english:
869 english += "; " + t
870 else:
871 english = t
873 # See if the linkage contains a parenthesized alt
874 alt = None
875 m = re.search(r"\(([^)]+)\)$", w)
876 if m: # coverage: 876 ↛ 877 (condition never true)
877 w = w[: m.start()].strip()
878 alt = m.group(1)
880 dt = {"word": w}
881 if field_tags: # coverage: 881 ↛ 882 (condition never true)
882 data_extend(dt, "tags", field_tags)
883 if tags:
884 data_extend(dt, "tags", tags)
885 if topics: # coverage: 885 ↛ 886 (condition never true)
886 data_extend(dt, "topics", topics)
887 if english:
888 dt["english"] = english
889 if alt: # coverage: 889 ↛ 890 (condition never true)
890 dt["alt"] = alt
891 data_append(data, field, dt)
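# Illustrative (hypothetical template arguments): a sense containing
# {{syn|cs|barák|Thesaurus:dům}} would add {"word": "barák"} under
# "synonyms", plus entries pulled from the Thesaurus:dům page with
# "source": "Thesaurus:dům".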
894EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
895example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
896captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
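# For example, example_splitter_re.split("vesi ― water") returns
# ["vesi", "water"], while captured_splitters_re.split("vesi ― water")
# keeps the separator: ["vesi", " ― ", "water"].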
899def synch_splits_with_args(
900 line: str, targs: TemplateArgs
901) -> Optional[list[str]]:
902 """If it looks like there's something weird with how a line of example
903 text has been split, this function will do the splitting after counting
904 occurrences of the splitting regex inside the two main template arguments
905 containing the string data for the original language example and the
906 English translations.
907 """
908 # Previously, we split without capturing groups, but here we want to
909 # keep the original splitting hyphen regex intact.
910 fparts = captured_splitters_re.split(line)
911 new_parts = []
912 # ["First", " – ", "second", " – ", "third..."] from OL argument
913 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
914 new_parts.append("".join(fparts[:first]))
915 # Translation argument
916 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
917 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
918 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
919 new_parts.append("".join(fparts[first + 1 : second]))
921 if all(new_parts): # no empty strings from the above spaghetti
922 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
923 return new_parts
924 else:
925 return None
928QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
929QUALIFIERS_RE = re.compile(QUALIFIERS)
930# (...): ... or (...(...)...): ...
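# For example, QUALIFIERS_RE.match("(colloquial, slang): nonsense") matches
# "(colloquial, slang): " and captures "colloquial, slang" in group 1.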
933def parse_language(
934 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
935) -> list[WordData]:
936 """Iterates over the text of the page, returning words (parts-of-speech)
937 defined on the page one at a time. (Individual word senses for the
938 same part-of-speech are typically encoded in the same entry.)"""
939 # imported here to avoid circular import
940 from .pronunciation import parse_pronunciation
942 assert isinstance(wxr, WiktextractContext)
943 assert isinstance(langnode, WikiNode)
944 assert isinstance(language, str)
945 assert isinstance(lang_code, str)
946 # print("parse_language", language)
948 is_reconstruction = False
949 word: str = wxr.wtp.title # type: ignore[assignment]
950 unsupported_prefix = "Unsupported titles/"
951 if word.startswith(unsupported_prefix):
952 w = word[len(unsupported_prefix) :]
953 if w in unsupported_title_map: # coverage: 953 ↛ 956 (condition always true)
954 word = unsupported_title_map[w]
955 else:
956 wxr.wtp.error(
957 "Unimplemented unsupported title: {}".format(word),
958 sortid="page/870",
959 )
960 word = w
961 elif word.startswith("Reconstruction:"): # coverage: 961 ↛ 962 (condition never true)
962 word = word[word.find("/") + 1 :]
963 is_reconstruction = True
965 base_data: WordData = {
966 "word": word,
967 "lang": language,
968 "lang_code": lang_code,
969 }
970 if is_reconstruction: # coverage: 970 ↛ 971 (condition never true)
971 data_append(base_data, "tags", "reconstruction")
972 sense_data: SenseData = {}
973 pos_data: WordData = {} # For a current part-of-speech
974 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
975 etym_data: WordData = {} # For one etymology
976 pos_datas: list[SenseData] = []
977 level_four_datas: list[WordData] = []
978 etym_datas: list[WordData] = []
979 page_datas: list[WordData] = []
980 have_etym = False
981 inside_level_four = False # This is for checking if the etymology section
982 # or article has a Pronunciation section, for Chinese mostly; because
983 # Chinese articles can have three level-three sections (two etymology
984 # sections and pronunciation sections) one after another, so we need a kludge
985 # to better keep track of whether we're in a normal "etym" or inside a
986 # "level four" (which is what we've turned the level three Pron sections
987 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
988 # a step.
989 stack: list[str] = [] # names of items on the "stack"
991 def merge_base(data: WordData, base: WordData) -> None:
992 for k, v in base.items():
993 # Copy the value to ensure that we don't share lists or
994 # dicts between structures (even nested ones).
995 v = copy.deepcopy(v)
996 if k not in data:
997 # The list was copied above, so this will not create shared ref
998 data[k] = v # type: ignore[literal-required]
999 continue
1000 if data[k] == v: # type: ignore[literal-required]
1001 continue
1002 if ( # coverage: 1002 ↛ 1010 (condition always true)
1003 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1004 or isinstance(
1005 v,
1006 (list, tuple), # Should this be "and"?
1007 )
1008 ):
1009 data[k] = list(data[k]) + list(v) # type: ignore
1010 elif data[k] != v: # type: ignore[literal-required]
1011 wxr.wtp.warning(
1012 "conflicting values for {} in merge_base: "
1013 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1014 sortid="page/904",
1015 )
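# For example, merging base {"tags": ["a"]} into data {"tags": ["b"]} leaves
# data["tags"] == ["b", "a"]; conflicting scalar values only emit a warning
# and keep the value already in data.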
1017 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1018 """Remove unnecessary keys from dict values
1019 in a list comprehension..."""
1020 if key in pron:
1021 pron.pop(key) # type: ignore
1022 return pron
1024 # If the result has sounds, eliminate sounds that have a prefix that
1025 # does not match "word" or one of "forms"
1026 if "sounds" in data and "word" in data:
1027 accepted = [data["word"]]
1028 accepted.extend(f["form"] for f in data.get("forms", dict()))
1029 data["sounds"] = list(
1030 s
1031 for s in data["sounds"]
1032 if "form" not in s or s["form"] in accepted
1033 )
1034 # If the result has sounds, eliminate sounds that have a pos that
1035 # does not match "pos"
1036 if "sounds" in data and "pos" in data:
1037 data["sounds"] = list(
1038 complementary_pop(s, "pos")
1039 for s in data["sounds"]
1040 # "pos" is not a field of SoundData, correctly, so we're
1041 # removing it here. It's a kludge on a kludge on a kludge.
1042 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1043 )
1045 def push_sense() -> bool:
1046 """Starts collecting data for a new word sense. This returns True
1047 if a sense was added."""
1048 nonlocal sense_data
1049 tags = sense_data.get("tags", ())
1050 if (
1051 not sense_data.get("glosses")
1052 and "translation-hub" not in tags
1053 and "no-gloss" not in tags
1054 ):
1055 return False
1057 if ( # coverage: 1057 ↛ 1067 (condition never true)
1058 (
1059 "participle" in sense_data.get("tags", ())
1060 or "infinitive" in sense_data.get("tags", ())
1061 )
1062 and "alt_of" not in sense_data
1063 and "form_of" not in sense_data
1064 and "etymology_text" in etym_data
1065 and etym_data["etymology_text"] != ""
1066 ):
1067 etym = etym_data["etymology_text"]
1068 etym = etym.split(". ")[0]
1069 ret = parse_alt_or_inflection_of(wxr, etym, set())
1070 if ret is not None:
1071 tags, lst = ret
1072 assert isinstance(lst, (list, tuple))
1073 if "form-of" in tags:
1074 data_extend(sense_data, "form_of", lst)
1075 data_extend(sense_data, "tags", tags)
1076 elif "alt-of" in tags:
1077 data_extend(sense_data, "alt_of", lst)
1078 data_extend(sense_data, "tags", tags)
1080 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( # coverage: 1080 ↛ 1083 (condition never true)
1081 "tags", ()
1082 ):
1083 data_append(sense_data, "tags", "no-gloss")
1085 pos_datas.append(sense_data)
1086 sense_data = {}
1087 return True
1089 def push_pos() -> None:
1090 """Starts collecting data for a new part-of-speech."""
1091 nonlocal pos_data
1092 nonlocal pos_datas
1093 push_sense()
1094 if wxr.wtp.subsection:
1095 data: WordData = {"senses": pos_datas}
1096 merge_base(data, pos_data)
1097 level_four_datas.append(data)
1098 pos_data = {}
1099 pos_datas = []
1100 wxr.wtp.start_subsection(None)
1102 def push_level_four_section(clear_sound_data: bool) -> None:
1103 """Starts collecting data for a new level-four section, which
1104 is usually virtual and empty, unless the article has Chinese
1105 'Pronunciation' sections that are etymology-section-like but
1106 under etymology, and at the same level in the source. We modify
1107 the source to demote Pronunciation sections like that to level
1108 4, and other sections one step lower."""
1109 nonlocal level_four_data
1110 nonlocal level_four_datas
1111 nonlocal etym_datas
1112 push_pos()
1113 # print(f"======\n{etym_data=}")
1114 # print(f"======\n{etym_datas=}")
1115 # print(f"======\n{level_four_data=}")
1116 # print(f"======\n{level_four_datas=}")
1117 for data in level_four_datas:
1118 merge_base(data, level_four_data)
1119 etym_datas.append(data)
1120 for data in etym_datas:
1121 merge_base(data, etym_data)
1122 page_datas.append(data)
1123 if clear_sound_data:
1124 level_four_data = {}
1125 level_four_datas = []
1126 etym_datas = []
1128 def push_etym() -> None:
1129 """Starts collecting data for a new etymology."""
1130 nonlocal etym_data
1131 nonlocal etym_datas
1132 nonlocal have_etym
1133 nonlocal inside_level_four
1134 have_etym = True
1135 push_level_four_section(False)
1136 inside_level_four = False
1137 # the etymology section could be under a pronunciation section
1138 etym_data = (
1139 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1140 )
1142 def select_data() -> WordData:
1143 """Selects where to store data (pos or etym) based on whether we
1144 are inside a pos (part-of-speech)."""
1145 # print(f"{wxr.wtp.subsection=}")
1146 # print(f"{stack=}")
1147 if wxr.wtp.subsection is not None:
1148 return pos_data
1149 if inside_level_four:
1150 return level_four_data
1151 if stack[-1] == language:
1152 return base_data
1153 return etym_data
1155 term_label_templates: list[TemplateData] = []
1157 def head_post_template_fn(
1158 name: str, ht: TemplateArgs, expansion: str
1159 ) -> Optional[str]:
1160 """Handles special templates in the head section of a word. Head
1161 section is the text after part-of-speech subtitle and before word
1162 sense list. Typically it generates the bold line for the word, but
1163 may also contain other useful information that often ends in
1164 side boxes. We want to capture some of that additional information."""
1165 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1166 if is_panel_template(wxr, name): # coverage: 1166 ↛ 1169 (condition never true)
1167 # Completely ignore these templates (not even recorded in
1168 # head_templates)
1169 return ""
1170 if name == "head":
1171 # XXX are these also captured in forms? Should this special case
1172 # be removed?
1173 t = ht.get(2, "")
1174 if t == "pinyin": # coverage: 1174 ↛ 1175 (condition never true)
1175 data_append(pos_data, "tags", "Pinyin")
1176 elif t == "romanization": # coverage: 1176 ↛ 1177 (condition never true)
1177 data_append(pos_data, "tags", "romanization")
1178 if (
1179 HEAD_TAG_RE.search(name) is not None
1180 or name in WORD_LEVEL_HEAD_TEMPLATES
1181 ):
1182 args_ht = clean_template_args(wxr, ht)
1183 cleaned_expansion = clean_node(wxr, None, expansion)
1184 dt: TemplateData = {
1185 "name": name,
1186 "args": args_ht,
1187 "expansion": cleaned_expansion,
1188 }
1189 data_append(pos_data, "head_templates", dt)
1190 if name in WORD_LEVEL_HEAD_TEMPLATES:
1191 term_label_templates.append(dt)
1192 # Squash these, their tags are applied to the whole word,
1193 # and some cause problems like "term-label"
1194 return ""
1196 # The following are both captured in head_templates and parsed
1197 # separately
1199 if name in wikipedia_templates:
1200 # Note: various places expect to have content from wikipedia
1201 # templates, so cannot convert this to empty
1202 parse_wikipedia_template(wxr, pos_data, ht)
1203 return None
1205 if name == "number box": # coverage: 1205 ↛ 1207 (condition never true)
1206 # XXX extract numeric value?
1207 return ""
1208 if name == "enum":
1209 # XXX extract?
1210 return ""
1211 if name == "cardinalbox": # coverage: 1211 ↛ 1214 (condition never true)
1212 # XXX extract similar to enum?
1213 # XXX this can also occur in top-level under language
1214 return ""
1215 if name == "Han simplified forms": # coverage: 1215 ↛ 1217 (condition never true)
1216 # XXX extract?
1217 return ""
1218 # if name == "ja-kanji forms":
1219 # # XXX extract?
1220 # return ""
1221 # if name == "vi-readings":
1222 # # XXX extract?
1223 # return ""
1224 # if name == "ja-kanji":
1225 # # XXX extract?
1226 # return ""
1227 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": # coverage: 1227 ↛ 1229 (condition never true)
1228 # XXX extract?
1229 return ""
1231 return None
1233 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1234 """Parses the subsection for a part-of-speech under a language on
1235 a page."""
1236 assert isinstance(posnode, WikiNode)
1237 assert isinstance(pos, str)
1238 # print("parse_part_of_speech", pos)
1239 pos_data["pos"] = pos
1240 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1241 lists: list[list[WikiNode]] = [[]] # list of lists
1242 first_para = True
1243 first_head_tmplt = True
1244 collecting_head = True
1245 start_of_paragraph = True
1247 # XXX extract templates from posnode with recursively_extract
1248 # that break stuff, like ja-kanji or az-suffix-form.
1249 # Do the extraction with a list of template names, combined from
1250 # different lists, then separate out them into different lists
1251 # that are handled at different points of the POS section.
1252 # First, extract az-suffix-form, put it in `inflection`,
1253 # and parse `inflection`'s content when appropriate later.
1254 # The contents of az-suffix-form (and ja-kanji) that generate
1255 # divs with "floatright" in their style gets deleted by
1256 # clean_value, so templates that slip through from here won't
1257 # break anything.
1258 # XXX bookmark
1259 # print("===================")
1260 # print(posnode.children)
1262 floaters, poschildren = recursively_extract(
1263 posnode.children,
1264 lambda x: (
1265 isinstance(x, WikiNode)
1266 and (
1267 (
1268 x.kind == NodeKind.TEMPLATE
1269 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
1270 )
1271 or (
1272 x.kind == NodeKind.LINK
1273 # Need to check for stringiness because some links are
1274 # broken; for example, if a template is missing an
1275 # argument, a link might look like `[[{{{1}}}...]]`
1276 and isinstance(x.largs[0][0], str)
1277 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1278 )
1279 )
1280 ),
1281 )
1282 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1283 tempnode.largs = [["Inflection"]]
1284 tempnode.children = floaters
1285 parse_inflection(tempnode, "Floating Div", pos)
1286 # print(poschildren)
1287 # XXX new above
1289 if not poschildren: # coverage: 1289 ↛ 1290 (condition never true)
1290 if not floaters:
1291 wxr.wtp.debug(
1292 "PoS section without contents",
1293 sortid="en/page/1051/20230612",
1294 )
1295 else:
1296 wxr.wtp.debug(
1297 "PoS section without contents except for a floating table",
1298 sortid="en/page/1056/20230612",
1299 )
1300 return
1302 for node in poschildren:
1303 if isinstance(node, str):
1304 for m in re.finditer(r"\n+|[^\n]+", node):
1305 p = m.group(0)
1306 if p.startswith("\n\n") and pre:
1307 first_para = False
1308 start_of_paragraph = True
1309 break
1310 if p and collecting_head:
1311 pre[-1].append(p)
1312 continue
1313 assert isinstance(node, WikiNode)
1314 kind = node.kind
1315 if kind == NodeKind.LIST:
1316 lists[-1].append(node)
1317 collecting_head = False
1318 start_of_paragraph = True
1319 continue
1320 elif kind in LEVEL_KINDS:
1321 # Stop parsing section if encountering any kind of
1322 # level header (like ===Noun=== or ====Further Reading====).
1323 # At a quick glance, this should be the default behavior,
1324 # but if some kinds of source articles have sub-sub-sections
1325 # that should be parsed XXX it should be handled by changing
1326 # this break.
1327 break
1328 elif collecting_head and kind == NodeKind.LINK:
1329 # We might collect relevant links as they are often pictures
1330 # relating to the word
1331 if len(node.largs[0]) >= 1 and isinstance( # coverage: 1331 ↛ 1346 (condition always true)
1332 node.largs[0][0], str
1333 ):
1334 if node.largs[0][0].startswith( # coverage: 1334 ↛ 1340 (condition never true)
1335 ns_title_prefix_tuple(wxr, "Category")
1336 ):
1337 # [[Category:...]]
1338 # We're at the end of the file, probably, so stop
1339 # here. Otherwise the head will get garbage.
1340 break
1341 if node.largs[0][0].startswith( # coverage: 1341 ↛ 1346 (condition always true)
1342 ns_title_prefix_tuple(wxr, "File")
1343 ):
1344 # Skips file links
1345 continue
1346 start_of_paragraph = False
1347 pre[-1].extend(node.largs[-1])
1348 elif kind == NodeKind.HTML:
1349 if node.sarg == "br":
1350 if pre[-1]: # coverage: 1350 ↛ 1302 (condition always true)
1351 pre.append([]) # Switch to next head
1352 lists.append([]) # Lists parallels pre
1353 collecting_head = True
1354 start_of_paragraph = True
1355 elif collecting_head and node.sarg not in ( # coverage: 1355 ↛ 1361 (condition never true)
1356 "gallery",
1357 "ref",
1358 "cite",
1359 "caption",
1360 ):
1361 start_of_paragraph = False
1362 pre[-1].append(node)
1363 else:
1364 start_of_paragraph = False
1365 elif isinstance(node, TemplateNode):
1366 # XXX Insert code here that disambiguates between
1367 # templates that generate word heads and templates
1368 # that don't.
1369 # There's head_tag_re that seems like a regex meant
1370 # to identify head templates. Too bad it's None.
1372 # ignore {{category}}, {{cat}}... etc.
1373 if node.template_name in stop_head_at_these_templates:
1374 # we've reached a template that should be at the end,
1375 continue
1377 # skip these templates; panel_templates is already used
1378 # to skip certain templates elsewhere, but it also applies to
1379 # head parsing quite well.
1380 # node.largs[0][0] should always be str, but can't type-check
1381 # that.
1382 if is_panel_template(wxr, node.template_name):
1383 continue
1384 # skip these templates
1385 # if node.largs[0][0] in skip_these_templates_in_head:
1386 # first_head_tmplt = False # no first_head_tmplt at all
1387 # start_of_paragraph = False
1388 # continue
1390 if first_head_tmplt and pre[-1]:
1391 first_head_tmplt = False
1392 start_of_paragraph = False
1393 pre[-1].append(node)
1394 elif pre[-1] and start_of_paragraph:
1395 pre.append([]) # Switch to the next head
1396 lists.append([]) # lists parallel pre
1397 collecting_head = True
1398 start_of_paragraph = False
1399 pre[-1].append(node)
1400 else:
1401 pre[-1].append(node)
1402 elif first_para:
1403 start_of_paragraph = False
1404 if collecting_head: # coverage: 1404 ↛ 1302 (condition always true)
1405 pre[-1].append(node)
1406 # XXX use template_fn in clean_node to check that the head macro
1407 # is compatible with the current part-of-speech and generate warning
1408 # if not. Use template_allowed_pos_map.
1410 # Clean up empty pairs, and fix messes with extra newlines that
1411 # separate templates that are followed by lists; wiktextract issue #314
1413 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1414 cleaned_lists: list[list[WikiNode]] = []
1415 pairless_pre_index = None
1417 for pre1, ls in zip(pre, lists):
1418 if pre1 and not ls:
1419 pairless_pre_index = len(cleaned_pre)
1420 if not pre1 and not ls: # coverage: 1420 ↛ 1422 (condition never true)
1421 # skip [] + []
1422 continue
1423 if not ls and all(
1424 (isinstance(x, str) and not x.strip()) for x in pre1
1425 ):
1426 # skip ["\n", " "] + []
1427 continue
1428 if ls and not pre1:
1429 if pairless_pre_index is not None: # coverage: 1429 ↛ 1430 (condition never true)
1430 cleaned_lists[pairless_pre_index] = ls
1431 pairless_pre_index = None
1432 continue
1433 cleaned_pre.append(pre1)
1434 cleaned_lists.append(ls)
1436 pre = cleaned_pre
1437 lists = cleaned_lists
1439 there_are_many_heads = len(pre) > 1
1440 header_tags: list[str] = []
1441 header_topics: list[str] = []
1442 previous_head_had_list = False
1444 if not any(g for g in lists):
1445 process_gloss_without_list(
1446 poschildren, pos, pos_data, header_tags, header_topics
1447 )
1448 else:
1449 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1450 # if len(ls) == 0:
1451 # # don't have gloss list
1452 # # XXX add code here to filter out 'garbage', like text
1453 # # that isn't a head template or head.
1454 # continue
1456 if all(not sl for sl in lists[i:]):
1457 if i == 0: # coverage: 1457 ↛ 1458 (condition never true)
1458 if isinstance(node, str):
1459 wxr.wtp.debug(
1460 "first head without list of senses,"
1461 "string: '{}[...]', {}/{}".format(
1462 node[:20], word, language
1463 ),
1464 sortid="page/1689/20221215",
1465 )
1466 if isinstance(node, WikiNode):
1467 if node.largs and node.largs[0][0] in [
1468 "Han char",
1469 ]:
1470 # just ignore these templates
1471 pass
1472 else:
1473 wxr.wtp.debug(
1474 "first head without "
1475 "list of senses, "
1476 "template node "
1477 "{}, {}/{}".format(
1478 node.largs, word, language
1479 ),
1480 sortid="page/1694/20221215",
1481 )
1482 else:
1483 wxr.wtp.debug(
1484 "first head without list of senses, "
1485 "{}/{}".format(word, language),
1486 sortid="page/1700/20221215",
1487 )
1488 # no break here so that the first head always
1489 # gets processed.
1490 else:
1491 if isinstance(node, str): # coverage: 1491 ↛ 1492 (condition never true)
1492 wxr.wtp.debug(
1493 "later head without list of senses,"
1494 "string: '{}[...]', {}/{}".format(
1495 node[:20], word, language
1496 ),
1497 sortid="page/1708/20221215",
1498 )
1499 if isinstance(node, WikiNode): # coverage: 1499 ↛ 1511 (condition always true)
1500 wxr.wtp.debug(
1501 "later head without list of senses,"
1502 "template node "
1503 "{}, {}/{}".format(
1504 node.sarg if node.sarg else node.largs,
1505 word,
1506 language,
1507 ),
1508 sortid="page/1713/20221215",
1509 )
1510 else:
1511 wxr.wtp.debug(
1512 "later head without list of senses, "
1513 "{}/{}".format(word, language),
1514 sortid="page/1719/20221215",
1515 )
1516 break
1517 head_group = i + 1 if there_are_many_heads else None
1518 # print("parse_part_of_speech: {}: {}: pre={}"
1519 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1521 if previous_head_had_list:
1522 # We use a boolean flag here because we want to be able
1523 # to let the header_tags data pass through after the loop
1524 # is over without accidentally emptying it, if there are
1525 # no pos_datas and we need a dummy data.
1526 header_tags.clear()
1527 header_topics.clear()
1529 process_gloss_header(
1530 pre1, pos, head_group, pos_data, header_tags, header_topics
1531 )
1532 for ln in ls:
1533 # Parse each list associated with this head.
1534 for node in ln.children:
1535 # Parse nodes in l.children recursively.
1536 # The recursion function uses push_sense() to
1537 # add stuff into pos_data, and returns True or
1538 # False if something is added, which bubbles upward.
1539 # If the bubble is "True", then higher levels of
1540 # the recursion will not push_sense(), because
1541 # the data is already pushed into a sub-gloss
1542 # downstream, unless the higher level has examples
1543 # that need to be put somewhere.
1544 common_data: SenseData = {
1545 "tags": list(header_tags),
1546 "topics": list(header_topics),
1547 }
1548 if head_group:
1549 common_data["head_nr"] = head_group
1550 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1552 if len(ls) > 0:
1553 previous_head_had_list = True
1554 else:
1555 previous_head_had_list = False
1557 # If there are no senses extracted, add a dummy sense. We want to
1558 # keep tags extracted from the head for the dummy sense.
1559 push_sense() # Make sure unfinished data pushed, and start clean sense
1560 if len(pos_datas) == 0:
1561 data_extend(sense_data, "tags", header_tags)
1562 data_extend(sense_data, "topics", header_topics)
1563 data_append(sense_data, "tags", "no-gloss")
1564 push_sense()
1566 def process_gloss_header(
1567 header_nodes: list[Union[WikiNode, str]],
1568 pos_type: str,
1569 header_group: Optional[int],
1570 pos_data: WordData,
1571 header_tags: list[str],
1572 header_topics: list[str],
1573 ) -> None:
1574 ruby = []
1575 links: list[str] = []
1577 # process template parse nodes here
1578 new_nodes = []
1579 info_template_data = []
1580 for node in header_nodes:
1581 # print(f"{node=}")
1582 info_data, info_out = parse_info_template_node(wxr, node, "head")
1583 if info_data or info_out:
1584 if info_data: # coverage: 1584 ↛ 1586 (condition always true)
1585 info_template_data.append(info_data)
1586 if info_out: # including just the original node; coverage: 1586 ↛ 1587 (condition never true)
1587 new_nodes.append(info_out)
1588 else:
1589 new_nodes.append(node)
1590 header_nodes = new_nodes
1592 if info_template_data:
1593 if "info_templates" not in pos_data: # coverage: 1593 ↛ 1596 (condition always true)
1594 pos_data["info_templates"] = info_template_data
1595 else:
1596 pos_data["info_templates"].extend(info_template_data)
1598 if not word.isalnum():
1599 # `-` is kosher, add more of these if needed.
1600 if word.replace("-", "").isalnum():
1601 pass
1602 else:
1603 # if the word contains non-letter or -number characters, it
1604 # might have something that messes with split-at-semi-comma; we
1605 # collect links so that we can skip splitting them.
1606 exp = wxr.wtp.parse(
1607 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1608 )
1609 link_nodes, _ = recursively_extract(
1610 exp.children,
1611 lambda x: isinstance(x, WikiNode)
1612 and x.kind == NodeKind.LINK,
1613 )
1614 for ln in link_nodes:
1615 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1616 if not ltext.isalnum():
1617 links.append(ltext)
1618 if word not in links: # coverage: 1618 ↛ 1621 (condition always true)
1619 links.append(word)
1621 if lang_code == "ja":
1622 exp = wxr.wtp.parse(
1623 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1624 )
1625 rub, _ = recursively_extract(
1626 exp.children,
1627 lambda x: isinstance(x, WikiNode)
1628 and x.kind == NodeKind.HTML
1629 and x.sarg == "ruby",
1630 )
1631 if rub is not None: # coverage: 1631 ↛ 1676 (condition always true)
1632 for r in rub:
1633 if TYPE_CHECKING:
1634 # we know the lambda above in recursively_extract
1635 # returns only WikiNodes in rub
1636 assert isinstance(r, WikiNode)
1637 rt = parse_ruby(wxr, r)
1638 if rt is not None:
1639 ruby.append(rt)
1640 elif lang_code == "vi":
1641 # Handle vi-readings templates that have a weird structure for
1642 # Chu Nom Vietnamese character heads
1643 # https://en.wiktionary.org/wiki/Template:vi-readings
1644 new_header_nodes = []
1645 related_readings: list[LinkageData] = []
1646 for node in header_nodes:
1647 if ( # coverage: 1647 ↛ 1671 (condition always true)
1648 isinstance(node, TemplateNode)
1649 and node.template_name == "vi-readings"
1650 ):
1651 print(node.template_parameters)
1652 for parameter, tag in (
1653 ("hanviet", "han-viet-reading"),
1654 ("nom", "nom-reading"),
1655 # we ignore the fanqie parameter "phienthiet"
1656 ):
1657 arg = node.template_parameters.get(parameter)
1658 if arg is not None: # coverage: 1658 ↛ 1652 (condition always true)
1659 text = clean_node(wxr, None, arg)
1660 for w in text.split(","):
1661 # ignore - separated references
1662 if "-" in w:
1663 w = w[: w.index("-")]
1664 w = w.strip()
1665 related_readings.append(
1666 LinkageData(word=w, tags=[tag])
1667 )
1668 continue
1670 # Skip the vi-reading template for the rest of the head parsing
1671 new_header_nodes.append(node)
1672 if len(related_readings) > 0: # coverage: 1672 ↛ 1676 (condition always true)
1673 data_extend(pos_data, "related", related_readings)
1674 header_nodes = new_header_nodes
1676 header_text = clean_node(
1677 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1678 )
1680 if not header_text.strip():
1681 return
1683 term_label_tags: list[str] = []
1684 term_label_topics: list[str] = []
1685 if len(term_label_templates) > 0:
1686 # parse term label templates; if there are other similar kinds
1687 # of templates in headers that you want to squash and apply as
1688 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1689 for templ_data in term_label_templates:
1690 # print(templ_data)
1691 expan = templ_data.get("expansion", "").strip("().,; ")
1692 if not expan: # coverage: 1692 ↛ 1693 (condition never true)
1693 continue
1694 tlb_tagsets, tlb_topics = decode_tags(expan)
1695 for tlb_tags in tlb_tagsets:
1696 if len(tlb_tags) > 0 and not any(
1697 t.startswith("error-") for t in tlb_tags
1698 ):
1699 term_label_tags.extend(tlb_tags)
1700 term_label_topics.extend(tlb_topics)
1701 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1703 header_text = re.sub(r"\s+", " ", header_text)
1704 # print(f"{header_text=}")
1705 parse_word_head(
1706 wxr,
1707 pos_type,
1708 header_text,
1709 pos_data,
1710 is_reconstruction,
1711 header_group,
1712 ruby=ruby,
1713 links=links,
1714 )
1715 if "tags" in pos_data:
1716 # pos_data can get "tags" data from some source; type-checkers
1717 # don't like it, so let's ignore it.
1718 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1719 del pos_data["tags"] # type: ignore[typeddict-item]
1720 if len(term_label_tags) > 0:
1721 header_tags.extend(term_label_tags)
1722 if len(term_label_topics) > 0:
1723 header_topics.extend(term_label_topics)
1725 def process_gloss_without_list(
1726 nodes: list[Union[WikiNode, str]],
1727 pos_type: str,
1728 pos_data: WordData,
1729 header_tags: list[str],
1730 header_topics: list[str],
1731 ) -> None:
1732 # gloss text might not be inside a list
1733 header_nodes: list[Union[str, WikiNode]] = []
1734 gloss_nodes: list[Union[str, WikiNode]] = []
1735 for node in strip_nodes(nodes):
1736 if isinstance(node, WikiNode):
1737 if isinstance(node, TemplateNode):
1738 if node.template_name in (
1739 "zh-see",
1740 "ja-see",
1741 "ja-see-kango",
1742 ):
1743 continue # soft redirect
1744 elif (
1745 node.template_name == "head"
1746 or node.template_name.startswith(f"{lang_code}-")
1747 ):
1748 header_nodes.append(node)
1749 continue
1750 elif node.kind in LEVEL_KINDS: # following nodes are not gloss; coverage: 1750 ↛ 1752 (condition always true)
1751 break
1752 gloss_nodes.append(node)
1754 if len(header_nodes) > 0:
1755 process_gloss_header(
1756 header_nodes,
1757 pos_type,
1758 None,
1759 pos_data,
1760 header_tags,
1761 header_topics,
1762 )
1763 if len(gloss_nodes) > 0:
1764 process_gloss_contents(
1765 gloss_nodes,
1766 pos_type,
1767 {"tags": list(header_tags), "topics": list(header_topics)},
1768 )
1770 def parse_sense_node(
1771 node: Union[str, WikiNode], # never receives str
1772 sense_base: SenseData,
1773 pos: str,
1774 ) -> bool:
1775 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1776 Uses push_sense() to attempt adding data to pos_data in the scope
1777 of parse_language() when it reaches deep in the recursion. push_sense()
1778 returns True if it succeeds, and that is bubbled up the stack; if
1779 a sense was added downstream, the higher levels (whose shared data
1780 was already added by a subsense) do not push_sense(), unless it
1781 has examples that need to be put somewhere.
1782 """
1783 assert isinstance(sense_base, dict) # Added to every sense deeper in
1784 if not isinstance(node, WikiNode): # coverage: 1784 ↛ 1786 (condition never true)
1785 # This doesn't seem to ever happen in practice.
1786 wxr.wtp.debug(
1787 "{}: parse_sense_node called with"
1788 "something that isn't a WikiNode".format(pos),
1789 sortid="page/1287/20230119",
1790 )
1791 return False
1793 if node.kind != NodeKind.LIST_ITEM: # coverage: 1793 ↛ 1794 (condition never true)
1794 wxr.wtp.debug(
1795 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1796 )
1797 return False
1799 if node.sarg == ":":
1800 # Skip example entries at the highest level, ones without
1801 # a sense ("...#") above them.
1802 # If node.sarg is exactly and only ":", then it's at
1803 # the highest level; lower levels would have more
1804 # "indentation", like "#:" or "##:"
1805 return False
1807 # If a recursion call succeeds in push_sense(), bubble it up with
1808 # `added`.
1809 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1810 added = False
1812 gloss_template_args: set[str] = set()
1814 # For LISTs and LIST_ITEMS, their argument is something like
1815 # "##" or "##:", and using that we can rudimentarily determine
1816 # list 'depth' if need be, and also what kind of list or
1817 # entry it is; # is for normal glosses, : for examples (indent)
1818 # and * is used for quotations on wiktionary.
1819 current_depth = node.sarg
1821 children = node.children
1823 # subentries, (presumably) a list
1824 # of subglosses below this. The list's
1825 # argument ends with #, and its depth should
1826 # be bigger than parent node.
1827 subentries = [
1828 x
1829 for x in children
1830 if isinstance(x, WikiNode)
1831 and x.kind == NodeKind.LIST
1832 and x.sarg == current_depth + "#"
1833 ]
1835 # sublists of examples and quotations. .sarg
1836 # does not end with "#".
1837 others = [
1838 x
1839 for x in children
1840 if isinstance(x, WikiNode)
1841 and x.kind == NodeKind.LIST
1842 and x.sarg != current_depth + "#"
1843 ]
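# For example, when this node's sarg is "#", subentries above holds its
# "##" sublists while others holds its "#:" and "#*" sublists.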
1845 # the actual contents of this particular node.
1846 # can be a gloss (or a template that expands into
1847 # many glosses which we can't easily pre-expand)
1848 # or could be an "outer gloss" with more specific
1849 # subglosses, or could be a qualifier for the subglosses.
1850 contents = [
1851 x
1852 for x in children
1853 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1854 ]
1855 # If this entry has sublists of entries, we should combine
1856 # gloss information from both the "outer" and sublist content.
1857 # Sometimes the outer gloss
1858 # is more of a non-gloss or a set of tags, sometimes it is a coarse sense
1859 # and the inner glosses are more specific. The outer one
1860 # does not seem to have qualifiers.
1862 # If we have one sublist with one element, treat it
1863 # specially as it may be a Wiktionary error; raise
1864 # that nested element to the same level.
1865 # XXX If need be, this block can be easily removed in
1866 # the current recursive logic, and the result is one sense entry
1867 # with both glosses in the glosses list, as you would
1868 # expect. If the higher entry has examples, there will
1869 # be a higher entry with some duplicated data.
1870 if len(subentries) == 1:
1871 slc = subentries[0].children
1872 if len(slc) == 1:
1873 # copy current node and modify it so it doesn't
1874 # loop infinitely.
1875 cropped_node = copy.copy(node)
1876 cropped_node.children = [
1877 x
1878 for x in children
1879 if not (
1880 isinstance(x, WikiNode)
1881 and x.kind == NodeKind.LIST
1882 and x.sarg == current_depth + "#"
1883 )
1884 ]
1885 added |= parse_sense_node(cropped_node, sense_base, pos)
1886 nonlocal sense_data # Without this kludge, raw_glosses
1887 # data gets duplicated: if the
1888 # top-level (cropped_node) call
1889 # does not push_sense() properly or
1890 # parse_sense_node() returns early,
1891 # sense_data is not reset. This happens
1892 # for example when you have a no-gloss
1893 # string like "(intransitive)":
1894 # no gloss, push_sense() returns early
1895 # and sense_data has data duplicated from
1896 # sense_base.
1897 sense_data = {}
1898 added |= parse_sense_node(slc[0], sense_base, pos)
1899 return added
1901 return process_gloss_contents(
1902 contents,
1903 pos,
1904 sense_base,
1905 subentries,
1906 others,
1907 gloss_template_args,
1908 added,
1909 )
1911 def process_gloss_contents(
1912 contents: list[Union[str, WikiNode]],
1913 pos: str,
1914 sense_base: SenseData,
1915 subentries: list[WikiNode] = [],
1916 others: list[WikiNode] = [],
1917 gloss_template_args: Set[str] = set(),
1918 added: bool = False,
1919 ) -> bool:
1920 def sense_template_fn(
1921 name: str, ht: TemplateArgs, is_gloss: bool = False
1922 ) -> Optional[str]:
1923 # print(f"sense_template_fn: {name}, {ht}")
1924 if name in wikipedia_templates:
1925 # parse_wikipedia_template(wxr, pos_data, ht)
1926 return None
1927 if is_panel_template(wxr, name):
1928 return ""
1929 if name in INFO_TEMPLATE_FUNCS:
1930 info_data, info_exp = parse_info_template_arguments(
1931 wxr, name, ht, "sense"
1932 )
1933 if info_data or info_exp: 1933 ↛ 1939line 1933 didn't jump to line 1939 because the condition on line 1933 was always true
1934 if info_data: 1934 ↛ 1936line 1934 didn't jump to line 1936 because the condition on line 1934 was always true
1935 data_append(sense_base, "info_templates", info_data)
1936 if info_exp and isinstance(info_exp, str): 1936 ↛ 1938line 1936 didn't jump to line 1938 because the condition on line 1936 was always true
1937 return info_exp
1938 return ""
1939 if name in ("defdate",):
1940 date = clean_node(wxr, None, ht.get(1, ()))
1941 if part_two := ht.get(2): 1941 ↛ 1943line 1941 didn't jump to line 1943 because the condition on line 1941 was never true
1942 # Unicode mdash, not '-'
1943 date += "–" + clean_node(wxr, None, part_two)
1944 refs: dict[str, ReferenceData] = {}
1945 # ref, refn, ref2, ref2n, ref3, ref3n
1946 # ref1 not valid
1947 for k, v in sorted(
1948 (k, v) for k, v in ht.items() if isinstance(k, str)
1949 ):
1950 if m := re.match(r"ref(\d?)(n?)", k): 1950 ↛ 1947line 1950 didn't jump to line 1947 because the condition on line 1950 was always true
1951 ref_v = clean_node(wxr, None, v)
1952 if m.group(1) not in refs: # empty string or digit
1953 refs[m.group(1)] = ReferenceData()
1954 if m.group(2):
1955 refs[m.group(1)]["refn"] = ref_v
1956 else:
1957 refs[m.group(1)]["text"] = ref_v
1958 data_append(
1959 sense_base,
1960 "attestations",
1961 AttestationData(date=date, references=list(refs.values())),
1962 )
1963 return ""
1964 if name == "senseid":
1965 langid = clean_node(wxr, None, ht.get(1, ()))
1966 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1967 if re.match(r"Q\d+$", arg):
1968 data_append(sense_base, "wikidata", arg)
1969 data_append(sense_base, "senseid", langid + ":" + arg)
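# (E.g. {{senseid|en|Q5}} above would record wikidata "Q5" and
# senseid "en:Q5" on this sense.)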
1970 if name in sense_linkage_templates:
1971 # print(f"SENSE_TEMPLATE_FN: {name}")
1972 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1973 return ""
1974 if name == "†" or name == "zh-obsolete":
1975 data_append(sense_base, "tags", "obsolete")
1976 return ""
1977 if name in {
1978 "ux",
1979 "uxi",
1980 "usex",
1981 "afex",
1982 "prefixusex",
1983 "ko-usex",
1984 "ko-x",
1985 "hi-x",
1986 "ja-usex-inline",
1987 "ja-x",
1988 "quotei",
1989 "he-x",
1990 "hi-x",
1991 "km-x",
1992 "ne-x",
1993 "shn-x",
1994 "th-x",
1995 "ur-x",
1996 }:
1997 # Usage examples are captured separately below. We don't
1998 # want to expand them into glosses even when unusual coding
1999 # is used in the entry.
2000 # These templates may slip through inside another item, but
2001 # currently we're separating out example entries (..#:)
2002 # well enough that there seems to be very little contamination.
2003 if is_gloss:
2004 wxr.wtp.warning(
2005 "Example template is used for gloss text",
2006 sortid="extractor.en.page.sense_template_fn/1415",
2007 )
2008 else:
2009 return ""
2010 if name == "w": 2010 ↛ 2011line 2010 didn't jump to line 2011 because the condition on line 2010 was never true
2011 if ht.get(2) == "Wp":
2012 return ""
2013 for k, v in ht.items():
2014 v = v.strip()
2015 if v and "<" not in v:
2016 gloss_template_args.add(v)
2017 return None
2019 def extract_link_texts(item: GeneralNode) -> None:
2020 """Recursively extracts link texts from the gloss source. This
2021 information is used to select whether to remove final "." from
2022 form_of/alt_of (e.g., ihm/Hunsrik)."""
2023 if isinstance(item, (list, tuple)):
2024 for x in item:
2025 extract_link_texts(x)
2026 return
2027 if isinstance(item, str):
2028 # There seem to be HTML sections that may further contain
2029 # unparsed links.
2030 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2030 ↛ 2031line 2030 didn't jump to line 2031 because the loop on line 2030 never started
2031 print("ITER:", m.group(0))
2032 v = m.group(1).split("|")[-1].strip()
2033 if v:
2034 gloss_template_args.add(v)
2035 return
2036 if not isinstance(item, WikiNode): 2036 ↛ 2037line 2036 didn't jump to line 2037 because the condition on line 2036 was never true
2037 return
2038 if item.kind == NodeKind.LINK:
2039 v = item.largs[-1]
2040 if ( 2040 ↛ 2046line 2040 didn't jump to line 2046 because the condition on line 2040 was always true
2041 isinstance(v, list)
2042 and len(v) == 1
2043 and isinstance(v[0], str)
2044 ):
2045 gloss_template_args.add(v[0].strip())
2046 for x in item.children:
2047 extract_link_texts(x)
2049 extract_link_texts(contents)
2051 # get the raw text of non-list contents of this node, and other stuff
2052 # like tag and category data added to sense_base
2053 # cast() is a runtime no-op; it only sets the type for the type-checker
2054 partial_template_fn = cast(
2055 TemplateFnCallable,
2056 partial(sense_template_fn, is_gloss=True),
2057 )
2058 rawgloss = clean_node(
2059 wxr,
2060 sense_base,
2061 contents,
2062 template_fn=partial_template_fn,
2063 collect_links=True,
2064 )
2066 if not rawgloss: 2066 ↛ 2067line 2066 didn't jump to line 2067 because the condition on line 2066 was never true
2067 return False
2070 # remove manually typed ordered-list text at the start ("1. ")
2070 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2072 # get stuff like synonyms and categories from "others",
2073 # maybe examples and quotations
2074 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2076 # The gloss could contain templates that produce more list items.
2077 # This happens commonly with, e.g., {{inflection of|...}}. Split
2078 # to parts. However, e.g. Interlingua generates multiple glosses
2079 # in HTML directly without Wikitext markup, so we must also split
2080 # by just newlines.
2081 subglosses = rawgloss.splitlines()
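# (Illustration: a template expanding into "# gloss one\n# gloss two"
# yields two subglosses here; the "#"-branch below then re-parses them
# as proper list items.)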
2083 if len(subglosses) == 0: 2083 ↛ 2084line 2083 didn't jump to line 2084 because the condition on line 2083 was never true
2084 return False
2086 if any(s.startswith("#") for s in subglosses):
2087 subtree = wxr.wtp.parse(rawgloss)
2088 # from wikitextprocessor.parser import print_tree
2089 # print("SUBTREE GENERATED BY TEMPLATE:")
2090 # print_tree(subtree)
2091 new_subentries = [
2092 x
2093 for x in subtree.children
2094 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2095 ]
2097 new_others = [
2098 x
2099 for x in subtree.children
2100 if isinstance(x, WikiNode)
2101 and x.kind == NodeKind.LIST
2102 and not x.sarg.endswith("#")
2103 ]
2105 new_contents = [
2106 clean_node(wxr, [], x)
2107 for x in subtree.children
2108 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2109 ]
2111 subentries = subentries or new_subentries
2112 others = others or new_others
2113 subglosses = new_contents
2114 rawgloss = "".join(subglosses)
2115 # Generate no gloss for translation hub pages, but add the
2116 # "translation-hub" tag for them
2117 if rawgloss == "(This entry is a translation hub.)": 2117 ↛ 2118line 2117 didn't jump to line 2118 because the condition on line 2117 was never true
2118 data_append(sense_data, "tags", "translation-hub")
2119 return push_sense()
2121 # Remove certain substrings specific to outer glosses
2122 strip_ends = [", particularly:"]
2123 for x in strip_ends:
2124 if rawgloss.endswith(x):
2125 rawgloss = rawgloss[: -len(x)].strip()
2126 break
2128 # A single gloss, or possibly an outer gloss.
2129 # Check if the possible outer gloss starts with
2130 # parenthesized tags/topics
2132 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2133 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2134 m = QUALIFIERS_RE.match(rawgloss)
2135 # (...): ... or (...(...)...): ...
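# (Presumably, e.g., "(transitive, slang) to grab" would yield
# q == "transitive, slang" and leave rawgloss == "to grab".)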
2136 if m:
2137 q = m.group(1)
2138 rawgloss = rawgloss[m.end() :].strip()
2139 parse_sense_qualifier(wxr, q, sense_base)
2140 if rawgloss == "A pejorative:": 2140 ↛ 2141line 2140 didn't jump to line 2141 because the condition on line 2140 was never true
2141 data_append(sense_base, "tags", "pejorative")
2142 rawgloss = ""
2143 elif rawgloss == "Short forms.": 2143 ↛ 2144line 2143 didn't jump to line 2144 because the condition on line 2143 was never true
2144 data_append(sense_base, "tags", "abbreviation")
2145 rawgloss = ""
2146 elif rawgloss == "Technical or specialized senses.": 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true
2147 rawgloss = ""
2148 elif rawgloss.startswith("inflection of "):
2149 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2150 if parsed is not None: 2150 ↛ 2159line 2150 didn't jump to line 2159 because the condition on line 2150 was always true
2151 tags, origins = parsed
2152 if origins is not None: 2152 ↛ 2154line 2152 didn't jump to line 2154 because the condition on line 2152 was always true
2153 data_extend(sense_base, "form_of", origins)
2154 if tags is not None: 2154 ↛ 2157line 2154 didn't jump to line 2157 because the condition on line 2154 was always true
2155 data_extend(sense_base, "tags", tags)
2156 else:
2157 data_append(sense_base, "tags", "form-of")
2158 else:
2159 data_append(sense_base, "tags", "form-of")
2160 if rawgloss: 2160 ↛ 2191line 2160 didn't jump to line 2191 because the condition on line 2160 was always true
2161 # This code duplicates a lot of the clean-up operations from later
2162 # in this block. We want to clean up the "supergloss" as much as
2163 # possible, in almost the same way as a normal gloss.
2164 supergloss = rawgloss
2166 if supergloss.startswith("; "): 2166 ↛ 2167line 2166 didn't jump to line 2167 because the condition on line 2166 was never true
2167 supergloss = supergloss[1:].strip()
2169 if supergloss.startswith(("^†", "†")):
2170 data_append(sense_base, "tags", "obsolete")
2171 supergloss = supergloss[2:].strip()
2172 elif supergloss.startswith("^‡"): 2172 ↛ 2173line 2172 didn't jump to line 2173 because the condition on line 2172 was never true
2173 data_extend(sense_base, "tags", ["obsolete", "historical"])
2174 supergloss = supergloss[2:].strip()
2176 # remove [14th century...] style brackets at the end
2177 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2179 if supergloss.startswith((",", ":")):
2180 supergloss = supergloss[1:]
2181 supergloss = supergloss.strip()
2182 if supergloss.startswith("N. of "): 2182 ↛ 2183line 2182 didn't jump to line 2183 because the condition on line 2182 was never true
2183 supergloss = "Name of " + supergloss[6:]
2184 supergloss = supergloss[2:]
2185 data_append(sense_base, "glosses", supergloss)
2186 if supergloss in ("A person:",):
2187 data_append(sense_base, "tags", "g-person")
2189 # The main recursive call (except for the exceptions at the
2190 # start of this function).
2191 for sublist in subentries:
2192 if not ( 2192 ↛ 2195line 2192 didn't jump to line 2195 because the condition on line 2192 was never true
2193 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2194 ):
2195 wxr.wtp.debug(
2196 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2197 f"with items that are not LISTs",
2198 sortid="page/1511/20230119",
2199 )
2200 continue
2201 for item in sublist.children:
2202 if not ( 2202 ↛ 2206line 2202 didn't jump to line 2206 because the condition on line 2202 was never true
2203 isinstance(item, WikiNode)
2204 and item.kind == NodeKind.LIST_ITEM
2205 ):
2206 continue
2207 # copy sense_base to prevent cross-contamination between
2208 # sibling subglosses, and between subglosses and superglosses
2209 sense_base2 = copy.deepcopy(sense_base)
2210 if parse_sense_node(item, sense_base2, pos): 2210 ↛ 2201line 2210 didn't jump to line 2201 because the condition on line 2210 was always true
2211 added = True
2213 # Capture examples.
2214 # This is called after the recursive calls above so that
2215 # sense_base is not contaminated with meta-data from
2216 # example entries for *this* gloss.
2217 examples = []
2218 if wxr.config.capture_examples: 2218 ↛ 2222line 2218 didn't jump to line 2222 because the condition on line 2218 was always true
2219 examples = extract_examples(others, sense_base)
2221 # push_sense() succeeded somewhere down-river, so skip this level
2222 if added:
2223 if examples:
2224 # this higher-up gloss has examples that we do not want to skip
2225 wxr.wtp.debug(
2226 "'{}[...]' gloss has examples we want to keep, "
2227 "but there are subglosses.".format(repr(rawgloss[:30])),
2228 sortid="page/1498/20230118",
2229 )
2230 else:
2231 return True
2233 # Some entries, e.g., "iacebam", have weird sentences in quotes
2234 # after the gloss, but these sentences don't seem to be intended
2235 # as glosses. Skip them.
2236 indexed_subglosses = list(
2237 (i, gl)
2238 for i, gl in enumerate(subglosses)
2239 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2240 )
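# (A subgloss consisting only of a quoted sentence, optionally preceded
# by a parenthesized note, is filtered out here and never becomes a gloss.)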
2242 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2242 ↛ 2243line 2242 didn't jump to line 2243 because the condition on line 2242 was never true
2243 gl = indexed_subglosses[0][1].strip()
2244 if gl.endswith(":"):
2245 gl = gl[:-1].strip()
2246 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2247 if parsed is not None:
2248 infl_tags, infl_dts = parsed
2249 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2250 # Interpret others as a particular form under
2251 # "inflection of"
2252 data_extend(sense_base, "tags", infl_tags)
2253 data_extend(sense_base, "form_of", infl_dts)
2254 indexed_subglosses = indexed_subglosses[1:]
2255 elif not infl_dts:
2256 data_extend(sense_base, "tags", infl_tags)
2257 indexed_subglosses = indexed_subglosses[1:]
2259 # Create senses for remaining subglosses
2260 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2261 gloss = gloss.strip()
2262 if not gloss and len(indexed_subglosses) > 1: 2262 ↛ 2263line 2262 didn't jump to line 2263 because the condition on line 2262 was never true
2263 continue
2264 # Push a new sense (if the last one is not empty)
2265 if push_sense(): 2265 ↛ 2266line 2265 didn't jump to line 2266 because the condition on line 2265 was never true
2266 added = True
2267 # if gloss not in sense_data.get("raw_glosses", ()):
2268 # data_append(sense_data, "raw_glosses", gloss)
2269 if i == 0 and examples:
2270 # In a multi-line gloss, associate examples
2271 # with only one of them.
2272 # XXX or you could use gloss_i == len(indexed_subglosses)
2273 # to associate examples with the *last* one.
2274 data_extend(sense_data, "examples", examples)
2275 if gloss.startswith("; ") and gloss_i > 0: 2275 ↛ 2276line 2275 didn't jump to line 2276 because the condition on line 2275 was never true
2276 gloss = gloss[1:].strip()
2277 # If the gloss starts with †, mark as obsolete
2278 if gloss.startswith("^†"): 2278 ↛ 2279line 2278 didn't jump to line 2279 because the condition on line 2278 was never true
2279 data_append(sense_data, "tags", "obsolete")
2280 gloss = gloss[2:].strip()
2281 elif gloss.startswith("^‡"): 2281 ↛ 2282line 2281 didn't jump to line 2282 because the condition on line 2281 was never true
2282 data_extend(sense_data, "tags", ["obsolete", "historical"])
2283 gloss = gloss[2:].strip()
2284 # Copy data for all senses to this sense
2285 for k, v in sense_base.items():
2286 if isinstance(v, (list, tuple)):
2287 if k != "tags":
2288 # Tags handled below (countable/uncountable special)
2289 data_extend(sense_data, k, v)
2290 else:
2291 assert k not in ("tags", "categories", "topics")
2292 sense_data[k] = v # type:ignore[literal-required]
2293 # Parse the gloss for this particular sense
2294 m = QUALIFIERS_RE.match(gloss)
2295 # (...): ... or (...(...)...): ...
2296 if m:
2297 parse_sense_qualifier(wxr, m.group(1), sense_data)
2298 gloss = gloss[m.end() :].strip()
2300 # Remove common suffix "[from 14th c.]" and similar
2301 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2303 # Check to make sure we don't have unhandled list items in gloss
2304 ofs = max(gloss.find("#"), gloss.find("* "))
2305 if ofs > 10 and "(#)" not in gloss:
2306 wxr.wtp.debug(
2307 "gloss may contain unhandled list items: {}".format(gloss),
2308 sortid="page/1412",
2309 )
2310 elif "\n" in gloss: 2310 ↛ 2311line 2310 didn't jump to line 2311 because the condition on line 2310 was never true
2311 wxr.wtp.debug(
2312 "gloss contains newline: {}".format(gloss),
2313 sortid="page/1416",
2314 )
2316 # Kludge, some glosses have a comma after initial qualifiers in
2317 # parentheses
2318 if gloss.startswith((",", ":")):
2319 gloss = gloss[1:]
2320 gloss = gloss.strip()
2321 if gloss.endswith(":"):
2322 gloss = gloss[:-1].strip()
2323 if gloss.startswith("N. of "): 2323 ↛ 2324line 2323 didn't jump to line 2324 because the condition on line 2323 was never true
2324 gloss = "Name of " + gloss[6:]
2325 if gloss.startswith("†"): 2325 ↛ 2326line 2325 didn't jump to line 2326 because the condition on line 2325 was never true
2326 data_append(sense_data, "tags", "obsolete")
2327 gloss = gloss[1:]
2328 elif gloss.startswith("^†"): 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true
2329 data_append(sense_data, "tags", "obsolete")
2330 gloss = gloss[2:]
2332 # Copy tags from sense_base if any. This will not copy
2333 # countable/uncountable if either was specified in the sense,
2334 # as sometimes both are specified in word head but only one
2335 # in individual senses.
2336 countability_tags = []
2337 base_tags = sense_base.get("tags", ())
2338 sense_tags = sense_data.get("tags", ())
2339 for tag in base_tags:
2340 if tag in ("countable", "uncountable"):
2341 if tag not in countability_tags: 2341 ↛ 2343line 2341 didn't jump to line 2343 because the condition on line 2341 was always true
2342 countability_tags.append(tag)
2343 continue
2344 if tag not in sense_tags:
2345 data_append(sense_data, "tags", tag)
2346 if countability_tags:
2347 if ( 2347 ↛ 2356line 2347 didn't jump to line 2356 because the condition on line 2347 was always true
2348 "countable" not in sense_tags
2349 and "uncountable" not in sense_tags
2350 ):
2351 data_extend(sense_data, "tags", countability_tags)
2353 # If outer gloss specifies a form-of ("inflection of", see
2354 # aquamarine/German), try to parse the inner glosses as
2355 # tags for an inflected form.
2356 if "form-of" in sense_base.get("tags", ()):
2357 parsed = parse_alt_or_inflection_of(
2358 wxr, gloss, gloss_template_args
2359 )
2360 if parsed is not None: 2360 ↛ 2366line 2360 didn't jump to line 2366 because the condition on line 2360 was always true
2361 infl_tags, infl_dts = parsed
2362 if not infl_dts and infl_tags: 2362 ↛ 2366line 2362 didn't jump to line 2366 because the condition on line 2362 was always true
2363 # Interpret as a particular form under "inflection of"
2364 data_extend(sense_data, "tags", infl_tags)
2366 if not gloss: 2366 ↛ 2367line 2366 didn't jump to line 2367 because the condition on line 2366 was never true
2367 data_append(sense_data, "tags", "empty-gloss")
2368 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2369 if ( 2369 ↛ 2380line 2369 didn't jump to line 2380 because the condition on line 2369 was always true
2370 gloss_i == 0
2371 and len(sense_data.get("glosses", tuple())) >= 1
2372 ):
2373 # If we already added a "high-level gloss" from rawgloss and
2374 # this is the same gloss_i, replace that earlier gloss with
2375 # this one if they differ: the rawgloss was not
2376 # cleaned in exactly the same way as this later gloss
2377 sense_data["glosses"][-1] = gloss
2378 else:
2379 # Add the gloss for the sense.
2380 data_append(sense_data, "glosses", gloss)
2382 # Kludge: there are cases (e.g., etc./Swedish) where there are
2383 # two abbreviations in the same sense, both generated by the
2384 # {{abbreviation of|...}} template. Handle these with some magic.
2385 position = 0
2386 split_glosses = []
2387 for m in re.finditer(r"Abbreviation of ", gloss):
2388 if m.start() != position: 2388 ↛ 2387line 2388 didn't jump to line 2387 because the condition on line 2388 was always true
2389 split_glosses.append(gloss[position : m.start()])
2390 position = m.start()
2391 split_glosses.append(gloss[position:])
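# (E.g. "Abbreviation of foo; Abbreviation of bar" is split into
# "Abbreviation of foo; " and "Abbreviation of bar", each handled below.)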
2392 for gloss in split_glosses:
2393 # Check if this gloss describes an alt-of or inflection-of
2394 if (
2395 lang_code != "en"
2396 and " " not in gloss
2397 and distw([word], gloss) < 0.3
2398 ):
2399 # Don't try to parse gloss if it is one word
2400 # that is close to the word itself for non-English words
2401 # (probable translations of a tag/form name)
2402 continue
2403 parsed = parse_alt_or_inflection_of(
2404 wxr, gloss, gloss_template_args
2405 )
2406 if parsed is None:
2407 continue
2408 tags, dts = parsed
2409 if not dts and tags:
2410 data_extend(sense_data, "tags", tags)
2411 continue
2412 for dt in dts: # type:ignore[union-attr]
2413 ftags = list(tag for tag in tags if tag != "form-of")
2414 if "alt-of" in tags:
2415 data_extend(sense_data, "tags", ftags)
2416 data_append(sense_data, "alt_of", dt)
2417 elif "compound-of" in tags: 2417 ↛ 2418line 2417 didn't jump to line 2418 because the condition on line 2417 was never true
2418 data_extend(sense_data, "tags", ftags)
2419 data_append(sense_data, "compound_of", dt)
2420 elif "synonym-of" in tags: 2420 ↛ 2421line 2420 didn't jump to line 2421 because the condition on line 2420 was never true
2421 data_extend(dt, "tags", ftags)
2422 data_append(sense_data, "synonyms", dt)
2423 elif tags and dt.get("word", "").startswith("of "): 2423 ↛ 2424line 2423 didn't jump to line 2424 because the condition on line 2423 was never true
2424 dt["word"] = dt["word"][3:]
2425 data_append(sense_data, "tags", "form-of")
2426 data_extend(sense_data, "tags", ftags)
2427 data_append(sense_data, "form_of", dt)
2428 elif "form-of" in tags: 2428 ↛ 2412line 2428 didn't jump to line 2412 because the condition on line 2428 was always true
2429 data_extend(sense_data, "tags", tags)
2430 data_append(sense_data, "form_of", dt)
2432 if len(sense_data) == 0:
2433 if len(sense_base.get("tags", [])) == 0: 2433 ↛ 2435line 2433 didn't jump to line 2435 because the condition on line 2433 was always true
2434 del sense_base["tags"]
2435 sense_data.update(sense_base)
2436 if push_sense(): 2436 ↛ 2440line 2436 didn't jump to line 2440 because the condition on line 2436 was always true
2437 # push_sense() succeeded in adding a sense to pos_data
2438 added = True
2439 # print("PARSE_SENSE DONE:", pos_datas[-1])
2440 return added
2442 def parse_inflection(
2443 node: WikiNode, section: str, pos: Optional[str]
2444 ) -> None:
2445 """Parses inflection data (declension, conjugation) from the given
2446 page. This retrieves the actual inflection template
2447 parameters, which are very useful for applications that need
2448 to learn the inflection classes and generate inflected
2449 forms."""
2450 assert isinstance(node, WikiNode)
2451 assert isinstance(section, str)
2452 assert pos is None or isinstance(pos, str)
2453 # print("parse_inflection:", node)
2455 if pos is None: 2455 ↛ 2456line 2455 didn't jump to line 2456 because the condition on line 2455 was never true
2456 wxr.wtp.debug(
2457 "inflection table outside part-of-speech", sortid="page/1812"
2458 )
2459 return
2461 def inflection_template_fn(
2462 name: str, ht: TemplateArgs
2463 ) -> Optional[str]:
2464 # print("decl_conj_template_fn", name, ht)
2465 if is_panel_template(wxr, name): 2465 ↛ 2466line 2465 didn't jump to line 2466 because the condition on line 2465 was never true
2466 return ""
2467 if name in ("is-u-mutation",): 2467 ↛ 2470line 2467 didn't jump to line 2470 because the condition on line 2467 was never true
2468 # As an exception to the generic code below, these are
2469 # not to be captured
2470 return None
2471 m = re.search(
2472 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2473 r"declension|inflection|mut|mutation)($|-)",
2474 name,
2475 )
2476 if m:
2477 args_ht = clean_template_args(wxr, ht)
2478 dt = {"name": name, "args": args_ht}
2479 data_append(pos_data, "inflection_templates", dt)
2481 return None
2483 # Convert the subtree back to Wikitext, then expand all and parse,
2484 # capturing templates in the process
2485 text = wxr.wtp.node_to_wikitext(node.children)
2487 # Split text into separate sections for each top-level template
2488 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2489 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2490 # The (?:...) creates a non-capturing regex group; if it was capturing,
2491 # like the group around it, it would create elements in brace_matches,
2492 # including None if it doesn't match.
2493 # 20250114: Added {| and |} into the regex because tables were being
2494 # cut into pieces by this code. Issue #973, introduction of two-part
2495 # book-end templates similar to trans-top and trans-bottom.
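# (E.g. "{{sv-conj-wk|...}}" splits into ['', '{{', 'sv-conj-wk|...', '}}', ''].)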
2496 template_sections = []
2497 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2498 # Because there is the possibility of triple curly braces
2499 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2500 # count nesting depth using pairs of two brackets, but
2501 # instead use singular braces ("{ }").
2502 # Because template delimiters should be balanced, regardless
2503 # of whether {{ or {{{ is used, and because we only care
2504 # about the outer-most delimiters (the highest level template)
2505 # we can just count the single braces when those single
2506 # braces are part of a group.
2507 table_nesting = 0
2508 # However, a stray table ({| ... |}) should always be its own
2509 # section, and should prevent templates from cutting it
2510 # into sections.
2512 # print(f"Parse inflection: {text=}")
2513 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2514 if len(brace_matches) > 1:
2515 tsection: list[str] = []
2516 after_templates = False # kludge to keep any text
2517 # before first template
2518 # with the first template;
2519 # otherwise, text
2520 # goes with preceding template
2521 for m in brace_matches:
2522 if m.startswith("\n; ") and after_templates: 2522 ↛ 2523line 2522 didn't jump to line 2523 because the condition on line 2522 was never true
2523 after_templates = False
2524 template_sections.append(tsection)
2525 tsection = []
2526 tsection.append(m)
2527 elif m.startswith("{{") or m.endswith("{|"):
2528 if (
2529 template_nesting == 0
2530 and after_templates
2531 and table_nesting == 0
2532 ):
2533 template_sections.append(tsection)
2534 tsection = []
2535 # start new section
2536 after_templates = True
2537 if m.startswith("{{"):
2538 template_nesting += 1
2539 else:
2540 # m.endswith("{|")
2541 table_nesting += 1
2542 tsection.append(m)
2543 elif m.startswith("}}") or m.endswith("|}"):
2544 if m.startswith("}}"):
2545 template_nesting -= 1
2546 if template_nesting < 0: 2546 ↛ 2547line 2546 didn't jump to line 2547 because the condition on line 2546 was never true
2547 wxr.wtp.error(
2548 "Negatively nested braces, "
2549 "couldn't split inflection templates, "
2550 "{}/{} section {}".format(
2551 word, language, section
2552 ),
2553 sortid="page/1871",
2554 )
2555 template_sections = [] # use whole text
2556 break
2557 else:
2558 table_nesting -= 1
2559 if table_nesting < 0: 2559 ↛ 2560line 2559 didn't jump to line 2560 because the condition on line 2559 was never true
2560 wxr.wtp.error(
2561 "Negatively nested table braces, "
2562 "couldn't split inflection section, "
2563 "{}/{} section {}".format(
2564 word, language, section
2565 ),
2566 sortid="page/20250114",
2567 )
2568 template_sections = [] # use whole text
2569 break
2570 tsection.append(m)
2571 else:
2572 tsection.append(m)
2573 if tsection: # dangling tsection 2573 ↛ 2581line 2573 didn't jump to line 2581 because the condition on line 2573 was always true
2574 template_sections.append(tsection)
2575 # Why do it this way around? The parser has a preference
2576 # to associate bits outside of tables with the preceding
2577 # table (`after`-variable), so a new tsection begins
2578 # at {{ and everything before it belongs to the previous
2579 # template.
2581 texts = []
2582 if not template_sections:
2583 texts = [text]
2584 else:
2585 for tsection in template_sections:
2586 texts.append("".join(tsection))
2587 if template_nesting != 0: 2587 ↛ 2588line 2587 didn't jump to line 2588 because the condition on line 2587 was never true
2588 wxr.wtp.error(
2589 "Template nesting error: "
2590 "template_nesting = {} "
2591 "couldn't split inflection templates, "
2592 "{}/{} section {}".format(
2593 template_nesting, word, language, section
2594 ),
2595 sortid="page/1896",
2596 )
2597 texts = [text]
2598 for text in texts:
2599 tree = wxr.wtp.parse(
2600 text, expand_all=True, template_fn=inflection_template_fn
2601 )
2603 if not text.strip():
2604 continue
2606 # Parse inflection tables from the section. The data is stored
2607 # under "forms".
2608 if wxr.config.capture_inflections: 2608 ↛ 2598line 2608 didn't jump to line 2598 because the condition on line 2608 was always true
2609 tablecontext = None
2610 m = re.search(r"{{([^}{|]+)\|?", text)
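# The first template's name (e.g. "sv-conj-wk" out of "{{sv-conj-wk|...}}")
# is recorded in a TableContext so the inflection parser can see which
# template generated the table.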
2611 if m:
2612 template_name = m.group(1)
2613 tablecontext = TableContext(template_name)
2615 parse_inflection_section(
2616 wxr,
2617 pos_data,
2618 word,
2619 language,
2620 pos,
2621 section,
2622 tree,
2623 tablecontext=tablecontext,
2624 )
2626 def get_subpage_section(
2627 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2628 ) -> Optional[Union[WikiNode, str]]:
2629 """Loads a subpage of the given page, and finds the section
2630 for the given language, part-of-speech, and section title. This
2631 is used for finding translations and other sections on subpages."""
2632 assert isinstance(language, str)
2633 assert isinstance(title, str)
2634 assert isinstance(subtitle, str)
2635 assert isinstance(seqs, (list, tuple))
2636 for seq in seqs:
2637 for x in seq:
2638 assert isinstance(x, str)
2639 subpage_title = word + "/" + subtitle
2640 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2641 if subpage_content is None:
2642 wxr.wtp.error(
2643 "/translations not found despite "
2644 "{{see translation subpage|...}}",
2645 sortid="page/1934",
2646 )
2647 return None
2649 def recurse(
2650 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2651 ) -> Optional[Union[str, WikiNode]]:
2652 # print(f"seq: {seq}")
2653 if not seq:
2654 return node
2655 if not isinstance(node, WikiNode):
2656 return None
2657 # print(f"node.kind: {node.kind}")
2658 if node.kind in LEVEL_KINDS:
2659 t = clean_node(wxr, None, node.largs[0])
2660 # print(f"t: {t} == seq[0]: {seq[0]}?")
2661 if t.lower() == seq[0].lower():
2662 seq = seq[1:]
2663 if not seq:
2664 return node
2665 for n in node.children:
2666 ret = recurse(n, seq)
2667 if ret is not None:
2668 return ret
2669 return None
2671 tree = wxr.wtp.parse(
2672 subpage_content,
2673 pre_expand=True,
2674 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2675 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2676 )
2677 assert tree.kind == NodeKind.ROOT
2678 for seq in seqs:
2679 ret = recurse(tree, seq)
2680 if ret is None:
2681 wxr.wtp.debug(
2682 "Failed to find subpage section {}/{} seq {}".format(
2683 title, subtitle, seq
2684 ),
2685 sortid="page/1963",
2686 )
2687 return ret
2689 def parse_linkage(
2690 data: WordData, field: str, linkagenode: LevelNode
2691 ) -> None:
2692 assert isinstance(data, dict)
2693 assert isinstance(field, str)
2694 assert isinstance(linkagenode, WikiNode)
2695 # if field == "synonyms":
2696 # print("field", field)
2697 # print("data", data)
2698 # print("children:")
2699 # print(linkagenode.children)
2700 if not wxr.config.capture_linkages: 2700 ↛ 2701line 2700 didn't jump to line 2701 because the condition on line 2700 was never true
2701 return
2702 have_panel_template = False
2703 toplevel_text = []
2704 next_navframe_sense = None # Used for "(sense):" before NavFrame
2706 def parse_linkage_item(
2707 contents: list[Union[str, WikiNode]],
2708 field: str,
2709 sense: Optional[str] = None,
2710 ):
2711 assert isinstance(contents, (list, tuple))
2712 assert isinstance(field, str)
2713 assert sense is None or isinstance(sense, str)
2715 # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
2716 # .format(field, sense, contents))
2718 parts: list[str] = []
2719 ruby: list[tuple[str, str]] = []
2720 urls: list[str] = []
2721 # data about link text; this is used to skip splitting on
2722 # linkage text items that contain stuff like commas; for
2723 # example "Hunde, die bellen, beißen nicht" in article
2724 # beißen is split into "Hunde", "die bellen" etc.
2725 # We take that link text and use it, eventually,
2726 # in split_at_comma_semi to skip splitting on those
2727 # commas.
2728 links_that_should_not_be_split: list[str] = []
2730 def item_recurse(
2731 contents: list[Union[str, WikiNode]], italic=False
2732 ) -> None:
2733 assert isinstance(contents, (list, tuple))
2734 nonlocal sense
2735 nonlocal ruby
2736 nonlocal parts
2737 # print("ITEM_RECURSE:", contents)
2738 for node in contents:
2739 if isinstance(node, str):
2740 parts.append(node)
2741 continue
2742 kind = node.kind
2743 # print("ITEM_RECURSE KIND:", kind,
2744 # node.sarg if node.sarg else node.largs)
2745 if kind == NodeKind.LIST:
2746 if parts: 2746 ↛ 2761line 2746 didn't jump to line 2761 because the condition on line 2746 was always true
2747 sense1: Optional[str]
2748 sense1 = clean_node(wxr, None, parts)
2749 if sense1.endswith(":"):
2750 sense1 = sense1[:-1].strip()
2751 if sense1.startswith("(") and sense1.endswith(")"): 2751 ↛ 2752line 2751 didn't jump to line 2752 because the condition on line 2751 was never true
2752 sense1 = sense1[1:-1].strip()
2753 if sense1.lower() == TRANSLATIONS_TITLE: 2753 ↛ 2754line 2753 didn't jump to line 2754 because the condition on line 2753 was never true
2754 sense1 = None
2755 # print("linkage item_recurse LIST sense1:", sense1)
2756 parse_linkage_recurse(
2757 node.children, field, sense=sense1 or sense
2758 )
2759 parts = []
2760 else:
2761 parse_linkage_recurse(node.children, field, sense)
2762 elif kind in ( 2762 ↛ 2767line 2762 didn't jump to line 2767 because the condition on line 2762 was never true
2763 NodeKind.TABLE,
2764 NodeKind.TABLE_ROW,
2765 NodeKind.TABLE_CELL,
2766 ):
2767 parse_linkage_recurse(node.children, field, sense)
2768 elif kind in ( 2768 ↛ 2772line 2768 didn't jump to line 2772 because the condition on line 2768 was never true
2769 NodeKind.TABLE_HEADER_CELL,
2770 NodeKind.TABLE_CAPTION,
2771 ):
2772 continue
2773 elif kind == NodeKind.HTML: 2773 ↛ 2774line 2773 didn't jump to line 2774 because the condition on line 2773 was never true
2774 classes = (node.attrs.get("class") or "").split()
2775 if node.sarg in ("gallery", "ref", "cite", "caption"):
2776 continue
2777 elif node.sarg == "ruby":
2778 rb = parse_ruby(wxr, node)
2779 if rb:
2780 ruby.append(rb)
2781 parts.append(rb[0])
2782 continue
2783 elif node.sarg == "math":
2784 parts.append(clean_node(wxr, None, node))
2785 continue
2786 elif "interProject" in classes:
2787 continue # These do not seem to be displayed
2788 if "NavFrame" in classes:
2789 parse_linkage_recurse(node.children, field, sense)
2790 else:
2791 item_recurse(node.children, italic=italic)
2792 elif kind == NodeKind.ITALIC:
2793 item_recurse(node.children, italic=True)
2794 elif kind == NodeKind.LINK:
2795 ignore = False
2796 if isinstance(node.largs[0][0], str): 2796 ↛ 2738line 2796 didn't jump to line 2738 because the condition on line 2796 was always true
2797 v1 = node.largs[0][0].strip().lower()
2798 if v1.startswith( 2798 ↛ 2802line 2798 didn't jump to line 2802 because the condition on line 2798 was never true
2799 ns_title_prefix_tuple(wxr, "Category", True)
2800 + ns_title_prefix_tuple(wxr, "File", True)
2801 ):
2802 ignore = True
2803 if not ignore: 2803 ↛ 2738line 2803 didn't jump to line 2738 because the condition on line 2803 was always true
2804 v = node.largs[-1]
2805 if (
2806 len(node.largs) == 1
2807 and len(v) > 0
2808 and isinstance(v[0], str)
2809 and v[0][0] == ":"
2810 ):
2811 v = [v[0][1:]] + list(v[1:]) # type:ignore
2812 if isinstance(v[0], str) and not v[0].isalnum():
2813 links_that_should_not_be_split.append(
2814 "".join(v[0])
2815 ) # type: ignore
2816 item_recurse(v, italic=italic)
2817 elif kind == NodeKind.URL:
2818 if len(node.largs) < 2 and node.largs:
2819 # Naked url captured
2820 urls.extend(node.largs[-1]) # type:ignore[arg-type]
2821 continue
2822 if len(node.largs) == 2: 2822 ↛ 2827line 2822 didn't jump to line 2827 because the condition on line 2822 was always true
2823 # Url from link with text
2824 urls.append(node.largs[0][-1]) # type:ignore[arg-type]
2825 # print(f"{node.largs=!r}")
2826 # print("linkage recurse URL {}".format(node))
2827 item_recurse(node.largs[-1], italic=italic)
2828 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 2828 ↛ 2831line 2828 didn't jump to line 2831 because the condition on line 2828 was always true
2829 item_recurse(node.children, italic=italic)
2830 else:
2831 wxr.wtp.debug(
2832 "linkage item_recurse unhandled {}: {}".format(
2833 node.kind, node
2834 ),
2835 sortid="page/2073",
2836 )
2838 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
2839 # .format(contents))
2841 item_recurse(contents)
2842 item = clean_node(wxr, None, parts)
2843 # print("LINKAGE ITEM CONTENTS:", parts)
2844 # print("CLEANED ITEM: {!r}".format(item))
2845 # print(f"URLS {urls=!r}")
2847 return parse_linkage_item_text(
2848 wxr,
2849 word,
2850 data,
2851 field,
2852 item,
2853 sense,
2854 ruby,
2855 pos_datas,
2856 is_reconstruction,
2857 urls or None,
2858 links_that_should_not_be_split or None,
2859 )
2861 def parse_linkage_recurse(
2862 contents: list[Union[WikiNode, str]],
2863 field: str,
2864 sense: Optional[str],
2865 ) -> None:
2866 assert isinstance(contents, (list, tuple))
2867 assert sense is None or isinstance(sense, str)
2868 nonlocal next_navframe_sense
2869 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
2870 for node in contents:
2871 if isinstance(node, str):
2872 # Ignore top-level text, generally comments before the
2873 # linkages list. However, if no linkages are found, then
2874 # use this for linkages (not all words use bullet points
2875 # for linkages).
2876 toplevel_text.append(node)
2877 continue
2878 assert isinstance(node, WikiNode)
2879 kind = node.kind
2880 # print("PARSE_LINKAGE_RECURSE CHILD", kind)
2881 if kind == NodeKind.LIST:
2882 parse_linkage_recurse(node.children, field, sense)
2883 elif kind == NodeKind.LIST_ITEM:
2884 v = parse_linkage_item(node.children, field, sense)
2885 if v: 2885 ↛ 2889line 2885 didn't jump to line 2889 because the condition on line 2885 was never true
2886 # parse_linkage_item() can return a value that should
2887 # be used as the sense for the follow-on linkages,
2888 # which are typically provided in a table (see 滿)
2889 next_navframe_sense = v
2890 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
2891 parse_linkage_recurse(node.children, field, sense)
2892 elif kind == NodeKind.TABLE_CELL:
2893 parse_linkage_item(node.children, field, sense)
2894 elif kind in (
2895 NodeKind.TABLE_CAPTION,
2896 NodeKind.TABLE_HEADER_CELL,
2897 NodeKind.PREFORMATTED,
2898 NodeKind.BOLD,
2899 ):
2900 continue
2901 elif kind == NodeKind.HTML: 2901 ↛ 2903line 2901 didn't jump to line 2903 because the condition on line 2901 was never true
2902 # Recurse to process inside the HTML for most tags
2903 if node.sarg in ("gallery", "ref", "cite", "caption"):
2904 continue
2905 classes = (node.attrs.get("class") or "").split()
2906 if node.sarg == "li":
2907 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
2908 v = parse_linkage_item(node.children, field, sense)
2909 if v:
2910 next_navframe_sense = v
2911 elif "qualifier-content" in classes:
2912 sense1 = clean_node(wxr, None, node.children)
2913 if sense1.endswith(":"):
2914 sense1 = sense1[:-1].strip()
2915 if sense and sense1:
2916 wxr.wtp.debug(
2917 "linkage qualifier-content on multiple "
2918 "levels: {!r} and {!r}".format(sense, sense1),
2919 sortid="page/2170",
2920 )
2921 parse_linkage_recurse(node.children, field, sense1)
2922 elif "NavFrame" in classes:
2923 # NavFrame uses previously assigned next_navframe_sense
2924 # (from a "(sense):" item) and clears it afterwards
2925 parse_linkage_recurse(
2926 node.children, field, sense or next_navframe_sense
2927 )
2928 next_navframe_sense = None
2929 else:
2930 parse_linkage_recurse(node.children, field, sense)
2931 elif kind in LEVEL_KINDS: 2931 ↛ 2933line 2931 didn't jump to line 2933 because the condition on line 2931 was never true
2932 # Just recurse to any possible subsections
2933 parse_linkage_recurse(node.children, field, sense)
2934 elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
2935 # Skip these on top level; at least sometimes bold is
2936 # used for indicating a subtitle
2937 continue
2938 elif kind == NodeKind.LINK: 2938 ↛ 2944line 2938 didn't jump to line 2944 because the condition on line 2938 was always true
2939 # Recurse into the last argument
2940 # Apparently ":/" is used as a link to "/", so strip
2941 # initial value
2942 parse_linkage_recurse(node.largs[-1], field, sense)
2943 else:
2944 wxr.wtp.debug(
2945 "parse_linkage_recurse unhandled {}: {}".format(
2946 kind, node
2947 ),
2948 sortid="page/2196",
2949 )
2951 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
2952 nonlocal have_panel_template
2953 if is_panel_template(wxr, name):
2954 have_panel_template = True
2955 return ""
2956 return None
2958 # Main body of parse_linkage()
2959 l_nodes = []
2960 l_sense = ""
2961 for node in linkagenode.children:
2962 if (
2963 isinstance(node, TemplateNode)
2964 and node.template_name == "zh-dial"
2965 ):
2966 extract_zh_dial_template(wxr, data, node, l_sense)
2967 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
2968 for list_item in node.find_child(NodeKind.LIST_ITEM):
2969 for t_node in list_item.find_child(NodeKind.TEMPLATE):
2970 if t_node.template_name in ["s", "sense"]:
2971 l_sense = clean_node(wxr, None, t_node).strip(
2972 "(): "
2973 )
2974 l_nodes.append(node)
2975 else:
2976 l_nodes.append(node)
2977 text = wxr.wtp.node_to_wikitext(l_nodes)
2978 parsed = wxr.wtp.parse(
2979 text, expand_all=True, template_fn=linkage_template_fn1
2980 )
2981 parse_linkage_recurse(parsed.children, field, None)
2982 if not data.get(field) and not have_panel_template:
2983 text = "".join(toplevel_text).strip()
2984 if "\n" not in text and "," in text and text.count(",") > 3:
2985 if not text.startswith("See "): 2985 ↛ exitline 2985 didn't return from function 'parse_linkage' because the condition on line 2985 was always true
2986 parse_linkage_item([text], field, None)
2988 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2989 """Parses translations for a word. This may also pull in translations
2990 from separate translation subpages."""
2991 assert isinstance(data, dict)
2992 assert isinstance(xlatnode, WikiNode)
2993 # print("===== PARSE_TRANSLATIONS {} {} {}"
2994 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2995 # print("parse_translations xlatnode={}".format(xlatnode))
2996 if not wxr.config.capture_translations: 2996 ↛ 2997line 2996 didn't jump to line 2997 because the condition on line 2996 was never true
2997 return
2998 sense_parts: list[Union[WikiNode, str]] = []
2999 sense: Optional[str] = None
3001 def parse_translation_item(
3002 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
3003 ) -> None:
3004 nonlocal sense
3005 assert isinstance(contents, list)
3006 assert lang is None or isinstance(lang, str)
3007 # print("PARSE_TRANSLATION_ITEM:", contents)
3009 langcode: Optional[str] = None
3010 if sense is None:
3011 sense = clean_node(wxr, data, sense_parts).strip()
3012 # print("sense <- clean_node: ", sense)
3013 idx = sense.find("See also translations at")
3014 if idx > 0: 3014 ↛ 3015line 3014 didn't jump to line 3015 because the condition on line 3014 was never true
3015 wxr.wtp.debug(
3016 "Skipping translation see also: {}".format(sense),
3017 sortid="page/2361",
3018 )
3019 sense = sense[:idx].strip()
3020 if sense.endswith(":"): 3020 ↛ 3021line 3020 didn't jump to line 3021 because the condition on line 3020 was never true
3021 sense = sense[:-1].strip()
3022 if sense.endswith("—"): 3022 ↛ 3023line 3022 didn't jump to line 3023 because the condition on line 3022 was never true
3023 sense = sense[:-1].strip()
3024 translations_from_template: list[str] = []
3026 def translation_item_template_fn(
3027 name: str, ht: TemplateArgs
3028 ) -> Optional[str]:
3029 nonlocal langcode
3030 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
3031 if is_panel_template(wxr, name):
3032 return ""
3033 if name in ("t+check", "t-check", "t-needed"):
3034 # We ignore these templates. They seem to have outright
3035 # garbage in some entries, and highly variable formatting in
3036 # others. These should be transitory and unreliable
3037 # anyway.
3038 return "__IGNORE__"
3039 if name in ("t", "t+", "t-simple", "tt", "tt+"):
3040 code = ht.get(1)
3041 if code: 3041 ↛ 3051line 3041 didn't jump to line 3051 because the condition on line 3041 was always true
3042 if langcode and code != langcode:
3043 wxr.wtp.debug(
3044 "inconsistent language codes {} vs "
3045 "{} in translation item: {!r} {}".format(
3046 langcode, code, name, ht
3047 ),
3048 sortid="page/2386",
3049 )
3050 langcode = code
3051 tr = ht.get(2)
3052 if tr:
3053 tr = clean_node(wxr, None, [tr])
3054 translations_from_template.append(tr)
3055 return None
3056 if name == "t-egy":
3057 langcode = "egy"
3058 return None
3059 if name == "ttbc":
3060 code = ht.get(1)
3061 if code: 3061 ↛ 3063line 3061 didn't jump to line 3063 because the condition on line 3061 was always true
3062 langcode = code
3063 return None
3064 if name == "trans-see": 3064 ↛ 3065line 3064 didn't jump to line 3065 because the condition on line 3064 was never true
3065 wxr.wtp.error(
3066 "UNIMPLEMENTED trans-see template", sortid="page/2405"
3067 )
3068 return ""
3069 if name.endswith("-top"): 3069 ↛ 3070line 3069 didn't jump to line 3070 because the condition on line 3069 was never true
3070 return ""
3071 if name.endswith("-bottom"): 3071 ↛ 3072line 3071 didn't jump to line 3072 because the condition on line 3071 was never true
3072 return ""
3073 if name.endswith("-mid"): 3073 ↛ 3074line 3073 didn't jump to line 3074 because the condition on line 3073 was never true
3074 return ""
3075 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
3076 # .format(name),
3077 # sortid="page/2414")
3078 return None
3080 sublists = list(
3081 x
3082 for x in contents
3083 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
3084 )
3085 contents = list(
3086 x
3087 for x in contents
3088 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
3089 )
3091 item = clean_node(
3092 wxr, data, contents, template_fn=translation_item_template_fn
3093 )
3094 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
3096 # Parse the translation item.
3097 if item: 3097 ↛ exitline 3097 didn't return from function 'parse_translation_item' because the condition on line 3097 was always true
3098 lang = parse_translation_item_text(
3099 wxr,
3100 word,
3101 data,
3102 item,
3103 sense,
3104 lang,
3105 langcode,
3106 translations_from_template,
3107 is_reconstruction,
3108 )
3110 # Handle sublists. They are frequently used for different
3111 # scripts for the language and different variants of the
3112 # language. We will include the lower-level header as a
3113 # tag in those cases.
3114 for listnode in sublists:
3115 assert listnode.kind == NodeKind.LIST
3116 for node in listnode.children:
3117 if not isinstance(node, WikiNode): 3117 ↛ 3118line 3117 didn't jump to line 3118 because the condition on line 3117 was never true
3118 continue
3119 if node.kind == NodeKind.LIST_ITEM: 3119 ↛ 3116line 3119 didn't jump to line 3116 because the condition on line 3119 was always true
3120 parse_translation_item(node.children, lang=lang)
3122 def parse_translation_template(node: WikiNode) -> None:
3123 assert isinstance(node, WikiNode)
3125 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3126 nonlocal sense_parts
3127 nonlocal sense
3128 if is_panel_template(wxr, name):
3129 return ""
3130 if name == "see also":
3131 # XXX capture
3132 # XXX for example, "/" has a top-level list containing
3133 # see also items, so we should also parse those.
3134 return ""
3135 if name == "trans-see":
3136 # XXX capture
3137 return ""
3138 if name == "see translation subpage": 3138 ↛ 3139line 3138 didn't jump to line 3139 because the condition on line 3138 was never true
3139 sense_parts = []
3140 sense = None
3141 sub = ht.get(1, "")
3142 if sub:
3143 m = re.match(
3144 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
3145 )
3146 else:
3147 m = None
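# Groups of the match above: 1 = numbered etymology header (e.g.
# "Etymology 2"), 2 = the header text without the number, 3 = the
# part of speech; e.g. sub == "Etymology 2: Noun" gives
# ("Etymology 2", "Etymology ", "Noun").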
3148 etym = ""
3149 etym_numbered = ""
3150 pos = ""
3151 if m:
3152 etym_numbered = m.group(1)
3153 etym = m.group(2)
3154 pos = m.group(3)
3155 if not sub:
3156 wxr.wtp.debug(
3157 "no part-of-speech in "
3158 "{{see translation subpage|...}}, "
3159 "defaulting to just wxr.wtp.section "
3160 "(= language)",
3161 sortid="page/2468",
3162 )
3163 # seq sent to get_subpage_section without sub and pos
3164 seq = [
3165 language,
3166 TRANSLATIONS_TITLE,
3167 ]
3168 elif (
3169 m
3170 and etym.lower().strip() in ETYMOLOGY_TITLES
3171 and pos.lower() in POS_TITLES
3172 ):
3173 seq = [
3174 language,
3175 etym_numbered,
3176 pos,
3177 TRANSLATIONS_TITLE,
3178 ]
3179 elif sub.lower() in POS_TITLES:
3180 # seq with sub but not pos
3181 seq = [
3182 language,
3183 sub,
3184 TRANSLATIONS_TITLE,
3185 ]
3186 else:
3187 # seq with sub and pos
3188 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3189 if pos.lower() not in POS_TITLES:
3190 wxr.wtp.debug(
3191 "unhandled see translation subpage: "
3192 "language={} sub={} "
3193 "wxr.wtp.subsection={}".format(
3194 language, sub, wxr.wtp.subsection
3195 ),
3196 sortid="page/2478",
3197 )
3198 seq = [language, sub, pos, TRANSLATIONS_TITLE]
3199 subnode = get_subpage_section(
3200 wxr.wtp.title or "MISSING_TITLE",
3201 TRANSLATIONS_TITLE,
3202 [seq],
3203 )
3204 if subnode is None or not isinstance(subnode, WikiNode):
3205 # Failed to find the normal subpage section
3206 # seq with sub and pos
3207 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3208 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
3209 seqs: list[list[str] | tuple[str, ...]] = [
3210 [TRANSLATIONS_TITLE],
3211 [language, pos],
3212 ]
3213 subnode = get_subpage_section(
3214 wxr.wtp.title or "MISSING_TITLE",
3215 TRANSLATIONS_TITLE,
3216 seqs,
3217 )
3218 if subnode is not None and isinstance(subnode, WikiNode):
3219 parse_translations(data, subnode)
3220 return ""
3221 if name in (
3222 "c",
3223 "C",
3224 "categorize",
3225 "cat",
3226 "catlangname",
3227 "topics",
3228 "top",
3229 "qualifier",
3230 "cln",
3231 ):
3232 # These are expanded in the default way
3233 return None
3234 if name in (
3235 "trans-top",
3236 "trans-top-see",
3237 ):
3238 # XXX capture id from trans-top? Capture sense here
3239 # instead of trying to parse it from expanded content?
3240 if ht.get(1):
3241 sense_parts = []
3242 sense = ht.get(1)
3243 else:
3244 sense_parts = []
3245 sense = None
3246 return None
3247 if name in (
3248 "trans-bottom",
3249 "trans-mid",
3250 "checktrans-mid",
3251 "checktrans-bottom",
3252 ):
3253 return None
3254 if name == "checktrans-top":
3255 sense_parts = []
3256 sense = None
3257 return ""
3258 if name == "trans-top-also":
3259 # XXX capture?
3260 sense_parts = []
3261 sense = None
3262 return ""
3263 wxr.wtp.error(
3264 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3265 name, ht
3266 ),
3267 sortid="page/2517",
3268 )
3269 return ""
3271 wxr.wtp.expand(
3272 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3273 )
3275 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3276 nonlocal sense
3277 nonlocal sense_parts
3278 for node in xlatnode.children:
3279 # print(node)
3280 if isinstance(node, str):
3281 if sense:
3282 if not node.isspace():
3283 wxr.wtp.debug(
3284 "skipping string in the middle of "
3285 "translations: {}".format(node),
3286 sortid="page/2530",
3287 )
3288 continue
3289 # Add a part to the sense
3290 sense_parts.append(node)
3291 sense = None
3292 continue
3293 assert isinstance(node, WikiNode)
3294 kind = node.kind
3295 if kind == NodeKind.LIST:
3296 for item in node.children:
3297 if not isinstance(item, WikiNode): 3297 ↛ 3298line 3297 didn't jump to line 3298 because the condition on line 3297 was never true
3298 continue
3299 if item.kind != NodeKind.LIST_ITEM: 3299 ↛ 3300line 3299 didn't jump to line 3300 because the condition on line 3299 was never true
3300 continue
3301 if item.sarg == ":": 3301 ↛ 3302line 3301 didn't jump to line 3302 because the condition on line 3301 was never true
3302 continue
3303 parse_translation_item(item.children)
3304 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3304 ↛ 3308line 3304 didn't jump to line 3308 because the condition on line 3304 was never true
3305 # Silently skip list items that are just indented; these
3306 # are used for text between translations, such as indicating
3307 # translations that need to be checked.
3308 pass
3309 elif kind == NodeKind.TEMPLATE:
3310 parse_translation_template(node)
3311 elif kind in ( 3311 ↛ 3316line 3311 didn't jump to line 3316 because the condition on line 3311 was never true
3312 NodeKind.TABLE,
3313 NodeKind.TABLE_ROW,
3314 NodeKind.TABLE_CELL,
3315 ):
3316 parse_translation_recurse(node)
3317 elif kind == NodeKind.HTML:
3318 if node.attrs.get("class") == "NavFrame": 3318 ↛ 3324line 3318 didn't jump to line 3324 because the condition on line 3318 was never true
3319 # Reset ``sense_parts`` (and force recomputing
3320 # by clearing ``sense``) as each NavFrame specifies
3321 # its own sense. This helps eliminate garbage coming
3322 # from text at the beginning of the translations
3323 # section.
3324 sense_parts = []
3325 sense = None
3326 # for item in node.children:
3327 # if not isinstance(item, WikiNode):
3328 # continue
3329 # parse_translation_recurse(item)
3330 parse_translation_recurse(node)
3331 elif kind in LEVEL_KINDS:
3332 # Sub-levels will be recursed elsewhere
3333 pass
3334 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3335 parse_translation_recurse(node)
3336 elif kind == NodeKind.PREFORMATTED:
3337 print("parse_translation_recurse: PREFORMATTED:", node)
3338 elif kind == NodeKind.LINK:
3339 arg0 = node.largs[0]
3340 # Kludge: I've seen occasional normal links to translation
3341 # subpages from main pages (e.g., language/English/Noun
3342 # in July 2021) instead of the normal
3343 # {{see translation subpage|...}} template. This should
3344 # handle them. Note: must be careful not to read other
3345 # links, particularly things like in "human being":
3346 # "a human being -- see [[man/translations]]" (group title)
3347 if (
3348 isinstance(arg0, (list, tuple))
3349 and arg0
3350 and isinstance(arg0[0], str)
3351 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3352 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3353 == wxr.wtp.title
3354 ):
3355 wxr.wtp.debug(
3356 "translations subpage link found on main "
3357 "page instead "
3358 "of normal {{see translation subpage|...}}",
3359 sortid="page/2595",
3360 )
3361 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3362 if sub.lower() in POS_TITLES:
3363 seq = [
3364 language,
3365 sub,
3366 TRANSLATIONS_TITLE,
3367 ]
3368 subnode = get_subpage_section(
3369 wxr.wtp.title,
3370 TRANSLATIONS_TITLE,
3371 [seq],
3372 )
3373 if subnode is not None and isinstance(
3374 subnode, WikiNode
3375 ):
3376 parse_translations(data, subnode)
3377 else:
3378 wxr.wtp.error(
3379 "/translations link outside part-of-speech"
3380 )
3382 if (
3383 len(arg0) >= 1
3384 and isinstance(arg0[0], str)
3385 and not arg0[0].lower().startswith("category:")
3386 ):
3387 for x in node.largs[-1]:
3388 if isinstance(x, str):
3389 sense_parts.append(x)
3390 else:
3391 parse_translation_recurse(x)
3392 elif not sense:
3393 sense_parts.append(node)
3394 else:
3395 wxr.wtp.debug(
3396 "skipping text between translation items/senses: "
3397 "{}".format(node),
3398 sortid="page/2621",
3399 )
3401 # Main code of parse_translations(). We want ``sense`` to be assigned
3402 # regardless of recursion level, so it is defined at this level and the
3403 # recursion happens in parse_translation_recurse().
3404 parse_translation_recurse(xlatnode)
3406 def parse_etymology(data: WordData, node: WikiNode) -> None:
3407 """Parses an etymology section."""
3408 assert isinstance(data, dict)
3409 assert isinstance(node, WikiNode)
3411 templates: list[TemplateData] = []
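# For orientation, each entry appended to ``templates`` below has the shape
# {"name": ..., "args": ..., "expansion": ...}; e.g. a derivation template such
# as {{der|en|la|...}} (hypothetical example) would be recorded roughly as
# {"name": "der", "args": {"1": "en", "2": "la", ...}, "expansion": "Latin ..."}.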
3413 # Counter for preventing the capture of etymology templates
3414 # when we are inside templates that we want to ignore (i.e.,
3415 # not capture).
3416 ignore_count = 0
3418 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3419 nonlocal ignore_count
3420 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3421 return ""
3422 if re.match(ignored_etymology_templates_re, name):
3423 ignore_count += 1
3424 return None
3426 # CONTINUE_HERE
3428 def etym_post_template_fn(
3429 name: str, ht: TemplateArgs, expansion: str
3430 ) -> None:
3431 nonlocal ignore_count
3432 if name in wikipedia_templates:
3433 parse_wikipedia_template(wxr, data, ht)
3434 return None
3435 if re.match(ignored_etymology_templates_re, name):
3436 ignore_count -= 1
3437 return None
3438 if ignore_count == 0:
3439 ht = clean_template_args(wxr, ht)
3440 expansion = clean_node(wxr, None, expansion)
3441 templates.append(
3442 {"name": name, "args": ht, "expansion": expansion}
3443 )
3444 return None
3446 # Remove any subsections
3447 contents = list(
3448 x
3449 for x in node.children
3450 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3451 )
3452 # Convert to text, also capturing templates using post_template_fn
3453 text = clean_node(
3454 wxr,
3455 None,
3456 contents,
3457 template_fn=etym_template_fn,
3458 post_template_fn=etym_post_template_fn,
3459 ).strip(": \n") # strip the ":" indentation wikitext used before zh-x templates
3460 # Save the collected information.
3461 if len(text) > 0:
3462 data["etymology_text"] = text
3463 if len(templates) > 0:
3464 # Some etymology templates, like Template:root, do not generate
3465 # text, so they should be added here. Elsewhere, we check
3466 # for Template:root and add some text to the expansion to please
3467 # the validation.
3468 data["etymology_templates"] = templates
3470 for child_node in node.find_child_recursively(
3471 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3472 ):
3473 if child_node.kind in LEVEL_KIND_FLAGS:
3474 break
3475 elif isinstance(
3476 child_node, TemplateNode
3477 ) and child_node.template_name in ["zh-x", "zh-q"]:
3478 if "etymology_examples" not in data:
3479 data["etymology_examples"] = []
3480 data["etymology_examples"].extend(
3481 extract_template_zh_x(
3482 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3483 )
3484 )
3486 def parse_descendants(
3487 data: WordData, node: WikiNode, is_proto_root_derived_section=False
3488 ) -> None:
3489 """Parses a Descendants section. Also used on Derived terms and
3490 Extensions sections when we are dealing with a root of a reconstructed
3491 language (i.e. is_proto_root_derived_section == True), as they use the
3492 same structure. In the latter case, the Wiktionary convention is not to
3493 title the section as descendants since the immediate offspring of the
3494 roots are morphologically derived terms within the same proto-language.
3495 Still, since the rest of the section lists true descendants, we use the
3496 same function. Entries in the descendants list that are technically
3497 derived terms will have a field "tags": ["derived"]."""
3498 assert isinstance(data, dict)
3499 assert isinstance(node, WikiNode)
3500 assert isinstance(is_proto_root_derived_section, bool)
3502 descendants = []
3504 # Most templates that are not in a LIST should be ignored as they only
3505 # add formatting, like "desc-top", "der-top3", etc. Any template in
3506 # unignored_non_list_templates actually contains relevant descendant
3507 # info. E.g. "CJKV" is often the only content in the descendants
3508 # section on many Chinese/Japanese/Korean/Vietnamese pages, but would
3509 # be skipped if we didn't handle it specially as it is not part of a
3510 # LIST, and additionally is in panel_templates. There are probably more
3511 # such templates that should be added to this...
3512 unignored_non_list_templates: list[str] = ["CJKV"]
3514 def process_list_item_children(
3515 sarg: str, children: list[Union[str, WikiNode]]
3516 ) -> None:
3517 assert isinstance(sarg, str)
3518 assert isinstance(children, list)
3519 # The descendants section is a hierarchical bulleted list. sarg is
3520 # usually some number of "*" characters indicating the level of
3521 # indentation of the line, e.g. "***" indicates the line will be
3522 # thrice-indented. A bare ";" is used to indicate a subtitle-like
3523 # line with no indentation. ":" at the end of one or more "*"s is
3524 # used to indicate that the bullet will not be displayed.
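# Rough illustration of the mapping implemented below (hypothetical lines):
#   sarg "*"   -> depth 1
#   sarg "***" -> depth 3
#   sarg "**:" -> depth 2 (bullet hidden in the rendered page)
#   sarg ";"   -> depth 0 (subtitle-like line)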
3525 item_data: DescendantData = {"depth": sarg.count("*")}
3526 templates: list[TemplateData] = []
3527 is_derived = False
3529 # Counter for preventing the capture of templates when we are inside
3530 # templates that we want to ignore (i.e., not capture).
3531 ignore_count = 0
3533 def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3534 nonlocal ignore_count
3535 if (
3536 is_panel_template(wxr, name)
3537 and name not in unignored_non_list_templates
3538 ):
3539 return ""
3540 if re.match(ignored_descendants_templates_re, name):
3541 ignore_count += 1
3542 return None
3544 def desc_post_template_fn(
3545 name: str, ht: TemplateArgs, expansion: str
3546 ) -> None:
3547 nonlocal ignore_count
3548 if name in wikipedia_templates:
3549 parse_wikipedia_template(wxr, data, ht)
3550 return None
3551 if re.match(ignored_descendants_templates_re, name):
3552 ignore_count -= 1
3553 return None
3554 if ignore_count == 0:
3555 ht = clean_template_args(wxr, ht)
3556 nonlocal is_derived
3557 # If we're in a proto-root Derived terms or Extensions
3558 # section, and the current list item has a link template
3559 # to a term in the same proto-language, then we tag this
3560 # descendant entry with "derived"
3561 is_derived = (
3562 is_proto_root_derived_section
3563 and (name == "l" or name == "link")
3564 and ("1" in ht and ht["1"] == lang_code)
3565 )
3566 expansion = clean_node(wxr, None, expansion)
3567 templates.append(
3568 {"name": name, "args": ht, "expansion": expansion}
3569 )
3570 return None
3572 text = clean_node(
3573 wxr,
3574 None,
3575 children,
3576 template_fn=desc_template_fn,
3577 post_template_fn=desc_post_template_fn,
3578 )
3579 item_data["templates"] = templates
3580 item_data["text"] = text
3581 if is_derived:
3582 item_data["tags"] = ["derived"]
3583 descendants.append(item_data)
3585 def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]:
3586 for i, child in enumerate(node.children):
3587 if isinstance(child, WikiNode):
3588 yield (i, child)
3590 def get_sublist_index(list_item: WikiNode) -> Optional[int]:
3591 for i, child in node_children(list_item):
3592 if child.kind == NodeKind.LIST:
3593 return i
3594 return None
3596 def get_descendants(node: WikiNode) -> None:
3597 """Appends the data for every list item in every list in node
3598 to descendants."""
3599 for _, c in node_children(node):
3600 if (
3601 c.kind == NodeKind.TEMPLATE
3602 and c.largs
3603 and len(c.largs[0]) == 1
3604 and isinstance(c.largs[0][0], str)
3605 and c.largs[0][0] in unignored_non_list_templates
3606 ):
3607 # Some Descendants sections have no wikitext list. Rather,
3608 # the list is entirely generated by a single template (see
3609 # e.g. the use of {{CJKV}} in Chinese entries).
3610 process_list_item_children("", [c])
3611 elif c.kind == NodeKind.HTML:
3612 # The Descendants sections for many languages feature
3613 # templates that generate html to add styling (e.g. using
3614 # multiple columns) to the list, so that the actual wikitext
3615 # list items are found within a <div>. We look within the
3616 # children of the html node for the actual list items.
3617 get_descendants(c)
3618 elif c.kind == NodeKind.LIST:
3619 get_descendants(c)
3620 elif c.kind == NodeKind.LIST_ITEM:
3621 # If a LIST_ITEM has subitems in a sublist, usually its
3622 # last child is a LIST. However, sometimes after the LIST
3623 # there are one or more trailing LIST_ITEMs, like "\n" or
3624 # a reference template. If there is a sublist, we discard
3625 # everything after it.
3626 i = get_sublist_index(c)
3627 if i is not None:
3628 process_list_item_children(c.sarg, c.children[:i])
3629 get_descendants(c.children[i]) # type: ignore[arg-type]
3630 else:
3631 process_list_item_children(c.sarg, c.children)
3633 # parse_descendants() actual work starts here
3634 get_descendants(node)
3636 # E.g. on a PIE page there may be both Derived terms and Extensions
3637 # sections, in which case this function will be called multiple times,
3638 # so we have to check if descendants exists first.
3639 if "descendants" in data: 3639 ↛ 3640line 3639 didn't jump to line 3640 because the condition on line 3639 was never true
3640 data["descendants"].extend(descendants)
3641 else:
3642 data["descendants"] = descendants
3644 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3645 """This recurses into a subtree in the parse tree for a page."""
3646 nonlocal etym_data
3647 nonlocal pos_data
3648 nonlocal inside_level_four
3650 redirect_list: list[str] = [] # for `zh-see` template
3652 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3653 """This is called for otherwise unprocessed parts of the page.
3654 We still expand them so that e.g. Category links get captured."""
3655 if name in wikipedia_templates:
3656 data = select_data()
3657 parse_wikipedia_template(wxr, data, ht)
3658 return None
3659 if is_panel_template(wxr, name):
3660 return ""
3661 return None
3663 for node in treenode.children:
3664 if not isinstance(node, WikiNode):
3665 # print(" X{}".format(repr(node)[:40]))
3666 continue
3667 if isinstance(node, TemplateNode):
3668 if process_soft_redirect_template(wxr, node, redirect_list):
3669 continue
3670 elif node.template_name == "zh-forms":
3671 extract_zh_forms_template(wxr, node, select_data())
3673 if node.kind not in LEVEL_KINDS:
3674 # XXX handle e.g. wikipedia links at the top of a language
3675 # XXX should at least capture "also" at top of page
3676 if node.kind in (
3677 NodeKind.HLINE,
3678 NodeKind.LIST,
3679 NodeKind.LIST_ITEM,
3680 ):
3681 continue
3682 # print(" UNEXPECTED: {}".format(node))
3683 # Clean the node to collect category links
3684 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3685 continue
3686 t = clean_node(
3687 wxr, etym_data, node.sarg if node.sarg else node.largs
3688 )
3689 t = t.lower()
3690 # XXX these counts were never implemented fully, and even this
3691 # gets discarded: Search STATISTICS_IMPLEMENTATION
3692 wxr.config.section_counts[t] += 1
3693 # print("PROCESS_CHILDREN: T:", repr(t))
3694 if t in IGNORED_TITLES:
3695 pass
3696 elif t.startswith(PRONUNCIATION_TITLE):
3697 # Chinese Pronunciation section kludge; we demote these to
3698 # be level 4 instead of 3 so that they're part of a larger
3699 # etymology hierarchy; usually the data here is empty and
3700 # acts as an intermediate layer between POS and Etymology data
3701 inside_level_four = True
3702 if t.startswith(PRONUNCIATION_TITLE + " "):
3703 # Pronunciation 1, etc., are used in Chinese Glyphs,
3704 # and each of them may have senses under Definition
3705 push_level_four_section(True)
3706 wxr.wtp.start_subsection(None)
3707 if wxr.config.capture_pronunciation:
3708 data = select_data()
3709 parse_pronunciation(
3710 wxr,
3711 node,
3712 data,
3713 etym_data,
3714 have_etym,
3715 base_data,
3716 lang_code,
3717 )
3718 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3719 push_etym()
3720 wxr.wtp.start_subsection(None)
3721 if wxr.config.capture_etymologies:
3722 m = re.search(r"\s(\d+)$", t)
3723 if m:
3724 etym_data["etymology_number"] = int(m.group(1))
3725 parse_etymology(etym_data, node)
3726 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3727 data = select_data()
3728 parse_descendants(data, node)
3729 elif (
3730 t in PROTO_ROOT_DERIVED_TITLES
3731 and pos == "root"
3732 and is_reconstruction
3733 and wxr.config.capture_descendants
3734 ):
3735 data = select_data()
3736 parse_descendants(data, node, True)
3737 elif t == TRANSLATIONS_TITLE:
3738 data = select_data()
3739 parse_translations(data, node)
3740 elif t in INFLECTION_TITLES:
3741 parse_inflection(node, t, pos)
3742 elif t == "alternative forms":
3743 extract_alt_form_section(wxr, select_data(), node)
3744 else:
3745 lst = t.split()
3746 while len(lst) > 1 and lst[-1].isdigit():
3747 lst = lst[:-1]
3748 t_no_number = " ".join(lst).lower()
3749 if t_no_number in POS_TITLES:
3750 push_pos()
3751 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3752 pos = dt["pos"] or "MISSING_POS"
3753 wxr.wtp.start_subsection(t)
3754 if "debug" in dt:
3755 wxr.wtp.debug(
3756 "{} in section {}".format(dt["debug"], t),
3757 sortid="page/2755",
3758 )
3759 if "warning" in dt: 3759 ↛ 3760line 3759 didn't jump to line 3760 because the condition on line 3759 was never true
3760 wxr.wtp.warning(
3761 "{} in section {}".format(dt["warning"], t),
3762 sortid="page/2759",
3763 )
3764 if "error" in dt: 3764 ↛ 3765line 3764 didn't jump to line 3765 because the condition on line 3764 was never true
3765 wxr.wtp.error(
3766 "{} in section {}".format(dt["error"], t),
3767 sortid="page/2763",
3768 )
3769 # Parse word senses for the part-of-speech
3770 parse_part_of_speech(node, pos)
3771 if "tags" in dt:
3772 for pdata in pos_datas:
3773 data_extend(pdata, "tags", dt["tags"])
3774 elif t_no_number in LINKAGE_TITLES:
3775 # print(f"LINKAGE_TITLES NODE {node=}")
3776 rel = LINKAGE_TITLES[t_no_number]
3777 data = select_data()
3778 parse_linkage(data, rel, node)
3779 elif t_no_number == COMPOUNDS_TITLE:
3780 data = select_data()
3781 if wxr.config.capture_compounds:
3782 parse_linkage(data, "derived", node)
3784 # XXX parse interesting templates also from other sections. E.g.,
3785 # {{Letter|...}} in ===See also===
3786 # Also <gallery>
3788 # Recurse to children of this node, processing subtitles therein
3789 stack.append(t)
3790 process_children(node, pos)
3791 stack.pop()
3793 if len(redirect_list) > 0:
3794 if len(pos_data) > 0:
3795 pos_data["redirects"] = redirect_list
3796 if "pos" not in pos_data: 3796 ↛ 3797line 3796 didn't jump to line 3797 because the condition on line 3796 was never true
3797 pos_data["pos"] = "soft-redirect"
3798 else:
3799 new_page_data = copy.deepcopy(base_data)
3800 new_page_data["redirects"] = redirect_list
3801 if "pos" not in new_page_data: 3801 ↛ 3803line 3801 didn't jump to line 3803 because the condition on line 3801 was always true
3802 new_page_data["pos"] = "soft-redirect"
3803 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3804 page_datas.append(new_page_data)
3806 def extract_examples(
3807 others: list[WikiNode], sense_base: SenseData
3808 ) -> list[ExampleData]:
3809 """Parses through a list of definitions and quotes to find examples.
3810 Returns a list of example dicts to be added to sense data. Adds
3811 meta-data, mostly categories, into sense_base."""
3812 assert isinstance(others, list)
3813 examples: list[ExampleData] = []
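# For orientation: each dict appended to ``examples`` below may carry the keys
# "text", "ref", "english", "type" ("example" or "quotation"), "note", "roman"
# and "ruby", as assembled near the end of the loop body.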
3815 for sub in others:
3816 if not sub.sarg.endswith((":", "*")):
3817 continue
3818 for item in sub.children:
3819 if not isinstance(item, WikiNode):
3820 continue
3821 if item.kind != NodeKind.LIST_ITEM:
3822 continue
3823 usex_type = None
3824 example_template_args = []
3825 example_template_names = []
3826 taxons = set()
3828 # Bypass this function when parsing Chinese, Japanese and
3829 # quotation templates.
3830 new_example_lists = extract_example_list_item(
3831 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3832 )
3833 if len(new_example_lists) > 0:
3834 examples.extend(new_example_lists)
3835 continue
3837 def usex_template_fn(
3838 name: str, ht: TemplateArgs
3839 ) -> Optional[str]:
3840 nonlocal usex_type
3841 if is_panel_template(wxr, name):
3842 return ""
3843 if name in usex_templates:
3844 usex_type = "example"
3845 example_template_args.append(ht)
3846 example_template_names.append(name)
3847 elif name in quotation_templates:
3848 usex_type = "quotation"
3849 elif name in taxonomy_templates:
3850 taxons.update(ht.get(1, "").split())
3851 for prefix in template_linkages_to_ignore_in_examples:
3852 if re.search(
3853 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3854 ):
3855 return ""
3856 return None
3858 # bookmark
3859 ruby: list[tuple[str, str]] = []
3860 contents = item.children
3861 if lang_code == "ja":
3862 # Capture ruby contents if this is a Japanese language
3863 # example.
3864 # print(contents)
3865 if (
3866 contents
3867 and isinstance(contents[0], str)
3868 and re.match(r"\s*$", contents[0])
3869 ):
3870 contents = contents[1:]
3871 exp = wxr.wtp.parse(
3872 wxr.wtp.node_to_wikitext(contents),
3873 # post_template_fn=head_post_template_fn,
3874 expand_all=True,
3875 )
3876 rub, rest = extract_ruby(wxr, exp.children)
3877 if rub:
3878 for rtup in rub:
3879 ruby.append(rtup)
3880 contents = rest
3881 subtext = clean_node(
3882 wxr, sense_base, contents, template_fn=usex_template_fn
3883 )
3885 frozen_taxons = frozenset(taxons)
3886 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3888 # print(f"{subtext=}")
3889 subtext = re.sub(
3890 r"\s*\(please add an English "
3891 r"translation of this "
3892 r"(example|usage example|quote)\)",
3893 "",
3894 subtext,
3895 ).strip()
3896 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3897 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3898 # print("subtext:", repr(subtext))
3900 lines = subtext.splitlines()
3901 # print(lines)
3903 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3904 lines = list(
3905 x
3906 for x in lines
3907 if not re.match(
3908 r"(Synonyms: |Antonyms: |Hyponyms: |"
3909 r"Synonym: |Antonym: |Hyponym: |"
3910 r"Hypernyms: |Derived terms: |"
3911 r"Related terms: |"
3912 r"Hypernym: |Derived term: |"
3913 r"Coordinate terms:|"
3914 r"Related term: |"
3915 r"For more quotations using )",
3916 x,
3917 )
3918 )
3919 tr = ""
3920 ref = ""
3921 roman = ""
3922 # for line in lines:
3923 # print("LINE:", repr(line))
3924 # print(classify_desc(line))
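# Hedged illustration of the single-line case handled below: a non-English
# usage example rendered as "<foreign text> ― <English translation>" is split
# by example_splitter_re (presumably on the "―"/"—" dashes), so e.g. a
# hypothetical "hyvä koira ― good dog" would yield text "hyvä koira" and
# translation "good dog".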
3925 if len(lines) == 1 and lang_code != "en":
3926 parts = example_splitter_re.split(lines[0])
3927 if (
3928 len(parts) > 2
3929 and len(example_template_args) == 1
3930 and any(
3931 ("―" in s) or ("—" in s)
3932 for s in example_template_args[0].values()
3933 )
3934 ):
3935 if nparts := synch_splits_with_args(
3936 lines[0], example_template_args[0]
3937 ):
3938 parts = nparts
3939 if (
3940 len(example_template_args) == 1
3941 and "lit" in example_template_args[0]
3942 ):
3943 # ugly brute-force kludge in case there's a lit= arg
3944 literally = example_template_args[0].get("lit", "")
3945 if literally:
3946 literally = (
3947 " (literally, “"
3948 + clean_value(wxr, literally)
3949 + "”)"
3950 )
3951 else:
3952 literally = ""
3953 if (
3954 len(example_template_args) == 1
3955 and len(parts) == 2
3956 and len(example_template_args[0])
3957 - (
3958 # horrible kludge to ignore these arguments
3959 # when calculating how many there are
3960 sum(
3961 s in example_template_args[0]
3962 for s in (
3963 "lit", # generates text, but we handle it
3964 "inline",
3965 "noenum",
3966 "nocat",
3967 "sort",
3968 )
3969 )
3970 )
3971 == 3
3972 and clean_value(
3973 wxr, example_template_args[0].get(2, "")
3974 )
3975 == parts[0].strip()
3976 and clean_value(
3977 wxr,
3978 (
3979 example_template_args[0].get(3)
3980 or example_template_args[0].get("translation")
3981 or example_template_args[0].get("t", "")
3982 )
3983 + literally, # in case there's a lit= argument
3984 )
3985 == parts[1].strip()
3986 ):
3987 # {{exampletemplate|ex|Foo bar baz|English translation}}
3988 # is a pretty reliable 'heuristic', so we use it here
3989 # before the others. To be extra sure the template
3990 # doesn't do anything weird, we compare the arguments
3991 # and the output to each other.
3992 lines = [parts[0].strip()]
3993 tr = parts[1].strip()
3994 elif (
3995 len(parts) == 2
3996 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3997 ):
3998 # These other branches just do some simple heuristics w/
3999 # the expanded output of the template (if applicable).
4000 lines = [parts[0].strip()]
4001 tr = parts[1].strip()
4002 elif (
4003 len(parts) == 3
4004 and classify_desc2(parts[1])
4005 in ("romanization", "english")
4006 and classify_desc2(parts[2]) in ENGLISH_TEXTS
4007 ):
4008 lines = [parts[0].strip()]
4009 roman = parts[1].strip()
4010 tr = parts[2].strip()
4011 else:
4012 parts = re.split(r"\s+-\s+", lines[0])
4013 if (
4014 len(parts) == 2
4015 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4016 ):
4017 lines = [parts[0].strip()]
4018 tr = parts[1].strip()
4019 elif len(lines) > 1:
4020 if any(
4021 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
4022 ) and not (len(example_template_names) == 1):
4023 refs: list[str] = []
4024 for i in range(len(lines)):
4025 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
4026 break
4027 refs.append(lines[i].strip())
4028 if re.search(r"[]\d:)]\s*$", lines[i]):
4029 break
4030 ref = " ".join(refs)
4031 lines = lines[i + 1 :]
4032 if (
4033 lang_code != "en"
4034 and len(lines) >= 2
4035 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
4036 ):
4037 i = len(lines) - 1
4038 while (
4039 i > 1
4040 and classify_desc2(lines[i - 1])
4041 in ENGLISH_TEXTS
4042 ):
4043 i -= 1
4044 tr = "\n".join(lines[i:])
4045 lines = lines[:i]
4046 if len(lines) >= 2:
4047 if classify_desc2(lines[-1]) == "romanization":
4048 roman = lines[-1].strip()
4049 lines = lines[:-1]
4051 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
4052 ref = lines[0]
4053 lines = lines[1:]
4054 elif lang_code != "en" and len(lines) == 2:
4055 cls1 = classify_desc2(lines[0])
4056 cls2 = classify_desc2(lines[1])
4057 if cls2 in ENGLISH_TEXTS and cls1 != "english":
4058 tr = lines[1]
4059 lines = [lines[0]]
4060 elif cls1 in ENGLISH_TEXTS and cls2 != "english":
4061 tr = lines[0]
4062 lines = [lines[1]]
4063 elif (
4064 re.match(r"^[#*]*:+", lines[1])
4065 and classify_desc2(
4066 re.sub(r"^[#*:]+\s*", "", lines[1])
4067 )
4068 in ENGLISH_TEXTS
4069 ):
4070 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
4071 lines = [lines[0]]
4072 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
4073 # Both were classified as English, but
4074 # presumably one is not. Assume first is
4075 # non-English, as that seems more common.
4076 tr = lines[1]
4077 lines = [lines[0]]
4078 elif (
4079 usex_type != "quotation"
4080 and lang_code != "en"
4081 and len(lines) == 3
4082 ):
4083 cls1 = classify_desc2(lines[0])
4084 cls2 = classify_desc2(lines[1])
4085 cls3 = classify_desc2(lines[2])
4086 if (
4087 cls3 == "english"
4088 and cls2 in ("english", "romanization")
4089 and cls1 != "english"
4090 ):
4091 tr = lines[2].strip()
4092 roman = lines[1].strip()
4093 lines = [lines[0].strip()]
4094 elif (
4095 usex_type == "quotation"
4096 and lang_code != "en"
4097 and len(lines) > 2
4098 ):
4099 # for x in lines:
4100 # print(" LINE: {}: {}"
4101 # .format(classify_desc2(x), x))
4102 if re.match(r"^[#*]*:+\s*$", lines[1]):
4103 ref = lines[0]
4104 lines = lines[2:]
4105 cls1 = classify_desc2(lines[-1])
4106 if cls1 == "english":
4107 i = len(lines) - 1
4108 while (
4109 i > 1
4110 and classify_desc2(lines[i - 1])
4111 in ENGLISH_TEXTS
4112 ):
4113 i -= 1
4114 tr = "\n".join(lines[i:])
4115 lines = lines[:i]
4117 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
4118 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
4119 tr = re.sub(r"^[#*:]+\s*", "", tr)
4120 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
4121 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
4122 ref = re.sub(r"^[#*:]+\s*", "", ref)
4123 ref = re.sub(
4124 r", (volume |number |page )?“?"
4125 r"\(please specify ([^)]|\(s\))*\)”?|"
4126 ", text here$",
4127 "",
4128 ref,
4129 )
4130 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
4131 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
4132 subtext = "\n".join(x for x in lines if x)
4133 if not tr and lang_code != "en":
4134 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
4135 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
4136 tr = m.group(2)
4137 subtext = subtext[: m.start()] + m.group(1)
4138 elif lines:
4139 parts = re.split(r"\s*[―—]+\s*", lines[0])
4140 if (
4141 len(parts) == 2
4142 and classify_desc2(parts[1]) in ENGLISH_TEXTS
4143 ):
4144 subtext = parts[0].strip()
4145 tr = parts[1].strip()
4146 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
4147 subtext = re.sub(
4148 r"(please add an English translation of "
4149 r"this (quote|usage example))",
4150 "",
4151 subtext,
4152 )
4153 subtext = re.sub(
4154 r"\s*→New International Version " "translation$",
4155 "",
4156 subtext,
4157 ) # e.g. pis/Tok Pisin (Bible)
4158 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
4159 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
4160 note = None
4161 m = re.match(r"^\(([^)]*)\):\s+", subtext)
4162 if (
4163 m is not None
4164 and lang_code != "en"
4165 and (
4166 m.group(1).startswith("with ")
4167 or classify_desc2(m.group(1)) == "english"
4168 )
4169 ):
4170 note = m.group(1)
4171 subtext = subtext[m.end() :]
4172 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
4173 ref = re.sub(r",\s*→ISBN", "", ref)
4174 ref = ref.strip()
4175 if ref.endswith(":") or ref.endswith(","):
4176 ref = ref[:-1].strip()
4177 ref = re.sub(r"\s+,\s+", ", ", ref)
4178 ref = re.sub(r"\s+", " ", ref)
4179 if ref and not subtext:
4180 subtext = ref
4181 ref = ""
4182 if subtext:
4183 dt: ExampleData = {"text": subtext}
4184 if ref:
4185 dt["ref"] = ref
4186 if tr:
4187 dt["english"] = tr
4188 if usex_type:
4189 dt["type"] = usex_type
4190 if note:
4191 dt["note"] = note
4192 if roman:
4193 dt["roman"] = roman
4194 if ruby:
4195 dt["ruby"] = ruby
4196 examples.append(dt)
4198 return examples
4200 # Main code of parse_language()
4201 # Process the section
4202 stack.append(language)
4203 process_children(langnode, None)
4204 stack.pop()
4206 # Finalize word entries
4207 push_etym()
4208 ret = []
4209 for data in page_datas:
4210 merge_base(data, base_data)
4211 ret.append(data)
4213 # Copy all tags to word senses
4214 for data in ret:
4215 if "senses" not in data: 4215 ↛ 4216line 4215 didn't jump to line 4216 because the condition on line 4215 was never true
4216 continue
4217 # WordData should not have a 'tags' field, but if it does, it is
4218 # deleted and its contents are moved into each sense;
4219 # that's why the type ignores below.
4220 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4221 if "tags" in data: 4221 ↛ 4222line 4221 didn't jump to line 4222 because the condition on line 4221 was never true
4222 del data["tags"] # type: ignore[typeddict-item]
4223 for sense in data["senses"]:
4224 data_extend(sense, "tags", tags)
4226 return ret
4229def parse_wikipedia_template(
4230 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
4231) -> None:
4232 """Helper function for parsing {{wikipedia|...}} and related templates."""
4233 assert isinstance(wxr, WiktextractContext)
4234 assert isinstance(data, dict)
4235 assert isinstance(ht, dict)
4236 langid = clean_node(wxr, data, ht.get("lang", ()))
4237 pagename = (
4238 clean_node(wxr, data, ht.get(1, ()))
4239 or wxr.wtp.title
4240 or "MISSING_PAGE_TITLE"
4241 )
4242 if langid:
4243 data_append(data, "wikipedia", langid + ":" + pagename)
4244 else:
4245 data_append(data, "wikipedia", pagename)
4248def parse_top_template(
4249 wxr: WiktextractContext, node: WikiNode, data: WordData
4250) -> None:
4251 """Parses a template that occurs on the top-level in a page, before any
4252 language subtitles."""
4253 assert isinstance(wxr, WiktextractContext)
4254 assert isinstance(node, WikiNode)
4255 assert isinstance(data, dict)
4257 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
4258 if name in wikipedia_templates:
4259 parse_wikipedia_template(wxr, data, ht)
4260 return None
4261 if is_panel_template(wxr, name):
4262 return ""
4263 if name in ("reconstruction",):
4264 return ""
4265 if name.lower() == "also" or name.lower().startswith("also/"):
4266 # XXX shows related words that might really have been the intended
4267 # word, capture them
4268 return ""
4269 if name == "see also":
4270 # XXX capture
4271 return ""
4272 if name == "cardinalbox":
4273 # XXX capture
4274 return ""
4275 if name == "character info":
4276 # XXX capture
4277 return ""
4278 if name == "commonscat":
4279 # XXX capture link to Wikimedia commons
4280 return ""
4281 if name == "wrongtitle":
4282 # XXX this should be captured to replace page title with the
4283 # correct title. E.g. ⿰亻革家
4284 return ""
4285 if name == "wikidata":
4286 arg = clean_node(wxr, data, ht.get(1, ()))
4287 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
4288 data_append(data, "wikidata", arg)
4289 return ""
4290 wxr.wtp.debug(
4291 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
4292 sortid="page/2870",
4293 )
4294 return ""
4296 clean_node(wxr, None, [node], template_fn=top_template_fn)
4299def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
4300 """Fix subtitle hierarchy to be strict Language -> Etymology ->
4301 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
4302 that are next to each other."""
4304 # Wiktextract issue #620, Chinese Glyph Origin before an etymology
4305 # section gets overwritten. In this case, let's just combine the two.
4307 # In Chinese entries, Pronunciation can be preceded on the
4308 # same level 3 by its Etymology *and* Glyph Origin sections:
4309 # ===Glyph Origin===
4310 # ===Etymology===
4311 # ===Pronunciation===
4312 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
4313 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
4314 # are now level 6
4316 # Known lowercase PoS section titles are in POS_TITLES
4317 # Known lowercase linkage section titles are in LINKAGE_TITLES
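# Sketch of the resulting hierarchy (headings illustrative, not from any
# particular page):
#   ==English==               level 2 (language)
#   ===Etymology 1===         level 3
#   ====Pronunciation====     level 4
#   =====Noun=====            level 5
#   ======Translations======  level 6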
4319 old = re.split(
4320 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
4321 )
4323 parts = []
4324 npar = 4 # Number of parentheses in above expression
4325 parts.append(old[0])
4326 prev_level = None
4327 level = None
4328 skip_level_title = False # When combining etymology sections
4329 for i in range(1, len(old), npar + 1):
4330 left = old[i]
4331 right = old[i + npar - 1]
4332 # remove Wikilinks in title
4333 title = re.sub(r"^\[\[", "", old[i + 1])
4334 title = re.sub(r"\]\]$", "", title)
4335 prev_level = level
4336 level = len(left)
4337 part = old[i + npar]
4338 if level != len(right):
4339 wxr.wtp.debug(
4340 "subtitle has unbalanced levels: "
4341 "{!r} has {} on the left and {} on the right".format(
4342 title, left, right
4343 ),
4344 sortid="page/2904",
4345 )
4346 lc = title.lower()
4347 if name_to_code(title, "en") != "":
4348 if level > 2:
4349 wxr.wtp.debug(
4350 "subtitle has language name {} at level {}".format(
4351 title, level
4352 ),
4353 sortid="page/2911",
4354 )
4355 level = 2
4356 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
4357 if level > 3:
4358 wxr.wtp.debug(
4359 "etymology section {} at level {}".format(title, level),
4360 sortid="page/2917",
4361 )
4362 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
4363 # sections cheek-to-cheek
4364 skip_level_title = True
4365 # Modify the title of previous ("Glyph Origin") section, in
4366 # case we have a meaningful title like "Etymology 1"
4367 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
4368 level = 3
4369 elif lc.startswith(PRONUNCIATION_TITLE):
4370 # Pronunciation is now a level between POS and Etymology, so
4371 # we need to shift everything down by one
4372 level = 4
4373 elif lc in POS_TITLES:
4374 level = 5
4375 elif lc == TRANSLATIONS_TITLE:
4376 level = 6
4377 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
4378 level = 6
4379 elif lc in INFLECTION_TITLES:
4380 level = 6
4381 elif lc == DESCENDANTS_TITLE:
4382 level = 6
4383 elif title in PROTO_ROOT_DERIVED_TITLES:
4384 level = 6
4385 elif lc in IGNORED_TITLES:
4386 level = 6
4387 else:
4388 level = 6
4389 if skip_level_title:
4390 skip_level_title = False
4391 parts.append(part)
4392 else:
4393 parts.append("{}{}{}".format("=" * level, title, "=" * level))
4394 parts.append(part)
4395 # print("=" * level, title)
4396 # if level != len(left):
4397 # print(" FIXED LEVEL OF {} {} -> {}"
4398 # .format(title, len(left), level))
4400 text = "".join(parts)
4401 # print(text)
4402 return text
4405def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4406 # Skip translation pages
4407 if word.endswith("/" + TRANSLATIONS_TITLE):
4408 return []
4410 if wxr.config.verbose:
4411 logger.info(f"Parsing page: {word}")
4413 wxr.config.word = word
4414 wxr.wtp.start_page(word)
4416 # Remove <noinclude> and similar tags from main pages. They
4417 # should not appear there, but at least net/Elfdala has one and it
4418 # is probably not the only one.
4419 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4420 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4421 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4423 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4424 # pages that have, for example, Translations section under Linkage, or
4425 # Translations section on the same level as Noun. Enforce a proper
4426 # hierarchy by manipulating the subtitle levels in certain cases.
4427 text = fix_subtitle_hierarchy(wxr, text)
4429 # Parse the page, pre-expanding those templates that are likely to
4430 # influence parsing
4431 tree = wxr.wtp.parse(
4432 text,
4433 pre_expand=True,
4434 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4435 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4436 )
4437 # from wikitextprocessor.parser import print_tree
4438 # print("PAGE PARSE:", print_tree(tree))
4440 top_data: WordData = {}
4442 # Iterate over top-level titles, which should be languages for normal
4443 # pages
4444 by_lang = defaultdict(list)
4445 for langnode in tree.children:
4446 if not isinstance(langnode, WikiNode):
4447 continue
4448 if langnode.kind == NodeKind.TEMPLATE:
4449 parse_top_template(wxr, langnode, top_data)
4450 continue
4451 if langnode.kind == NodeKind.LINK:
4452 # Some pages have links at top level, e.g., "trees" in Wiktionary
4453 continue
4454 if langnode.kind != NodeKind.LEVEL2:
4455 wxr.wtp.debug(
4456 f"unexpected top-level node: {langnode}", sortid="page/3014"
4457 )
4458 continue
4459 lang = clean_node(
4460 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4461 )
4462 lang_code = name_to_code(lang, "en")
4463 if lang_code == "":
4464 wxr.wtp.debug(
4465 f"unrecognized language name: {lang}", sortid="page/3019"
4466 )
4467 if (
4468 wxr.config.capture_language_codes
4469 and lang_code not in wxr.config.capture_language_codes
4470 ):
4471 continue
4472 wxr.wtp.start_section(lang)
4474 # Collect all words from the page.
4475 # print(f"{langnode=}")
4476 datas = parse_language(wxr, langnode, lang, lang_code)
4478 # Propagate fields resulting from top-level templates to this
4479 # part-of-speech.
4480 for data in datas:
4481 if "lang" not in data: 4481 ↛ 4482line 4481 didn't jump to line 4482 because the condition on line 4481 was never true
4482 wxr.wtp.debug(
4483 "internal error -- no lang in data: {}".format(data),
4484 sortid="page/3034",
4485 )
4486 continue
4487 for k, v in top_data.items():
4488 assert isinstance(v, (list, tuple))
4489 data_extend(data, k, v)
4490 by_lang[data["lang"]].append(data)
4492 # XXX this code is clearly out of date. There is no longer a "conjugation"
4493 # field. FIX OR REMOVE.
4494 # Do some post-processing on the words. For example, we may distribute
4495 # conjugation information to all the words.
4496 ret = []
4497 for lang, lang_datas in by_lang.items():
4498 ret.extend(lang_datas)
4500 for x in ret:
4501 if x["word"] != word:
4502 if word.startswith("Unsupported titles/"):
4503 wxr.wtp.debug(
4504 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4505 sortid="20231101/3578page.py",
4506 )
4507 else:
4508 wxr.wtp.debug(
4509 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4510 sortid="20231101/3582page.py",
4511 )
4512 x["original_title"] = word
4513 # validate tag data
4514 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4515 return ret
4518def recursively_separate_raw_tags(
4519 wxr: WiktextractContext, data: dict[str, Any]
4520) -> None:
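# Hedged example of the split performed below: assuming "plural" is in
# valid_tags and "made-up-tag" is not, {"tags": ["plural", "made-up-tag"]}
# becomes {"tags": ["plural"], "raw_tags": ["made-up-tag"]}.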
4521 if not isinstance(data, dict):
4522 wxr.wtp.error(
4523 "'data' is not dict; most probably "
4524 "data has a list that contains at least one dict and "
4525 "at least one non-dict item",
4526 sortid="en/page-4016/20240419",
4527 )
4528 return
4529 new_tags: list[str] = []
4530 raw_tags: list[str] = data.get("raw_tags", [])
4531 for field, val in data.items():
4532 if field == "tags":
4533 for tag in val:
4534 if tag not in valid_tags:
4535 raw_tags.append(tag)
4536 else:
4537 new_tags.append(tag)
4538 if isinstance(val, list):
4539 if len(val) > 0 and isinstance(val[0], dict):
4540 for d in val:
4541 recursively_separate_raw_tags(wxr, d)
4542 if "tags" in data and not new_tags:
4543 del data["tags"]
4544 elif new_tags:
4545 data["tags"] = new_tags
4546 if raw_tags:
4547 data["raw_tags"] = raw_tags
4550def process_soft_redirect_template(
4551 wxr: WiktextractContext,
4552 template_node: TemplateNode,
4553 redirect_pages: list[str],
4554) -> bool:
4555 # Return `True` if the template is a soft-redirect template.
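# Illustrative (hypothetical argument): {{zh-see|...}} appends its first
# positional parameter to redirect_pages and returns True; {{ja-see|...}} and
# {{ja-see-kango|...}} collect every positional parameter the same way.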
4556 if template_node.template_name == "zh-see":
4557 # https://en.wiktionary.org/wiki/Template:zh-see
4558 title = clean_node(
4559 wxr, None, template_node.template_parameters.get(1, "")
4560 )
4561 if title != "":
4562 redirect_pages.append(title)
4563 return True
4564 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4565 # https://en.wiktionary.org/wiki/Template:ja-see
4566 for key, value in template_node.template_parameters.items():
4567 if isinstance(key, int):
4568 title = clean_node(wxr, None, value)
4569 if title != "":
4570 redirect_pages.append(title)
4571 return True
4572 return False
4575ZH_FORMS_TAGS = {
4576 "trad.": "Traditional-Chinese",
4577 "simp.": "Simplified-Chinese",
4578 "alternative forms": "alternative",
4579}
4582def extract_zh_forms_template(
4583 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4584):
4585 # https://en.wiktionary.org/wiki/Template:zh-forms
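# For orientation: data cells of the expanded zh-forms table become "forms"
# entries below, tagged via ZH_FORMS_TAGS / the span's lang attribute, e.g. a
# hypothetical {"form": "...", "tags": ["Traditional-Chinese"]}; a lit=
# argument becomes base_data["literal_meaning"].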
4586 lit_meaning = clean_node(
4587 wxr, None, t_node.template_parameters.get("lit", "")
4588 )
4589 if lit_meaning != "":
4590 base_data["literal_meaning"] = lit_meaning
4591 expanded_node = wxr.wtp.parse(
4592 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4593 )
4594 for table in expanded_node.find_child(NodeKind.TABLE):
4595 for row in table.find_child(NodeKind.TABLE_ROW):
4596 row_header = ""
4597 row_header_tags = []
4598 header_has_span = False
4599 for cell in row.find_child(
4600 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4601 ):
4602 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4603 row_header, row_header_tags, header_has_span = (
4604 extract_zh_forms_header_cell(wxr, base_data, cell)
4605 )
4606 elif not header_has_span:
4607 extract_zh_forms_data_cell(
4608 wxr, base_data, cell, row_header, row_header_tags
4609 )
4611 if "forms" in base_data and len(base_data["forms"]) == 0: 4611 ↛ 4612line 4611 didn't jump to line 4612 because the condition on line 4611 was never true
4612 del base_data["forms"]
4615def extract_zh_forms_header_cell(
4616 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4617) -> tuple[str, list[str], bool]:
4618 row_header = ""
4619 row_header_tags = []
4620 header_has_span = False
4621 first_span_index = len(header_cell.children)
4622 for index, span_tag in header_cell.find_html("span", with_index=True):
4623 if index < first_span_index:
4624 first_span_index = index
4625 header_has_span = True
4626 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4627 for raw_tag in row_header.split(" and "):
4628 raw_tag = raw_tag.strip()
4629 if raw_tag != "":
4630 row_header_tags.append(raw_tag)
4631 for span_tag in header_cell.find_html_recursively("span"):
4632 span_lang = span_tag.attrs.get("lang", "")
4633 form_nodes = []
4634 sup_title = ""
4635 for node in span_tag.children:
4636 if isinstance(node, HTMLNode) and node.tag == "sup":
4637 for sup_span in node.find_html("span"):
4638 sup_title = sup_span.attrs.get("title", "")
4639 else:
4640 form_nodes.append(node)
4641 if span_lang in ["zh-Hant", "zh-Hans"]:
4642 for word in clean_node(wxr, None, form_nodes).split("/"):
4643 if word not in [wxr.wtp.title, ""]:
4644 form = {"form": word}
4645 for raw_tag in row_header_tags:
4646 if raw_tag in ZH_FORMS_TAGS:
4647 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4648 else:
4649 data_append(form, "raw_tags", raw_tag)
4650 if sup_title != "":
4651 data_append(form, "raw_tags", sup_title)
4652 data_append(base_data, "forms", form)
4653 return row_header, row_header_tags, header_has_span
4656def extract_zh_forms_data_cell(
4657 wxr: WiktextractContext,
4658 base_data: WordData,
4659 cell: WikiNode,
4660 row_header: str,
4661 row_header_tags: list[str],
4662):
4663 from .zh_pron_tags import ZH_PRON_TAGS
4665 for top_span_tag in cell.find_html("span"):
4666 forms = []
4667 for span_tag in top_span_tag.find_html("span"):
4668 span_lang = span_tag.attrs.get("lang", "")
4669 if span_lang in ["zh-Hant", "zh-Hans", "zh"]:
4670 word = clean_node(wxr, None, span_tag)
4671 if word not in ["", "/", wxr.wtp.title]:
4672 form = {"form": word}
4673 if row_header != "anagram":
4674 for raw_tag in row_header_tags:
4675 if raw_tag in ZH_FORMS_TAGS:
4676 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4677 else:
4678 data_append(form, "raw_tags", raw_tag)
4679 if span_lang == "zh-Hant":
4680 data_append(form, "tags", "Traditional-Chinese")
4681 elif span_lang == "zh-Hans":
4682 data_append(form, "tags", "Simplified-Chinese")
4683 forms.append(form)
4684 elif "font-size:80%" in span_tag.attrs.get("style", ""): 4684 ↛ 4667line 4684 didn't jump to line 4667 because the condition on line 4684 was always true
4685 raw_tag = clean_node(wxr, None, span_tag)
4686 if raw_tag != "":
4687 for form in forms:
4688 if raw_tag in ZH_PRON_TAGS:
4689 tr_tag = ZH_PRON_TAGS[raw_tag]
4690 if isinstance(tr_tag, list):
4691 data_extend(form, "tags", tr_tag)
4692 elif isinstance(tr_tag, str):
4693 data_append(form, "tags", tr_tag)
4694 elif raw_tag in valid_tags:
4695 data_append(form, "tags", raw_tag)
4696 else:
4697 data_append(form, "raw_tags", raw_tag)
4699 if row_header == "anagram":
4700 for form in forms:
4701 l_data = {"word": form["form"]}
4702 for key in ["tags", "raw_tags"]:
4703 if key in form:
4704 l_data[key] = form[key]
4705 data_append(base_data, "anagrams", l_data)
4706 else:
4707 data_extend(base_data, "forms", forms)