Coverage for src/wiktextract/extractor/en/page.py: 78%
1818 statements
coverage.py v7.12.0, created at 2025-12-05 07:46 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Literal,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .descendant import extract_descendant_section
51from .example import extract_example_list_item, extract_template_zh_x
52from .form_descriptions import (
53 classify_desc,
54 decode_tags,
55 distw,
56 parse_alt_or_inflection_of,
57 parse_sense_qualifier,
58 parse_word_head,
59)
60from .inflection import TableContext, parse_inflection_section
61from .info_templates import (
62 INFO_TEMPLATE_FUNCS,
63 parse_info_template_arguments,
64 parse_info_template_node,
65)
66from .linkages import (
67 extract_alt_form_section,
68 parse_linkage,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 FormData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
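# Illustrative only (added note, not in the original module): HEAD_TAG_RE is
# meant to recognize head-template names, either a bare name such as "head"
# or a language-code prefix plus one of the part-of-speech words listed
# above; assuming "en" and "fi" are among the codes from get_all_names("en"):
#   HEAD_TAG_RE.search("head")          # matches the bare template name
#   HEAD_TAG_RE.search("en-noun")       # matches: code + "-" + "noun"
#   HEAD_TAG_RE.search("fi-verb-form")  # matches: "verb" followed by "-"
#   HEAD_TAG_RE.search("en-wikipedia")  # no match: "wikipedia" is not listed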
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280FLOATING_TABLE_TEMPLATES: set[str] = {
281 # az-suffix-form creates a style=floatright div that is otherwise
282 # deleted; if it is not pre-expanded, we can intercept the template
283 # so we add this set into do_not_pre_expand, and intercept the
284 # templates in parse_part_of_speech
285 "az-suffix-forms",
286 "az-inf-p",
287 "kk-suffix-forms",
288 "ky-suffix-forms",
289 "tr-inf-p",
290 "tr-suffix-forms",
291 "tt-suffix-forms",
292 "uz-suffix-forms",
293}
294# These two should contain template names that should always be
295# pre-expanded when *first* processing the tree, or not pre-expanded
296# so that the templates are left in place with their identifying
297# name intact for later filtering.
299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
302# Additional templates to be expanded in the pre-expand phase
303ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
304 "multitrans",
305 "multitrans-nowiki",
306 "trans-top",
307 "trans-top-also",
308 "trans-bottom",
309 "checktrans-top",
310 "checktrans-bottom",
311 "col",
312 "col1",
313 "col2",
314 "col3",
315 "col4",
316 "col5",
317 "col1-u",
318 "col2-u",
319 "col3-u",
320 "col4-u",
321 "col5-u",
322 "check deprecated lang param usage",
323 "deprecated code",
324 "ru-verb-alt-ё",
325 "ru-noun-alt-ё",
326 "ru-adj-alt-ё",
327 "ru-proper noun-alt-ё",
328 "ru-pos-alt-ё",
329 "ru-alt-ё",
330 "inflection of",
331 "no deprecated lang param usage",
332 "transclude", # these produce sense entries (or other lists)
333 "tcl",
334}
336# Inverse linkage for those that have them
337linkage_inverses: dict[str, str] = {
338 # XXX this is not currently used, move to post-processing
339 "synonyms": "synonyms",
340 "hypernyms": "hyponyms",
341 "hyponyms": "hypernyms",
342 "holonyms": "meronyms",
343 "meronyms": "holonyms",
344 "derived": "derived_from",
345 "coordinate_terms": "coordinate_terms",
346 "troponyms": "hypernyms",
347 "antonyms": "antonyms",
348 "instances": "instance_of",
349 "related": "related",
350}
352# Templates that are used to form panels on pages and that
353# should be ignored in various positions
354PANEL_TEMPLATES: set[str] = {
355 "Character info",
356 "CJKV",
357 "French personal pronouns",
358 "French possessive adjectives",
359 "French possessive pronouns",
360 "Han etym",
361 "Japanese demonstratives",
362 "Latn-script",
363 "LDL",
364 "MW1913Abbr",
365 "Number-encoding",
366 "Nuttall",
367 "Spanish possessive adjectives",
368 "Spanish possessive pronouns",
369 "USRegionDisputed",
370 "Webster 1913",
371 "ase-rfr",
372 "attention",
373 "attn",
374 "beer",
375 "broken ref",
376 "ca-compass",
377 "character info",
378 "character info/var",
379 "checksense",
380 "compass-fi",
381 "copyvio suspected",
382 "delete",
383 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
384 "etystub",
385 "examples",
386 "hu-corr",
387 "hu-suff-pron",
388 "interwiktionary",
389 "ja-kanjitab",
390 "ja-kt",
391 "ko-hanja-search",
392 "look",
393 "maintenance box",
394 "maintenance line",
395 "mediagenic terms",
396 "merge",
397 "missing template",
398 "morse links",
399 "move",
400 "multiple images",
401 "no inline",
402 "picdic",
403 "picdicimg",
404 "picdiclabel",
405 "polyominoes",
406 "predidential nomics",
407 "punctuation", # This actually gets pre-expanded
408 "reconstructed",
409 "request box",
410 "rf-sound example",
411 "rfaccents",
412 "rfap",
413 "rfaspect",
414 "rfc",
415 "rfc-auto",
416 "rfc-header",
417 "rfc-level",
418 "rfc-pron-n",
419 "rfc-sense",
420 "rfclarify",
421 "rfd",
422 "rfd-redundant",
423 "rfd-sense",
424 "rfdate",
425 "rfdatek",
426 "rfdef",
427 "rfe",
428 "rfe/dowork",
429 "rfex",
430 "rfexp",
431 "rfform",
432 "rfgender",
433 "rfi",
434 "rfinfl",
435 "rfm",
436 "rfm-sense",
437 "rfp",
438 "rfp-old",
439 "rfquote",
440 "rfquote-sense",
441 "rfquotek",
442 "rfref",
443 "rfscript",
444 "rft2",
445 "rftaxon",
446 "rftone",
447 "rftranslit",
448 "rfv",
449 "rfv-etym",
450 "rfv-pron",
451 "rfv-quote",
452 "rfv-sense",
453 "selfref",
454 "split",
455 "stroke order", # XXX consider capturing this?
456 "stub entry",
457 "t-needed",
458 "tbot entry",
459 "tea room",
460 "tea room sense",
461 # "ttbc", - XXX needed in at least on/Preposition/Translation page
462 "unblock",
463 "unsupportedpage",
464 "video frames",
465 "was wotd",
466 "wrongtitle",
467 "zh-forms",
468 "zh-hanzi-box",
469 "no entry",
470}
472# Template name prefixes used for language-specific panel templates (i.e.,
473# templates that create side boxes or notice boxes or that should generally
474# be ignored).
475PANEL_PREFIXES: set[str] = {
476 "list:compass points/",
477 "list:Gregorian calendar months/",
478 "RQ:",
479}
481# Templates used for wikipedia links.
482wikipedia_templates: set[str] = {
483 "wikipedia",
484 "slim-wikipedia",
485 "w",
486 "W",
487 "swp",
488 "wiki",
489 "Wikipedia",
490 "wtorw",
491}
492for x in PANEL_PREFIXES & wikipedia_templates:
493 print(
494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
495 x
496 )
497 )
499# Mapping from a template name (without language prefix) for the main word
500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
501# it could validly occur. This is used as just a sanity check to give
502# warnings about probably incorrect coding in Wiktionary.
503template_allowed_pos_map: dict[str, list[str]] = {
504 "abbr": ["abbrev"],
505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
506 "plural noun": ["noun", "name"],
507 "plural-noun": ["noun", "name"],
508 "proper noun": ["noun", "name"],
509 "proper-noun": ["name", "noun"],
510 "prop": ["name", "noun"],
511 "verb": ["verb", "phrase"],
512 "gerund": ["verb"],
513 "particle": ["adv", "particle"],
514 "adj": ["adj", "adj_noun"],
515 "pron": ["pron", "noun"],
516 "name": ["name", "noun"],
517 "adv": ["adv", "intj", "conj", "particle"],
518 "phrase": ["phrase", "prep_phrase"],
519 "noun phrase": ["phrase"],
520 "ordinal": ["num"],
521 "number": ["num"],
522 "pos": ["affix", "name", "num"],
523 "suffix": ["suffix", "affix"],
524 "character": ["character"],
525 "letter": ["character"],
526 "kanji": ["character"],
527 "cont": ["abbrev"],
528 "interj": ["intj"],
529 "con": ["conj"],
530 "part": ["particle"],
531 "prep": ["prep", "postp"],
532 "postp": ["postp"],
533 "misspelling": ["noun", "adj", "verb", "adv"],
534 "part-form": ["verb"],
535}
536for k, v in template_allowed_pos_map.items():
537 for x in v:
538 if x not in PARTS_OF_SPEECH:
539 print(
540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
541 "".format(x, k, v)
542 )
543 assert False
546# Templates ignored during etymology extraction, i.e., these will not be listed
547# in the extracted etymology templates.
548ignored_etymology_templates: list[str] = [
549 "...",
550 "IPAchar",
551 "ipachar",
552 "ISBN",
553 "isValidPageName",
554 "redlink category",
555 "deprecated code",
556 "check deprecated lang param usage",
557 "para",
558 "p",
559 "cite",
560 "Cite news",
561 "Cite newsgroup",
562 "cite paper",
563 "cite MLLM 1976",
564 "cite journal",
565 "cite news/documentation",
566 "cite paper/documentation",
567 "cite video game",
568 "cite video game/documentation",
569 "cite newsgroup",
570 "cite newsgroup/documentation",
571 "cite web/documentation",
572 "cite news",
573 "Cite book",
574 "Cite-book",
575 "cite book",
576 "cite web",
577 "cite-usenet",
578 "cite-video/documentation",
579 "Cite-journal",
580 "rfe",
581 "catlangname",
582 "cln",
583 "langname-lite",
584 "no deprecated lang param usage",
585 "mention",
586 "m",
587 "m-self",
588 "link",
589 "l",
590 "ll",
591 "l-self",
592]
593# Regexp for matching ignored etymology template names. This adds certain
594# prefixes to the names listed above.
595ignored_etymology_templates_re = re.compile(
596 r"^((cite-|R:|RQ:).*|"
597 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
598 + r")$"
599)
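# Rough illustration (added comment, not in the original source): the regexp
# ignores the exact names listed above plus anything starting with "cite-",
# "R:" or "RQ:", e.g.:
#   ignored_etymology_templates_re.match("cite web")      # ignored (listed)
#   ignored_etymology_templates_re.match("RQ:Some Work")  # ignored (prefix)
#   ignored_etymology_templates_re.match("inh")           # not ignored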
601# Regexp for matching ignored descendants template names. Right now we just
602# copy the ignored etymology templates
603ignored_descendants_templates_re = ignored_etymology_templates_re
605# Set of template names that are used to define usage examples. If the usage
606# example contains one of these templates, then its type is set to
607# "example"
608usex_templates: set[str] = {
609 "afex",
610 "affixusex",
611 "co", # {{collocation}} acts like a example template, specifically for
612 # pairs of combinations of words that are more common than you'd
613 # except would be randomly; hlavní#Czech
614 "coi",
615 "collocation",
616 "el-example",
617 "el-x",
618 "example",
619 "examples",
620 "he-usex",
621 "he-x",
622 "hi-usex",
623 "hi-x",
624 "ja-usex-inline",
625 "ja-usex",
626 "ja-x",
627 "jbo-example",
628 "jbo-x",
629 "km-usex",
630 "km-x",
631 "ko-usex",
632 "ko-x",
633 "lo-usex",
634 "lo-x",
635 "ne-x",
636 "ne-usex",
637 "prefixusex",
638 "ryu-usex",
639 "ryu-x",
640 "shn-usex",
641 "shn-x",
642 "suffixusex",
643 "th-usex",
644 "th-x",
645 "ur-usex",
646 "ur-x",
647 "usex",
648 "usex-suffix",
649 "ux",
650 "uxi",
651}
653stop_head_at_these_templates: set[str] = {
654 "category",
655 "cat",
656 "topics",
657 "catlangname",
658 "c",
659 "C",
660 "top",
661 "cln",
662}
664# Set of template names that are used to define quotation examples. If the
665# usage example contains one of these templates, then its type is set to
666# "quotation".
667quotation_templates: set[str] = {
668 "collapse-quote",
669 "quote-av",
670 "quote-book",
671 "quote-GYLD",
672 "quote-hansard",
673 "quotei",
674 "quote-journal",
675 "quotelite",
676 "quote-mailing list",
677 "quote-meta",
678 "quote-newsgroup",
679 "quote-song",
680 "quote-text",
681 "quote",
682 "quote-us-patent",
683 "quote-video game",
684 "quote-web",
685 "quote-wikipedia",
686 "wikiquote",
687 "Wikiquote",
688}
690taxonomy_templates = {
691 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
692 "taxfmt",
693 "taxlink",
694 "taxlink2",
695 "taxlinknew",
696 "taxlook",
697}
699# Template names; this set was extracted from template_linkage_mappings,
700# because the code using template_linkage_mappings was actually not used
701# (but not removed).
702template_linkages_to_ignore_in_examples: set[str] = {
703 "syn",
704 "synonyms",
705 "ant",
706 "antonyms",
707 "hyp",
708 "hyponyms",
709 "der",
710 "derived terms",
711 "coordinate terms",
712 "cot",
713 "rel",
714 "col",
715 "inline alt forms",
716 "alti",
717 "comeronyms",
718 "holonyms",
719 "holo",
720 "hypernyms",
721 "hyper",
722 "meronyms",
723 "mero",
724 "troponyms",
725 "perfectives",
726 "pf",
727 "imperfectives",
728 "impf",
729 "syndiff",
730 "synsee",
731 # not linkage nor example templates
732 "sense",
733 "s",
734 "color panel",
735 "colour panel",
736}
738# Maps template name used in a word sense to a linkage field that it adds.
739sense_linkage_templates: dict[str, str] = {
740 "syn": "synonyms",
741 "synonyms": "synonyms",
742 "synsee": "synonyms",
743 "syndiff": "synonyms",
744 "hyp": "hyponyms",
745 "hyponyms": "hyponyms",
746 "ant": "antonyms",
747 "antonyms": "antonyms",
748 "alti": "related",
749 "inline alt forms": "related",
750 "coordinate terms": "coordinate_terms",
751 "cot": "coordinate_terms",
752 "comeronyms": "related",
753 "holonyms": "holonyms",
754 "holo": "holonyms",
755 "hypernyms": "hypernyms",
756 "hyper": "hypernyms",
757 "meronyms": "meronyms",
758 "mero": "meronyms",
759 "troponyms": "troponyms",
760 "perfectives": "related",
761 "pf": "related",
762 "imperfectives": "related",
763 "impf": "related",
764}
766sense_linkage_templates_tags: dict[str, list[str]] = {
767 "alti": ["alternative"],
768 "inline alt forms": ["alternative"],
769 "comeronyms": ["comeronym"],
770 "perfectives": ["perfective"],
771 "pf": ["perfective"],
772 "imperfectives": ["imperfective"],
773 "impf": ["imperfective"],
774}
777def decode_html_entities(v: Union[str, int]) -> str:
778 """Decodes HTML entities from a value, converting them to the respective
779 Unicode characters/strings."""
780 if isinstance(v, int):
781 # I changed this to return str(v) instead of v = str(v),
782 # but there might have been the intention to have more logic
783 # here. html.unescape would not do anything special with an integer,
784 # it needs html escape symbols (&xx;).
785 return str(v)
786 return html.unescape(v)
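# Example usage (illustrative, not part of the original file):
#   decode_html_entities("a &amp; b")   == "a & b"
#   decode_html_entities("&lt;br&gt;")  == "<br>"
#   decode_html_entities(65)            == "65"   # ints are just stringified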
789def parse_sense_linkage(
790 wxr: WiktextractContext,
791 data: SenseData,
792 name: str,
793 ht: TemplateArgs,
794 pos: str,
795) -> None:
796 """Parses a linkage (synonym, etc) specified in a word sense."""
797 assert isinstance(wxr, WiktextractContext)
798 assert isinstance(data, dict)
799 assert isinstance(name, str)
800 assert isinstance(ht, dict)
801 field = sense_linkage_templates[name]
802 field_tags = sense_linkage_templates_tags.get(name, [])
803 for i in range(2, 20):
804 w = ht.get(i) or ""
805 w = clean_node(wxr, data, w)
806 is_thesaurus = False
807 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
808 if w.startswith(alias):
809 is_thesaurus = True
810 w = w[len(alias) :]
811 if w != wxr.wtp.title:
812 from ...thesaurus import search_thesaurus
814 lang_code = clean_node(wxr, None, ht.get(1, ""))
815 for t_data in search_thesaurus(
816 wxr.thesaurus_db_conn, # type: ignore
817 w,
818 lang_code,
819 pos,
820 field, # type: ignore
821 ):
822 l_data: LinkageData = {
823 "word": t_data.term,
824 "source": "Thesaurus:" + w,
825 }
826 if len(t_data.tags) > 0:
827 l_data["tags"] = t_data.tags
828 if len(t_data.raw_tags) > 0:
829 l_data["raw_tags"] = t_data.raw_tags
830 data_append(data, field, l_data)
831 break
832 if not w:
833 break
834 if is_thesaurus:
835 continue
836 tags: list[str] = []
837 topics: list[str] = []
838 english: Optional[str] = None
839 # Try to find qualifiers for this synonym
840 q = ht.get("q{}".format(i - 1))
841 if q:
842 cls = classify_desc(q)
843 if cls == "tags":
844 tagsets1, topics1 = decode_tags(q)
845 for ts in tagsets1:
846 tags.extend(ts)
847 topics.extend(topics1)
848 elif cls == "english":
849 if english:
850 english += "; " + q
851 else:
852 english = q
853 # Try to find English translation for this synonym
854 t = ht.get("t{}".format(i - 1))
855 if t:
856 if english:
857 english += "; " + t
858 else:
859 english = t
861 # See if the linkage contains a parenthesized alt
862 alt = None
863 m = re.search(r"\(([^)]+)\)$", w)
864 if m:
865 w = w[: m.start()].strip()
866 alt = m.group(1)
868 dt = {"word": w}
869 if field_tags:
870 data_extend(dt, "tags", field_tags)
871 if tags:
872 data_extend(dt, "tags", tags)
873 if topics:
874 data_extend(dt, "topics", topics)
875 if english:
876 dt["english"] = english # DEPRECATED for "translation"
877 dt["translation"] = english
878 if alt:
879 dt["alt"] = alt
880 data_append(data, field, dt)
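# Rough sketch of the intended effect (illustrative; the exact result also
# depends on clean_node(), classify_desc() and the thesaurus database): a
# sense containing {{syn|en|error|mistake|q1=formal}} reaches this function
# as name="syn", ht={1: "en", 2: "error", 3: "mistake", "q1": "formal"} and
# appends roughly the following linkages to data["synonyms"]:
#   {"word": "error", "tags": ["formal"]}
#   {"word": "mistake"}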
883EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
884example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
885captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
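# Illustration (added comment): the splitter separates the original-language
# text from the translation in an expanded usage-example line, e.g.
#   example_splitter_re.split("tämä on esimerkki ― this is an example")
#   == ["tämä on esimerkki", "this is an example"]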
888def synch_splits_with_args(
889 line: str, targs: TemplateArgs
890) -> Optional[list[str]]:
891 """If it looks like there's something weird with how a line of example
892 text has been split, this function will do the splitting after counting
893 occurrences of the splitting regex inside the two main template arguments
894 containing the string data for the original language example and the
895 English translations.
896 """
897 # Previously, we split without capturing groups, but here we want to
898 # keep the original splitting hyphen regex intact.
899 fparts = captured_splitters_re.split(line)
900 new_parts = []
901 # ["First", " – ", "second", " – ", "third..."] from OL argument
902 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
903 new_parts.append("".join(fparts[:first]))
904 # Translation argument
905 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
906 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
907 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
908 new_parts.append("".join(fparts[first + 1 : second]))
910 if all(new_parts): # no empty strings from the above spaghetti
911 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
912 return new_parts
913 else:
914 return None
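# Hypothetical example of the resynchronization above (illustrative only):
# when the original-language argument itself contains a splitter dash,
#   line  = "a ― b ― this is a and b"
#   targs = {2: "a ― b", 3: "this is a and b"}
# a naive split would yield three parts, but counting the splitters inside
# the template arguments regroups them as ["a ― b", "this is a and b"].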
917QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
918QUALIFIERS_RE = re.compile(QUALIFIERS)
919# (...): ... or (...(...)...): ...
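# Illustrative matches (added comment, not in the original):
#   QUALIFIERS_RE.match("(transitive, slang): to run").group(1)
#       == "transitive, slang"
#   QUALIFIERS_RE.match("(obsolete (rare)): ...").group(1)
#       == "obsolete (rare)"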
922def parse_language(
923 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
924) -> list[WordData]:
925 """Iterates over the text of the page, returning words (parts-of-speech)
926 defined on the page one at a time. (Individual word senses for the
927 same part-of-speech are typically encoded in the same entry.)"""
928 # imported here to avoid circular import
929 from .pronunciation import parse_pronunciation
931 assert isinstance(wxr, WiktextractContext)
932 assert isinstance(langnode, WikiNode)
933 assert isinstance(language, str)
934 assert isinstance(lang_code, str)
935 # print("parse_language", language)
937 is_reconstruction = False
938 word: str = wxr.wtp.title # type: ignore[assignment]
939 unsupported_prefix = "Unsupported titles/"
940 if word.startswith(unsupported_prefix):
941 w = word[len(unsupported_prefix) :]
942 if w in unsupported_title_map:
943 word = unsupported_title_map[w]
944 else:
945 wxr.wtp.error(
946 "Unimplemented unsupported title: {}".format(word),
947 sortid="page/870",
948 )
949 word = w
950 elif word.startswith("Reconstruction:"):
951 word = word[word.find("/") + 1 :]
952 is_reconstruction = True
954 base_data: WordData = {
955 "word": word,
956 "lang": language,
957 "lang_code": lang_code,
958 }
959 if is_reconstruction:
960 data_append(base_data, "tags", "reconstruction")
961 sense_data: SenseData = {}
962 pos_data: WordData = {} # For a current part-of-speech
963 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
964 etym_data: WordData = {} # For one etymology
965 sense_datas: list[SenseData] = []
966 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
967 # Never reset, do not use as data
968 level_four_datas: list[WordData] = []
969 etym_datas: list[WordData] = []
970 page_datas: list[WordData] = []
971 have_etym = False
972 inside_level_four = False # This is for checking if the etymology section
973 # or article has a Pronunciation section, for Chinese mostly; because
974 # Chinese articles can have three level three sections (two etymology
975 # sections and pronunciation sections) one after another, we need a kludge
976 # to better keep track of whether we're in a normal "etym" or inside a
977 # "level four" (which is what we've turned the level three Pron sections
978 # into in fix_subtitle_hierarchy(); all other sections are demoted by
979 # a step).
980 stack: list[str] = [] # names of items on the "stack"
982 def merge_base(data: WordData, base: WordData) -> None:
983 for k, v in base.items():
984 # Copy the value to ensure that we don't share lists or
985 # dicts between structures (even nested ones).
986 v = copy.deepcopy(v)
987 if k not in data:
988 # The list was copied above, so this will not create shared ref
989 data[k] = v # type: ignore[literal-required]
990 continue
991 if data[k] == v: # type: ignore[literal-required]
992 continue
993 if (
994 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
995 or isinstance(
996 v,
997 (list, tuple), # Should this be "and"?
998 )
999 ):
1000 data[k] = list(data[k]) + list(v) # type: ignore
1001 elif data[k] != v: # type: ignore[literal-required]
1002 wxr.wtp.warning(
1003 "conflicting values for {} in merge_base: "
1004 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1005 sortid="page/904",
1006 )
1008 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1009 """Remove unnecessary keys from dict values
1010 in a list comprehension..."""
1011 if key in pron:
1012 pron.pop(key) # type: ignore
1013 return pron
1015 # If the result has sounds, eliminate sounds that have a prefix that
1016 # does not match "word" or one of "forms"
1017 if "sounds" in data and "word" in data:
1018 accepted = [data["word"]]
1019 accepted.extend(f["form"] for f in data.get("forms", dict()))
1020 data["sounds"] = list(
1021 s
1022 for s in data["sounds"]
1023 if "form" not in s or s["form"] in accepted
1024 )
1025 # If the result has sounds, eliminate sounds that have a pos that
1026 # does not match "pos"
1027 if "sounds" in data and "pos" in data:
1028 data["sounds"] = list(
1029 complementary_pop(s, "pos")
1030 for s in data["sounds"]
1031 # "pos" is not a field of SoundData, correctly, so we're
1032 # removing it here. It's a kludge on a kludge on a kludge.
1033 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1034 )
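    # Rough sketch of merge_base() above (illustrative, not part of the
    # original code): scalar fields are copied when missing, list fields are
    # concatenated, and conflicting scalar values only produce a warning:
    #   data = {"word": "foo", "tags": ["a"]}
    #   base = {"word": "foo", "lang": "English", "tags": ["b"]}
    #   merge_base(data, base)
    #   # data == {"word": "foo", "lang": "English", "tags": ["a", "b"]}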
1036 def push_sense(sorting_ordinal: int | None = None) -> bool:
1037 """Starts collecting data for a new word sense. This returns True
1038 if a sense was added."""
1039 nonlocal sense_data
1040 if sorting_ordinal is None:
1041 sorting_ordinal = sense_ordinal
1042 tags = sense_data.get("tags", ())
1043 if (
1044 not sense_data.get("glosses")
1045 and "translation-hub" not in tags
1046 and "no-gloss" not in tags
1047 ):
1048 return False
1050 if (
1051 (
1052 "participle" in sense_data.get("tags", ())
1053 or "infinitive" in sense_data.get("tags", ())
1054 )
1055 and "alt_of" not in sense_data
1056 and "form_of" not in sense_data
1057 and "etymology_text" in etym_data
1058 and etym_data["etymology_text"] != ""
1059 ):
1060 etym = etym_data["etymology_text"]
1061 etym = etym.split(". ")[0]
1062 ret = parse_alt_or_inflection_of(wxr, etym, set())
1063 if ret is not None:
1064 tags, lst = ret
1065 assert isinstance(lst, (list, tuple))
1066 if "form-of" in tags:
1067 data_extend(sense_data, "form_of", lst)
1068 data_extend(sense_data, "tags", tags)
1069 elif "alt-of" in tags:
1070 data_extend(sense_data, "alt_of", lst)
1071 data_extend(sense_data, "tags", tags)
1073 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
1074 "tags", ()
1075 ):
1076 data_append(sense_data, "tags", "no-gloss")
1078 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore
1079 sense_datas.append(sense_data)
1080 sense_data = {}
1081 return True
1083 def push_pos(sorting_ordinal: int | None = None) -> None:
1084 """Starts collecting data for a new part-of-speech."""
1085 nonlocal pos_data
1086 nonlocal sense_datas
1087 push_sense(sorting_ordinal)
1088 if wxr.wtp.subsection:
1089 data: WordData = {"senses": sense_datas}
1090 merge_base(data, pos_data)
1091 level_four_datas.append(data)
1092 pos_data = {}
1093 sense_datas = []
1094 wxr.wtp.start_subsection(None)
1096 def push_level_four_section(clear_sound_data: bool) -> None:
1097 """Starts collecting data for a new level four sections, which
1098 is usually virtual and empty, unless the article has Chinese
1099 'Pronunciation' sections that are etymology-section-like but
1100 under etymology, and at the same level in the source. We modify
1101 the source to demote Pronunciation sections like that to level
1102 4, and other sections one step lower."""
1103 nonlocal level_four_data
1104 nonlocal level_four_datas
1105 nonlocal etym_datas
1106 push_pos()
1107 # print(f"======\n{etym_data=}")
1108 # print(f"======\n{etym_datas=}")
1109 # print(f"======\n{level_four_data=}")
1110 # print(f"======\n{level_four_datas=}")
1111 for data in level_four_datas:
1112 merge_base(data, level_four_data)
1113 etym_datas.append(data)
1114 for data in etym_datas:
1115 merge_base(data, etym_data)
1116 page_datas.append(data)
1117 if clear_sound_data:
1118 level_four_data = {}
1119 level_four_datas = []
1120 etym_datas = []
1122 def push_etym() -> None:
1123 """Starts collecting data for a new etymology."""
1124 nonlocal etym_data
1125 nonlocal etym_datas
1126 nonlocal have_etym
1127 nonlocal inside_level_four
1128 have_etym = True
1129 push_level_four_section(False)
1130 inside_level_four = False
1131 # the etymology section could be under a pronunciation section
1132 etym_data = (
1133 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1134 )
1136 def select_data() -> WordData:
1137 """Selects where to store data (pos or etym) based on whether we
1138 are inside a pos (part-of-speech)."""
1139 # print(f"{wxr.wtp.subsection=}")
1140 # print(f"{stack=}")
1141 if wxr.wtp.subsection is not None:
1142 return pos_data
1143 if inside_level_four:
1144 return level_four_data
1145 if stack[-1] == language:
1146 return base_data
1147 return etym_data
1149 term_label_templates: list[TemplateData] = []
1151 def head_post_template_fn(
1152 name: str, ht: TemplateArgs, expansion: str
1153 ) -> Optional[str]:
1154 """Handles special templates in the head section of a word. Head
1155 section is the text after part-of-speech subtitle and before word
1156 sense list. Typically it generates the bold line for the word, but
1157 may also contain other useful information that often ends up in
1158 side boxes. We want to capture some of that additional information."""
1159 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1160 if is_panel_template(wxr, name):
1161 # Completely ignore these templates (not even recorded in
1162 # head_templates)
1163 return ""
1164 if name == "head":
1165 # XXX are these also captured in forms? Should this special case
1166 # be removed?
1167 t = ht.get(2, "")
1168 if t == "pinyin":
1169 data_append(pos_data, "tags", "Pinyin")
1170 elif t == "romanization":
1171 data_append(pos_data, "tags", "romanization")
1172 if (
1173 HEAD_TAG_RE.search(name) is not None
1174 or name in WORD_LEVEL_HEAD_TEMPLATES
1175 ):
1176 args_ht = clean_template_args(wxr, ht)
1177 cleaned_expansion = clean_node(wxr, None, expansion)
1178 dt: TemplateData = {
1179 "name": name,
1180 "args": args_ht,
1181 "expansion": cleaned_expansion,
1182 }
1183 data_append(pos_data, "head_templates", dt)
1184 if name in WORD_LEVEL_HEAD_TEMPLATES:
1185 term_label_templates.append(dt)
1186 # Squash these, their tags are applied to the whole word,
1187 # and some cause problems like "term-label"
1188 return ""
1190 # The following are both captured in head_templates and parsed
1191 # separately
1193 if name in wikipedia_templates:
1194 # Note: various places expect to have content from wikipedia
1195 # templates, so cannot convert this to empty
1196 parse_wikipedia_template(wxr, pos_data, ht)
1197 return None
1199 if name == "number box":
1200 # XXX extract numeric value?
1201 return ""
1202 if name == "enum":
1203 # XXX extract?
1204 return ""
1205 if name == "cardinalbox":
1206 # XXX extract similar to enum?
1207 # XXX this can also occur in top-level under language
1208 return ""
1209 if name == "Han simplified forms":
1210 # XXX extract?
1211 return ""
1212 # if name == "ja-kanji forms":
1213 # # XXX extract?
1214 # return ""
1215 # if name == "vi-readings":
1216 # # XXX extract?
1217 # return ""
1218 # if name == "ja-kanji":
1219 # # XXX extract?
1220 # return ""
1221 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
1222 # XXX extract?
1223 return ""
1225 return None
1227 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1228 """Parses the subsection for a part-of-speech under a language on
1229 a page."""
1230 assert isinstance(posnode, WikiNode)
1231 assert isinstance(pos, str)
1232 # print("parse_part_of_speech", pos)
1233 pos_data["pos"] = pos
1234 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1235 lists: list[list[WikiNode]] = [[]] # list of lists
1236 first_para = True
1237 first_head_tmplt = True
1238 collecting_head = True
1239 start_of_paragraph = True
1241 # XXX extract templates from posnode with recursively_extract
1242 # that break stuff, like ja-kanji or az-suffix-form.
1243 # Do the extraction with a list of template names, combined from
1244 # different lists, then separate out them into different lists
1245 # that are handled at different points of the POS section.
1246 # First, extract az-suffix-form, put it in `inflection`,
1247 # and parse `inflection`'s content when appropriate later.
1248 # The contents of az-suffix-form (and ja-kanji) that generate
1249 # divs with "floatright" in their style gets deleted by
1250 # clean_value, so templates that slip through from here won't
1251 # break anything.
1252 # XXX bookmark
1253 # print("===================")
1254 # print(posnode.children)
1256 floaters, poschildren = recursively_extract(
1257 posnode.children,
1258 lambda x: (
1259 isinstance(x, WikiNode)
1260 and (
1261 (
1262 isinstance(x, TemplateNode)
1263 and x.template_name in FLOATING_TABLE_TEMPLATES
1264 )
1265 or (
1266 x.kind == NodeKind.LINK
1267 # Need to check for stringiness because some links are
1268 # broken; for example, if a template is missing an
1269 # argument, a link might look like `[[{{{1}}}...]]`
1270 and len(x.largs) > 0
1271 and len(x.largs[0]) > 0
1272 and isinstance(x.largs[0][0], str)
1273 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1274 )
1275 )
1276 ),
1277 )
1278 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1279 tempnode.largs = [["Inflection"]]
1280 tempnode.children = floaters
1281 parse_inflection(tempnode, "Floating Div", pos)
1282 # print(poschildren)
1283 # XXX new above
1285 if not poschildren:
1286 if not floaters:
1287 wxr.wtp.debug(
1288 "PoS section without contents",
1289 sortid="en/page/1051/20230612",
1290 )
1291 else:
1292 wxr.wtp.debug(
1293 "PoS section without contents except for a floating table",
1294 sortid="en/page/1056/20230612",
1295 )
1296 return
1298 for node in poschildren:
1299 if isinstance(node, str):
1300 for m in re.finditer(r"\n+|[^\n]+", node):
1301 p = m.group(0)
1302 if p.startswith("\n\n") and pre:
1303 first_para = False
1304 start_of_paragraph = True
1305 break
1306 if p and collecting_head:
1307 pre[-1].append(p)
1308 continue
1309 assert isinstance(node, WikiNode)
1310 kind = node.kind
1311 if kind == NodeKind.LIST:
1312 lists[-1].append(node)
1313 collecting_head = False
1314 start_of_paragraph = True
1315 continue
1316 elif kind in LEVEL_KINDS:
1317 # Stop parsing section if encountering any kind of
1318 # level header (like ===Noun=== or ====Further Reading====).
1319 # At a quick glance, this should be the default behavior,
1320 # but if some kinds of source articles have sub-sub-sections
1321 # that should be parsed XXX it should be handled by changing
1322 # this break.
1323 break
1324 elif collecting_head and kind == NodeKind.LINK:
1325 # We might collect relevant links as they are often pictures
1326 # relating to the word
1327 if len(node.largs[0]) >= 1 and isinstance(
1328 node.largs[0][0], str
1329 ):
1330 if node.largs[0][0].startswith(
1331 ns_title_prefix_tuple(wxr, "Category")
1332 ):
1333 # [[Category:...]]
1334 # We're at the end of the file, probably, so stop
1335 # here. Otherwise the head will get garbage.
1336 break
1337 if node.largs[0][0].startswith(
1338 ns_title_prefix_tuple(wxr, "File")
1339 ):
1340 # Skips file links
1341 continue
1342 start_of_paragraph = False
1343 pre[-1].extend(node.largs[-1])
1344 elif kind == NodeKind.HTML:
1345 if node.sarg == "br":
1346 if pre[-1]:
1347 pre.append([]) # Switch to next head
1348 lists.append([]) # Lists parallels pre
1349 collecting_head = True
1350 start_of_paragraph = True
1351 elif collecting_head and node.sarg not in (
1352 "gallery",
1353 "ref",
1354 "cite",
1355 "caption",
1356 ):
1357 start_of_paragraph = False
1358 pre[-1].append(node)
1359 else:
1360 start_of_paragraph = False
1361 elif isinstance(node, TemplateNode):
1362 # XXX Insert code here that disambiguates between
1363 # templates that generate word heads and templates
1364 # that don't.
1365 # There's head_tag_re that seems like a regex meant
1366 # to identify head templates. Too bad it's None.
1368 # ignore {{category}}, {{cat}}... etc.
1369 if node.template_name in stop_head_at_these_templates:
1370 # we've reached a template that should be at the end,
1371 continue
1373 # skip these templates; panel_templates is already used
1374 # to skip certain templates else, but it also applies to
1375 # head parsing quite well.
1376 # node.largs[0][0] should always be str, but can't type-check
1377 # that.
1378 if is_panel_template(wxr, node.template_name):
1379 continue
1380 # skip these templates
1381 # if node.largs[0][0] in skip_these_templates_in_head:
1382 # first_head_tmplt = False # no first_head_tmplt at all
1383 # start_of_paragraph = False
1384 # continue
1386 if first_head_tmplt and pre[-1]:
1387 first_head_tmplt = False
1388 start_of_paragraph = False
1389 pre[-1].append(node)
1390 elif pre[-1] and start_of_paragraph:
1391 pre.append([]) # Switch to the next head
1392 lists.append([]) # lists parallel pre
1393 collecting_head = True
1394 start_of_paragraph = False
1395 pre[-1].append(node)
1396 else:
1397 pre[-1].append(node)
1398 elif first_para:
1399 start_of_paragraph = False
1400 if collecting_head:
1401 pre[-1].append(node)
1402 # XXX use template_fn in clean_node to check that the head macro
1403 # is compatible with the current part-of-speech and generate warning
1404 # if not. Use template_allowed_pos_map.
1406 # Clean up empty pairs, and fix messes with extra newlines that
1407 # separate templates that are followed by lists; wiktextract issue #314
1409 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1410 cleaned_lists: list[list[WikiNode]] = []
1411 pairless_pre_index = None
1413 for pre1, ls in zip(pre, lists):
1414 if pre1 and not ls:
1415 pairless_pre_index = len(cleaned_pre)
1416 if not pre1 and not ls:
1417 # skip [] + []
1418 continue
1419 if not ls and all(
1420 (isinstance(x, str) and not x.strip()) for x in pre1
1421 ):
1422 # skip ["\n", " "] + []
1423 continue
1424 if ls and not pre1:
1425 if pairless_pre_index is not None:
1426 cleaned_lists[pairless_pre_index] = ls
1427 pairless_pre_index = None
1428 continue
1429 cleaned_pre.append(pre1)
1430 cleaned_lists.append(ls)
1432 pre = cleaned_pre
1433 lists = cleaned_lists
1435 there_are_many_heads = len(pre) > 1
1436 header_tags: list[str] = []
1437 header_topics: list[str] = []
1438 previous_head_had_list = False
1440 if not any(g for g in lists):
1441 process_gloss_without_list(
1442 poschildren, pos, pos_data, header_tags, header_topics
1443 )
1444 else:
1445 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1446 # if len(ls) == 0:
1447 # # don't have gloss list
1448 # # XXX add code here to filter out 'garbage', like text
1449 # # that isn't a head template or head.
1450 # continue
1452 if all(not sl for sl in lists[i:]):
1453 if i == 0:
1454 if isinstance(node, str):
1455 wxr.wtp.debug(
1456 "first head without list of senses,"
1457 "string: '{}[...]', {}/{}".format(
1458 node[:20], word, language
1459 ),
1460 sortid="page/1689/20221215",
1461 )
1462 if isinstance(node, WikiNode):
1463 if node.largs and node.largs[0][0] in [
1464 "Han char",
1465 ]:
1466 # just ignore these templates
1467 pass
1468 else:
1469 wxr.wtp.debug(
1470 "first head without "
1471 "list of senses, "
1472 "template node "
1473 "{}, {}/{}".format(
1474 node.largs, word, language
1475 ),
1476 sortid="page/1694/20221215",
1477 )
1478 else:
1479 wxr.wtp.debug(
1480 "first head without list of senses, "
1481 "{}/{}".format(word, language),
1482 sortid="page/1700/20221215",
1483 )
1484 # no break here so that the first head always
1485 # gets processed.
1486 else:
1487 if isinstance(node, str):
1488 wxr.wtp.debug(
1489 "later head without list of senses,"
1490 "string: '{}[...]', {}/{}".format(
1491 node[:20], word, language
1492 ),
1493 sortid="page/1708/20221215",
1494 )
1495 if isinstance(node, WikiNode):
1496 wxr.wtp.debug(
1497 "later head without list of senses,"
1498 "template node "
1499 "{}, {}/{}".format(
1500 node.sarg if node.sarg else node.largs,
1501 word,
1502 language,
1503 ),
1504 sortid="page/1713/20221215",
1505 )
1506 else:
1507 wxr.wtp.debug(
1508 "later head without list of senses, "
1509 "{}/{}".format(word, language),
1510 sortid="page/1719/20221215",
1511 )
1512 break
1513 head_group = i + 1 if there_are_many_heads else None
1514 # print("parse_part_of_speech: {}: {}: pre={}"
1515 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1517 if previous_head_had_list:
1518 # We use a boolean flag here because we want to be able to
1519 # let the header_tags data pass through after the loop
1520 # is over without accidentally emptying it, if there are
1521 # no pos_datas and we need a dummy data.
1522 header_tags.clear()
1523 header_topics.clear()
1525 process_gloss_header(
1526 pre1, pos, head_group, pos_data, header_tags, header_topics
1527 )
1528 for ln in ls:
1529 # Parse each list associated with this head.
1530 for node in ln.children:
1531 # Parse nodes in l.children recursively.
1532 # The recursion function uses push_sense() to
1533 # add stuff into sense_datas, and returns True or
1534 # False if something is added, which bubbles upward.
1535 # If the bubble is "True", then higher levels of
1536 # the recursion will not push_sense(), because
1537 # the data is already pushed into a sub-gloss
1538 # downstream, unless the higher level has examples
1539 # that need to be put somewhere.
1540 common_data: SenseData = {
1541 "tags": list(header_tags),
1542 "topics": list(header_topics),
1543 }
1544 if head_group:
1545 common_data["head_nr"] = head_group
1546 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1548 if len(ls) > 0:
1549 previous_head_had_list = True
1550 else:
1551 previous_head_had_list = False
1553 # If there are no senses extracted, add a dummy sense. We want to
1554 # keep tags extracted from the head for the dummy sense.
1555 push_sense() # Make sure unfinished data pushed, and start clean sense
1556 if len(sense_datas) == 0:
1557 data_extend(sense_data, "tags", header_tags)
1558 data_extend(sense_data, "topics", header_topics)
1559 data_append(sense_data, "tags", "no-gloss")
1560 push_sense()
1562 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) # type: ignore
1564 for sd in sense_datas:
1565 if "__temp_sense_sorting_ordinal" in sd: 1565 ↛ 1564line 1565 didn't jump to line 1564 because the condition on line 1565 was always true
1566 del sd["__temp_sense_sorting_ordinal"] # type: ignore
1568 def process_gloss_header(
1569 header_nodes: list[Union[WikiNode, str]],
1570 pos_type: str,
1571 header_group: Optional[int],
1572 pos_data: WordData,
1573 header_tags: list[str],
1574 header_topics: list[str],
1575 ) -> None:
1576 ruby = []
1577 links: list[str] = []
1579 # process template parse nodes here
1580 new_nodes = []
1581 info_template_data = []
1582 for node in header_nodes:
1583 # print(f"{node=}")
1584 info_data, info_out = parse_info_template_node(wxr, node, "head")
1585 if info_data or info_out:
1586 if info_data:
1587 info_template_data.append(info_data)
1588 if info_out: # including just the original node
1589 new_nodes.append(info_out)
1590 else:
1591 new_nodes.append(node)
1592 header_nodes = new_nodes
1594 if info_template_data:
1595 if "info_templates" not in pos_data: 1595 ↛ 1598line 1595 didn't jump to line 1598 because the condition on line 1595 was always true
1596 pos_data["info_templates"] = info_template_data
1597 else:
1598 pos_data["info_templates"].extend(info_template_data)
1600 if not word.isalnum():
1601 # `-` is kosher, add more of these if needed.
1602 if word.replace("-", "").isalnum():
1603 pass
1604 else:
1605 # if the word contains non-letter or -number characters, it
1606 # might have something that messes with split-at-semi-comma; we
1607 # collect links so that we can skip splitting them.
1608 exp = wxr.wtp.parse(
1609 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1610 )
1611 link_nodes, _ = recursively_extract(
1612 exp.children,
1613 lambda x: isinstance(x, WikiNode)
1614 and x.kind == NodeKind.LINK,
1615 )
1616 for ln in link_nodes:
1617 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1618 if not ltext.isalnum():
1619 links.append(ltext)
1620 if word not in links:
1621 links.append(word)
1623 if lang_code == "ja":
1624 exp = wxr.wtp.parse(
1625 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1626 )
1627 rub, _ = recursively_extract(
1628 exp.children,
1629 lambda x: isinstance(x, WikiNode)
1630 and x.kind == NodeKind.HTML
1631 and x.sarg == "ruby",
1632 )
1633 if rub is not None:
1634 for r in rub:
1635 if TYPE_CHECKING:
1636 # we know the lambda above in recursively_extract
1637 # returns only WikiNodes in rub
1638 assert isinstance(r, WikiNode)
1639 rt = parse_ruby(wxr, r)
1640 if rt is not None:
1641 ruby.append(rt)
1642 elif lang_code == "vi":
1643 # Handle vi-readings templates that have a weird structure for
1644 # Chu Nom Vietnamese character heads
1645 # https://en.wiktionary.org/wiki/Template:vi-readings
1646 new_header_nodes = []
1647 related_readings: list[LinkageData] = []
1648 for node in header_nodes:
1649 if (
1650 isinstance(node, TemplateNode)
1651 and node.template_name == "vi-readings"
1652 ):
1653 print(node.template_parameters)
1654 for parameter, tag in (
1655 ("hanviet", "han-viet-reading"),
1656 ("nom", "nom-reading"),
1657 # we ignore the fanqie parameter "phienthiet"
1658 ):
1659 arg = node.template_parameters.get(parameter)
1660 if arg is not None:
1661 text = clean_node(wxr, None, arg)
1662 for w in text.split(","):
1663 # ignore - separated references
1664 if "-" in w:
1665 w = w[: w.index("-")]
1666 w = w.strip()
1667 related_readings.append(
1668 LinkageData(word=w, tags=[tag])
1669 )
1670 continue
1672 # Skip the vi-reading template for the rest of the head parsing
1673 new_header_nodes.append(node)
1674 if len(related_readings) > 0:
1675 data_extend(pos_data, "related", related_readings)
1676 header_nodes = new_header_nodes
1678 header_text = clean_node(
1679 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1680 )
1682 if not header_text.strip():
1683 return
1685 term_label_tags: list[str] = []
1686 term_label_topics: list[str] = []
1687 if len(term_label_templates) > 0:
1688 # parse term label templates; if there are other similar kinds
1689 # of templates in headers that you want to squash and apply as
1690 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1691 for templ_data in term_label_templates:
1692 # print(templ_data)
1693 expan = templ_data.get("expansion", "").strip("().,; ")
1694 if not expan:
1695 continue
1696 tlb_tagsets, tlb_topics = decode_tags(expan)
1697 for tlb_tags in tlb_tagsets:
1698 if len(tlb_tags) > 0 and not any(
1699 t.startswith("error-") for t in tlb_tags
1700 ):
1701 term_label_tags.extend(tlb_tags)
1702 term_label_topics.extend(tlb_topics)
1703 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1705 header_text = re.sub(r"\s+", " ", header_text)
1706 # print(f"{header_text=}")
1707 parse_word_head(
1708 wxr,
1709 pos_type,
1710 header_text,
1711 pos_data,
1712 is_reconstruction,
1713 header_group,
1714 ruby=ruby,
1715 links=links,
1716 )
1717 if "tags" in pos_data:
1718 # pos_data can get "tags" data from some source; type-checkers
1719 # doesn't like it, so let's ignore it.
1720 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1721 del pos_data["tags"] # type: ignore[typeddict-item]
1722 if len(term_label_tags) > 0:
1723 header_tags.extend(term_label_tags)
1724 if len(term_label_topics) > 0:
1725 header_topics.extend(term_label_topics)
1727 def process_gloss_without_list(
1728 nodes: list[Union[WikiNode, str]],
1729 pos_type: str,
1730 pos_data: WordData,
1731 header_tags: list[str],
1732 header_topics: list[str],
1733 ) -> None:
1734 # gloss text might not be inside a list
1735 header_nodes: list[Union[str, WikiNode]] = []
1736 gloss_nodes: list[Union[str, WikiNode]] = []
1737 for node in strip_nodes(nodes):
1738 if isinstance(node, WikiNode):
1739 if isinstance(node, TemplateNode):
1740 if node.template_name in (
1741 "zh-see",
1742 "ja-see",
1743 "ja-see-kango",
1744 ):
1745 continue # soft redirect
1746 elif (
1747 node.template_name == "head"
1748 or node.template_name.startswith(f"{lang_code}-")
1749 ):
1750 header_nodes.append(node)
1751 continue
1752 elif node.kind in LEVEL_KINDS: # following nodes are not gloss
1753 break
1754 gloss_nodes.append(node)
1756 if len(header_nodes) > 0:
1757 process_gloss_header(
1758 header_nodes,
1759 pos_type,
1760 None,
1761 pos_data,
1762 header_tags,
1763 header_topics,
1764 )
1765 if len(gloss_nodes) > 0:
1766 process_gloss_contents(
1767 gloss_nodes,
1768 pos_type,
1769 {"tags": list(header_tags), "topics": list(header_topics)},
1770 )
1772 def parse_sense_node(
1773 node: Union[str, WikiNode], # never receives str
1774 sense_base: SenseData,
1775 pos: str,
1776 ) -> bool:
1777 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1778 Uses push_sense() to attempt adding data to pos_data in the scope
1779 of parse_language() when it reaches deep in the recursion. push_sense()
1780 returns True if it succeeds, and that is bubbled up the stack; if
1781 a sense was added downstream, the higher levels (whose shared data
1782 was already added by a subsense) do not push_sense(), unless it
1783 has examples that need to be put somewhere.
1784 """
1785 assert isinstance(sense_base, dict) # Added to every sense deeper in
1787 nonlocal sense_ordinal
1788 my_ordinal = sense_ordinal # copies, not a reference
1789 sense_ordinal += 1 # only use for sorting
1791 if not isinstance(node, WikiNode):
1792 # This doesn't seem to ever happen in practice.
1793 wxr.wtp.debug(
1794 "{}: parse_sense_node called with"
1795 "something that isn't a WikiNode".format(pos),
1796 sortid="page/1287/20230119",
1797 )
1798 return False
1800 if node.kind != NodeKind.LIST_ITEM:
1801 wxr.wtp.debug(
1802 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1803 )
1804 return False
1806 if node.sarg == ":":
1807 # Skip example entries at the highest level, ones without
1808 # a sense ("...#") above them.
1809 # If node.sarg is exactly and only ":", then it's at
1810 # the highest level; lower levels would have more
1811 # "indentation", like "#:" or "##:"
1812 return False
1814 # If a recursion call succeeds in push_sense(), bubble it up with
1815 # `added`.
1816 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1817 added = False
1819 gloss_template_args: set[str] = set()
1821 # For LISTs and LIST_ITEMS, their argument is something like
1822 # "##" or "##:", and using that we can rudimentally determine
1823 # list 'depth' if need be, and also what kind of list or
1824 # entry it is; # is for normal glosses, : for examples (indent)
1825 # and * is used for quotations on wiktionary.
1826 current_depth = node.sarg
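    # For instance (illustrative wikitext, not tied to any particular entry):
    #   "# first gloss"       -> LIST_ITEM with sarg "#"
    #   "## subsense of it"   -> sarg "##"
    #   "#: usage example"    -> sarg "#:"
    #   "#* quotation"        -> sarg "#*"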
1828 children = node.children
1830 # subentries, (presumably) a list
1831 # of subglosses below this. The list's
1832 # argument ends with #, and its depth should
1833 # be bigger than parent node.
1834 subentries = [
1835 x
1836 for x in children
1837 if isinstance(x, WikiNode)
1838 and x.kind == NodeKind.LIST
1839 and x.sarg == current_depth + "#"
1840 ]
1842 # sublists of examples and quotations. .sarg
1843 # does not end with "#".
1844 others = [
1845 x
1846 for x in children
1847 if isinstance(x, WikiNode)
1848 and x.kind == NodeKind.LIST
1849 and x.sarg != current_depth + "#"
1850 ]
1852 # the actual contents of this particular node.
1853 # can be a gloss (or a template that expands into
1854 # many glosses which we can't easily pre-expand)
1855 # or could be an "outer gloss" with more specific
1856 # subglosses, or could be a qualfier for the subglosses.
1857 contents = [
1858 x
1859 for x in children
1860 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1861 ]
1862 # If this entry has sublists of entries, we should combine
1863 # gloss information from both the "outer" and sublist content.
1864 # Sometimes the outer gloss
1865 # is more of a non-gloss or a set of tags, sometimes it is a coarse sense
1866 # and the inner glosses are more specific. The outer one
1867 # does not seem to have qualifiers.
1869 # If we have one sublist with one element, treat it
1870 # specially as it may be a Wiktionary error; raise
1871 # that nested element to the same level.
1872 # XXX If need be, this block can be easily removed in
1873 # the current recursive logic, and the result is one sense entry
1874 # with both glosses in the glosses list, as you would
1875 # expect. If the higher entry has examples, there will
1876 # be a higher entry with some duplicated data.
1877 if len(subentries) == 1:
1878 slc = subentries[0].children
1879 if len(slc) == 1:
1880 # copy current node and modify it so it doesn't
1881 # loop infinitely.
1882 cropped_node = copy.copy(node)
1883 cropped_node.children = [
1884 x
1885 for x in children
1886 if not (
1887 isinstance(x, WikiNode)
1888 and x.kind == NodeKind.LIST
1889 and x.sarg == current_depth + "#"
1890 )
1891 ]
1892 added |= parse_sense_node(cropped_node, sense_base, pos)
1893 nonlocal sense_data # kludge: without this, raw_glosses
1894 # data gets duplicated;
1895 # if the top-level (cropped_node)
1896 # does not push_sense() properly or
1897 # parse_sense_node() returns early,
1898 # sense_data is not reset. This happens
1899 # for example when you have a no-gloss
1900 # string like "(intransitive)":
1901 # no gloss, push_sense() returns early
1902 # and sense_data has duplicate data with
1903 # sense_base
1904 sense_data = {}
1905 added |= parse_sense_node(slc[0], sense_base, pos)
1906 return added
1908 return process_gloss_contents(
1909 contents,
1910 pos,
1911 sense_base,
1912 subentries,
1913 others,
1914 gloss_template_args,
1915 added,
1916 my_ordinal,
1917 )
1919 def process_gloss_contents(
1920 contents: list[Union[str, WikiNode]],
1921 pos: str,
1922 sense_base: SenseData,
1923 subentries: list[WikiNode] = [],
1924 others: list[WikiNode] = [],
1925 gloss_template_args: Set[str] = set(),
1926 added: bool = False,
1927 sorting_ordinal: int | None = None,
1928 ) -> bool:
1929 def sense_template_fn(
1930 name: str, ht: TemplateArgs, is_gloss: bool = False
1931 ) -> Optional[str]:
1932 # print(f"sense_template_fn: {name}, {ht}")
1933 if name in wikipedia_templates:
1934 # parse_wikipedia_template(wxr, pos_data, ht)
1935 return None
1936 if is_panel_template(wxr, name):
1937 return ""
1938 if name in INFO_TEMPLATE_FUNCS:
1939 info_data, info_exp = parse_info_template_arguments(
1940 wxr, name, ht, "sense"
1941 )
1942 if info_data or info_exp: 1942 ↛ 1948line 1942 didn't jump to line 1948 because the condition on line 1942 was always true
1943 if info_data: 1943 ↛ 1945line 1943 didn't jump to line 1945 because the condition on line 1943 was always true
1944 data_append(sense_base, "info_templates", info_data)
1945 if info_exp and isinstance(info_exp, str): 1945 ↛ 1947line 1945 didn't jump to line 1947 because the condition on line 1945 was always true
1946 return info_exp
1947 return ""
1948 if name in ("defdate",):
1949 date = clean_node(wxr, None, ht.get(1, ()))
1950 if part_two := ht.get(2): 1950 ↛ 1952line 1950 didn't jump to line 1952 because the condition on line 1950 was never true
1951 # Unicode en dash, not '-'
1952 date += "–" + clean_node(wxr, None, part_two)
1953 refs: dict[str, ReferenceData] = {}
1954 # ref, refn, ref2, ref2n, ref3, ref3n
1955 # ref1 not valid
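# Illustrative sketch (editorial addition; the argument values are made up):
# for ht = {1: "early 19th c.", "ref": "OED", "refn": "note"} the loop below
# builds refs = {"": {"text": "OED", "refn": "note"}}, and the template ends
# up as an AttestationData entry with date="early 19th c." and that single
# reference.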
1956 for k, v in sorted(
1957 (k, v) for k, v in ht.items() if isinstance(k, str)
1958 ):
1959 if m := re.match(r"ref(\d?)(n?)", k): 1959 ↛ 1956line 1959 didn't jump to line 1956 because the condition on line 1959 was always true
1960 ref_v = clean_node(wxr, None, v)
1961 if m.group(1) not in refs: # empty string or digit
1962 refs[m.group(1)] = ReferenceData()
1963 if m.group(2):
1964 refs[m.group(1)]["refn"] = ref_v
1965 else:
1966 refs[m.group(1)]["text"] = ref_v
1967 data_append(
1968 sense_base,
1969 "attestations",
1970 AttestationData(date=date, references=list(refs.values())),
1971 )
1972 return ""
1973 if name == "senseid":
1974 langid = clean_node(wxr, None, ht.get(1, ()))
1975 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1976 if re.match(r"Q\d+$", arg):
1977 data_append(sense_base, "wikidata", arg)
1978 data_append(sense_base, "senseid", langid + ":" + arg)
1979 if name in sense_linkage_templates:
1980 # print(f"SENSE_TEMPLATE_FN: {name}")
1981 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1982 return ""
1983 if name == "†" or name == "zh-obsolete":
1984 data_append(sense_base, "tags", "obsolete")
1985 return ""
1986 if name in {
1987 "ux",
1988 "uxi",
1989 "usex",
1990 "afex",
1991 "prefixusex",
1992 "ko-usex",
1993 "ko-x",
1994 "hi-x",
1995 "ja-usex-inline",
1996 "ja-x",
1997 "quotei",
1998 "he-x",
1999 "hi-x",
2000 "km-x",
2001 "ne-x",
2002 "shn-x",
2003 "th-x",
2004 "ur-x",
2005 }:
2006 # Usage examples are captured separately below. We don't
2007 # want to expand them into glosses even when unusual coding
2008 # is used in the entry.
2009 # These templates may slip through inside another item, but
2010 # currently we're separating out example entries (..#:)
2011 # well enough that there seems to be very little contamination.
2012 if is_gloss:
2013 wxr.wtp.wiki_notice(
2014 "Example template is used for gloss text",
2015 sortid="extractor.en.page.sense_template_fn/1415",
2016 )
2017 else:
2018 return ""
2019 if name == "w": 2019 ↛ 2020line 2019 didn't jump to line 2020 because the condition on line 2019 was never true
2020 if ht.get(2) == "Wp":
2021 return ""
2022 for v in ht.values():
2023 v = v.strip()
2024 if v and "<" not in v:
2025 gloss_template_args.add(v)
2026 return None
2028 def extract_link_texts(item: GeneralNode) -> None:
2029 """Recursively extracts link texts from the gloss source. This
2030 information is used to select whether to remove final "." from
2031 form_of/alt_of (e.g., ihm/Hunsrik)."""
2032 if isinstance(item, (list, tuple)):
2033 for x in item:
2034 extract_link_texts(x)
2035 return
2036 if isinstance(item, str):
2037 # There seem to be HTML sections that may further contain
2038 # unparsed links.
2039 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2039 ↛ 2040line 2039 didn't jump to line 2040 because the loop on line 2039 never started
2040 print("ITER:", m.group(0))
2041 v = m.group(1).split("|")[-1].strip()
2042 if v:
2043 gloss_template_args.add(v)
2044 return
2045 if not isinstance(item, WikiNode): 2045 ↛ 2046line 2045 didn't jump to line 2046 because the condition on line 2045 was never true
2046 return
2047 if item.kind == NodeKind.LINK:
2048 v = item.largs[-1]
2049 if ( 2049 ↛ 2055line 2049 didn't jump to line 2055 because the condition on line 2049 was always true
2050 isinstance(v, list)
2051 and len(v) == 1
2052 and isinstance(v[0], str)
2053 ):
2054 gloss_template_args.add(v[0].strip())
2055 for x in item.children:
2056 extract_link_texts(x)
2058 extract_link_texts(contents)
2060 # Get the raw text of the non-list contents of this node; other stuff
2061 # like tag and category data is added to sense_base.
2062 # cast() is a no-op type assertion for the type-checker.
2063 partial_template_fn = cast(
2064 TemplateFnCallable,
2065 partial(sense_template_fn, is_gloss=True),
2066 )
2067 rawgloss = clean_node(
2068 wxr,
2069 sense_base,
2070 contents,
2071 template_fn=partial_template_fn,
2072 collect_links=True,
2073 )
2075 if not rawgloss: 2075 ↛ 2076line 2075 didn't jump to line 2076 because the condition on line 2075 was never true
2076 return False
2078 # remove manually typed ordered list text at the start ("1. ")
2079 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2081 # get stuff like synonyms and categories from "others",
2082 # maybe examples and quotations
2083 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2085 # The gloss could contain templates that produce more list items.
2086 # This happens commonly with, e.g., {{inflection of|...}}. Split
2087 # to parts. However, e.g. Interlingua generates multiple glosses
2088 # in HTML directly without Wikitext markup, so we must also split
2089 # by just newlines.
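# Illustrative sketch (editorial addition; the expanded text is hypothetical):
# a template expanding to "# nominative plural of foo\n# accusative plural of
# foo" yields two candidate glosses after splitlines(), and the leading "#"
# characters trigger the wikitext re-parse in the branch below.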
2090 subglosses = rawgloss.splitlines()
2092 if len(subglosses) == 0: 2092 ↛ 2093line 2092 didn't jump to line 2093 because the condition on line 2092 was never true
2093 return False
2095 if any(s.startswith("#") for s in subglosses):
2096 subtree = wxr.wtp.parse(rawgloss)
2097 # from wikitextprocessor.parser import print_tree
2098 # print("SUBTREE GENERATED BY TEMPLATE:")
2099 # print_tree(subtree)
2100 new_subentries = [
2101 x
2102 for x in subtree.children
2103 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2104 ]
2106 new_others = [
2107 x
2108 for x in subtree.children
2109 if isinstance(x, WikiNode)
2110 and x.kind == NodeKind.LIST
2111 and not x.sarg.endswith("#")
2112 ]
2114 new_contents = [
2115 clean_node(wxr, [], x)
2116 for x in subtree.children
2117 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2118 ]
2120 subentries = subentries or new_subentries
2121 others = others or new_others
2122 subglosses = new_contents
2123 rawgloss = "".join(subglosses)
2124 # Generate no gloss for translation hub pages, but add the
2125 # "translation-hub" tag for them
2126 if rawgloss == "(This entry is a translation hub.)": 2126 ↛ 2127line 2126 didn't jump to line 2127 because the condition on line 2126 was never true
2127 data_append(sense_data, "tags", "translation-hub")
2128 return push_sense(sorting_ordinal)
2130 # Remove certain substrings specific to outer glosses
2131 strip_ends = [", particularly:"]
2132 for x in strip_ends:
2133 if rawgloss.endswith(x):
2134 rawgloss = rawgloss[: -len(x)].strip()
2135 break
2137 # A single gloss, or possibly an outer gloss.
2138 # Check if the possible outer gloss starts with
2139 # parenthesized tags/topics
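# Illustrative sketch (editorial addition; the gloss is hypothetical): for a
# raw gloss such as
#   "(transitive, informal) to make a mess of"
# QUALIFIERS_RE (defined earlier in this module) is expected to match the
# leading "(transitive, informal)" part, which is stripped from rawgloss and
# handed to parse_sense_qualifier() for sense_base; the remainder
# "to make a mess of" stays as the gloss text.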
2141 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2142 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2143 m = QUALIFIERS_RE.match(rawgloss)
2144 # (...): ... or (...(...)...): ...
2145 if m:
2146 q = m.group(1)
2147 rawgloss = rawgloss[m.end() :].strip()
2148 parse_sense_qualifier(wxr, q, sense_base)
2149 if rawgloss == "A pejorative:": 2149 ↛ 2150line 2149 didn't jump to line 2150 because the condition on line 2149 was never true
2150 data_append(sense_base, "tags", "pejorative")
2151 rawgloss = ""
2152 elif rawgloss == "Short forms.": 2152 ↛ 2153line 2152 didn't jump to line 2153 because the condition on line 2152 was never true
2153 data_append(sense_base, "tags", "abbreviation")
2154 rawgloss = ""
2155 elif rawgloss == "Technical or specialized senses.": 2155 ↛ 2156line 2155 didn't jump to line 2156 because the condition on line 2155 was never true
2156 rawgloss = ""
2157 elif rawgloss.startswith("inflection of "):
2158 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2159 if parsed is not None: 2159 ↛ 2168line 2159 didn't jump to line 2168 because the condition on line 2159 was always true
2160 tags, origins = parsed
2161 if origins is not None: 2161 ↛ 2163line 2161 didn't jump to line 2163 because the condition on line 2161 was always true
2162 data_extend(sense_base, "form_of", origins)
2163 if tags is not None: 2163 ↛ 2166line 2163 didn't jump to line 2166 because the condition on line 2163 was always true
2164 data_extend(sense_base, "tags", tags)
2165 else:
2166 data_append(sense_base, "tags", "form-of")
2167 else:
2168 data_append(sense_base, "tags", "form-of")
2169 if rawgloss: 2169 ↛ 2200line 2169 didn't jump to line 2200 because the condition on line 2169 was always true
2170 # This code duplicates a lot of clean-up operations from later in
2171 # this block. We want to clean up the "supergloss" as much as
2172 # possible, in almost the same way as a normal gloss.
2173 supergloss = rawgloss
2175 if supergloss.startswith("; "): 2175 ↛ 2176line 2175 didn't jump to line 2176 because the condition on line 2175 was never true
2176 supergloss = supergloss[1:].strip()
2178 if supergloss.startswith(("^†", "†")):
2179 data_append(sense_base, "tags", "obsolete")
2180 supergloss = supergloss[2:].strip()
2181 elif supergloss.startswith("^‡"): 2181 ↛ 2182line 2181 didn't jump to line 2182 because the condition on line 2181 was never true
2182 data_extend(sense_base, "tags", ["obsolete", "historical"])
2183 supergloss = supergloss[2:].strip()
2185 # remove [14th century...] style brackets at the end
2186 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2188 if supergloss.startswith((",", ":")):
2189 supergloss = supergloss[1:]
2190 supergloss = supergloss.strip()
2191 if supergloss.startswith("N. of "): 2191 ↛ 2192line 2191 didn't jump to line 2192 because the condition on line 2191 was never true
2192 supergloss = "Name of " + supergloss[6:]
2193 supergloss = supergloss[2:]
2194 data_append(sense_base, "glosses", supergloss)
2195 if supergloss in ("A person:",):
2196 data_append(sense_base, "tags", "g-person")
2198 # The main recursive call (except for the exceptions at the
2199 # start of this function).
2200 for sublist in subentries:
2201 if not ( 2201 ↛ 2204line 2201 didn't jump to line 2204 because the condition on line 2201 was never true
2202 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2203 ):
2204 wxr.wtp.debug(
2205 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2206 f"with items that are not LISTs",
2207 sortid="page/1511/20230119",
2208 )
2209 continue
2210 for item in sublist.children:
2211 if not ( 2211 ↛ 2215line 2211 didn't jump to line 2215 because the condition on line 2211 was never true
2212 isinstance(item, WikiNode)
2213 and item.kind == NodeKind.LIST_ITEM
2214 ):
2215 continue
2216 # copy sense_base to prevent cross-contamination between
2217 # a subgloss, its sibling subglosses, and the supergloss
2218 sense_base2 = copy.deepcopy(sense_base)
2219 if parse_sense_node(item, sense_base2, pos): 2219 ↛ 2210line 2219 didn't jump to line 2210 because the condition on line 2219 was always true
2220 added = True
2222 # Capture examples.
2223 # This is called after the recursive calls above so that
2224 # sense_base is not contaminated with meta-data from
2225 # example entries for *this* gloss.
2226 examples = []
2227 if wxr.config.capture_examples: 2227 ↛ 2231line 2227 didn't jump to line 2231 because the condition on line 2227 was always true
2228 examples = extract_examples(others, sense_base)
2230 # push_sense() succeeded somewhere down-river, so skip this level
2231 if added:
2232 if examples:
2233 # this higher-up gloss has examples that we do not want to skip
2234 wxr.wtp.debug(
2235 "'{}[...]' gloss has examples we want to keep, "
2236 "but there are subglosses.".format(repr(rawgloss[:30])),
2237 sortid="page/1498/20230118",
2238 )
2239 else:
2240 return True
2242 # Some entries, e.g., "iacebam", have weird sentences in quotes
2243 # after the gloss, but these sentences don't seem to be intended
2244 # as glosses. Skip them.
2245 indexed_subglosses = list(
2246 (i, gl)
2247 for i, gl in enumerate(subglosses)
2248 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2249 )
2251 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2251 ↛ 2252line 2251 didn't jump to line 2252 because the condition on line 2251 was never true
2252 gl = indexed_subglosses[0][1].strip()
2253 if gl.endswith(":"):
2254 gl = gl[:-1].strip()
2255 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2256 if parsed is not None:
2257 infl_tags, infl_dts = parsed
2258 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2259 # Interpret others as a particular form under
2260 # "inflection of"
2261 data_extend(sense_base, "tags", infl_tags)
2262 data_extend(sense_base, "form_of", infl_dts)
2263 indexed_subglosses = indexed_subglosses[1:]
2264 elif not infl_dts:
2265 data_extend(sense_base, "tags", infl_tags)
2266 indexed_subglosses = indexed_subglosses[1:]
2268 # Create senses for remaining subglosses
2269 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2270 gloss = gloss.strip()
2271 if not gloss and len(indexed_subglosses) > 1: 2271 ↛ 2272line 2271 didn't jump to line 2272 because the condition on line 2271 was never true
2272 continue
2273 # Push a new sense (if the last one is not empty)
2274 if push_sense(sorting_ordinal): 2274 ↛ 2275line 2274 didn't jump to line 2275 because the condition on line 2274 was never true
2275 added = True
2276 # if gloss not in sense_data.get("raw_glosses", ()):
2277 # data_append(sense_data, "raw_glosses", gloss)
2278 if i == 0 and examples:
2279 # In a multi-line gloss, associate examples
2280 # with only one of them.
2281 # XXX or you could use gloss_i == len(indexed_subglosses)
2282 # to associate examples with the *last* one.
2283 data_extend(sense_data, "examples", examples)
2284 if gloss.startswith("; ") and gloss_i > 0: 2284 ↛ 2285line 2284 didn't jump to line 2285 because the condition on line 2284 was never true
2285 gloss = gloss[1:].strip()
2286 # If the gloss starts with †, mark as obsolete
2287 if gloss.startswith("^†"): 2287 ↛ 2288line 2287 didn't jump to line 2288 because the condition on line 2287 was never true
2288 data_append(sense_data, "tags", "obsolete")
2289 gloss = gloss[2:].strip()
2290 elif gloss.startswith("^‡"): 2290 ↛ 2291line 2290 didn't jump to line 2291 because the condition on line 2290 was never true
2291 data_extend(sense_data, "tags", ["obsolete", "historical"])
2292 gloss = gloss[2:].strip()
2293 # Copy data for all senses to this sense
2294 for k, v in sense_base.items():
2295 if isinstance(v, (list, tuple)):
2296 if k != "tags":
2297 # Tags handled below (countable/uncountable special)
2298 data_extend(sense_data, k, v)
2299 else:
2300 assert k not in ("tags", "categories", "topics")
2301 sense_data[k] = v # type:ignore[literal-required]
2302 # Parse the gloss for this particular sense
2303 m = QUALIFIERS_RE.match(gloss)
2304 # (...): ... or (...(...)...): ...
2305 if m:
2306 parse_sense_qualifier(wxr, m.group(1), sense_data)
2307 gloss = gloss[m.end() :].strip()
2309 # Remove common suffix "[from 14th c.]" and similar
2310 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2312 # Check to make sure we don't have unhandled list items in gloss
2313 ofs = max(gloss.find("#"), gloss.find("* "))
2314 if ofs > 10 and "(#)" not in gloss:
2315 wxr.wtp.debug(
2316 "gloss may contain unhandled list items: {}".format(gloss),
2317 sortid="page/1412",
2318 )
2319 elif "\n" in gloss: 2319 ↛ 2320line 2319 didn't jump to line 2320 because the condition on line 2319 was never true
2320 wxr.wtp.debug(
2321 "gloss contains newline: {}".format(gloss),
2322 sortid="page/1416",
2323 )
2325 # Kludge, some glosses have a comma after initial qualifiers in
2326 # parentheses
2327 if gloss.startswith((",", ":")):
2328 gloss = gloss[1:]
2329 gloss = gloss.strip()
2330 if gloss.endswith(":"):
2331 gloss = gloss[:-1].strip()
2332 if gloss.startswith("N. of "): 2332 ↛ 2333line 2332 didn't jump to line 2333 because the condition on line 2332 was never true
2333 gloss = "Name of " + gloss[6:]
2334 if gloss.startswith("†"): 2334 ↛ 2335line 2334 didn't jump to line 2335 because the condition on line 2334 was never true
2335 data_append(sense_data, "tags", "obsolete")
2336 gloss = gloss[1:]
2337 elif gloss.startswith("^†"): 2337 ↛ 2338line 2337 didn't jump to line 2338 because the condition on line 2337 was never true
2338 data_append(sense_data, "tags", "obsolete")
2339 gloss = gloss[2:]
2341 # Copy tags from sense_base if any. This will not copy
2342 # countable/uncountable if either was specified in the sense,
2343 # as sometimes both are specified in word head but only one
2344 # in individual senses.
2345 countability_tags = []
2346 base_tags = sense_base.get("tags", ())
2347 sense_tags = sense_data.get("tags", ())
2348 for tag in base_tags:
2349 if tag in ("countable", "uncountable"):
2350 if tag not in countability_tags: 2350 ↛ 2352line 2350 didn't jump to line 2352 because the condition on line 2350 was always true
2351 countability_tags.append(tag)
2352 continue
2353 if tag not in sense_tags:
2354 data_append(sense_data, "tags", tag)
2355 if countability_tags:
2356 if ( 2356 ↛ 2365line 2356 didn't jump to line 2365 because the condition on line 2356 was always true
2357 "countable" not in sense_tags
2358 and "uncountable" not in sense_tags
2359 ):
2360 data_extend(sense_data, "tags", countability_tags)
2362 # If outer gloss specifies a form-of ("inflection of", see
2363 # aquamarine/German), try to parse the inner glosses as
2364 # tags for an inflected form.
2365 if "form-of" in sense_base.get("tags", ()):
2366 parsed = parse_alt_or_inflection_of(
2367 wxr, gloss, gloss_template_args
2368 )
2369 if parsed is not None: 2369 ↛ 2375line 2369 didn't jump to line 2375 because the condition on line 2369 was always true
2370 infl_tags, infl_dts = parsed
2371 if not infl_dts and infl_tags: 2371 ↛ 2375line 2371 didn't jump to line 2375 because the condition on line 2371 was always true
2372 # Interpret as a particular form under "inflection of"
2373 data_extend(sense_data, "tags", infl_tags)
2375 if not gloss: 2375 ↛ 2376line 2375 didn't jump to line 2376 because the condition on line 2375 was never true
2376 data_append(sense_data, "tags", "empty-gloss")
2377 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2378 if ( 2378 ↛ 2389line 2378 didn't jump to line 2389 because the condition on line 2378 was always true
2379 gloss_i == 0
2380 and len(sense_data.get("glosses", tuple())) >= 1
2381 ):
2382 # If we added a "high-level gloss" from rawgloss, but this
2383 # is that same gloss_i, add this instead of the raw_gloss
2384 # from before if they're different: the rawgloss was not
2385 # cleaned exactly the same as this later gloss
2386 sense_data["glosses"][-1] = gloss
2387 else:
2388 # Add the gloss for the sense.
2389 data_append(sense_data, "glosses", gloss)
2391 # Kludge: there are cases (e.g., etc./Swedish) where there are
2392 # two abbreviations in the same sense, both generated by the
2393 # {{abbreviation of|...}} template. Handle these with some magic.
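# Illustrative sketch (editorial addition; the gloss is hypothetical): for
#   gloss = "Abbreviation of foo. Abbreviation of bar."
# the loop below produces
#   split_glosses = ["Abbreviation of foo. ", "Abbreviation of bar."]
# so each abbreviation is fed separately to parse_alt_or_inflection_of()
# further down.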
2394 position = 0
2395 split_glosses = []
2396 for m in re.finditer(r"Abbreviation of ", gloss):
2397 if m.start() != position: 2397 ↛ 2396line 2397 didn't jump to line 2396 because the condition on line 2397 was always true
2398 split_glosses.append(gloss[position : m.start()])
2399 position = m.start()
2400 split_glosses.append(gloss[position:])
2401 for gloss in split_glosses:
2402 # Check if this gloss describes an alt-of or inflection-of
2403 if (
2404 lang_code != "en"
2405 and " " not in gloss
2406 and distw([word], gloss) < 0.3
2407 ):
2408 # Don't try to parse gloss if it is one word
2409 # that is close to the word itself for non-English words
2410 # (probable translations of a tag/form name)
2411 continue
2412 parsed = parse_alt_or_inflection_of(
2413 wxr, gloss, gloss_template_args
2414 )
2415 if parsed is None:
2416 continue
2417 tags, dts = parsed
2418 if not dts and tags:
2419 data_extend(sense_data, "tags", tags)
2420 continue
2421 for dt in dts: # type:ignore[union-attr]
2422 ftags = list(tag for tag in tags if tag != "form-of")
2423 if "alt-of" in tags:
2424 data_extend(sense_data, "tags", ftags)
2425 data_append(sense_data, "alt_of", dt)
2426 elif "compound-of" in tags: 2426 ↛ 2427line 2426 didn't jump to line 2427 because the condition on line 2426 was never true
2427 data_extend(sense_data, "tags", ftags)
2428 data_append(sense_data, "compound_of", dt)
2429 elif "synonym-of" in tags: 2429 ↛ 2430line 2429 didn't jump to line 2430 because the condition on line 2429 was never true
2430 data_extend(dt, "tags", ftags)
2431 data_append(sense_data, "synonyms", dt)
2432 elif tags and dt.get("word", "").startswith("of "): 2432 ↛ 2433line 2432 didn't jump to line 2433 because the condition on line 2432 was never true
2433 dt["word"] = dt["word"][3:]
2434 data_append(sense_data, "tags", "form-of")
2435 data_extend(sense_data, "tags", ftags)
2436 data_append(sense_data, "form_of", dt)
2437 elif "form-of" in tags: 2437 ↛ 2421line 2437 didn't jump to line 2421 because the condition on line 2437 was always true
2438 data_extend(sense_data, "tags", tags)
2439 data_append(sense_data, "form_of", dt)
2441 if len(sense_data) == 0:
2442 if len(sense_base.get("tags", [])) == 0: 2442 ↛ 2444line 2442 didn't jump to line 2444 because the condition on line 2442 was always true
2443 del sense_base["tags"]
2444 sense_data.update(sense_base)
2445 if push_sense(sorting_ordinal): 2445 ↛ 2449line 2445 didn't jump to line 2449 because the condition on line 2445 was always true
2446 # push_sense succeeded in adding a sense to pos_data
2447 added = True
2448 # print("PARSE_SENSE DONE:", pos_datas[-1])
2449 return added
2451 def parse_inflection(
2452 node: WikiNode, section: str, pos: Optional[str]
2453 ) -> None:
2454 """Parses inflection data (declension, conjugation) from the given
2455 page. This retrieves the actual inflection template
2456 parameters, which are very useful for applications that need
2457 to learn the inflection classes and generate inflected
2458 forms."""
2459 assert isinstance(node, WikiNode)
2460 assert isinstance(section, str)
2461 assert pos is None or isinstance(pos, str)
2462 # print("parse_inflection:", node)
2464 if pos is None: 2464 ↛ 2465line 2464 didn't jump to line 2465 because the condition on line 2464 was never true
2465 wxr.wtp.debug(
2466 "inflection table outside part-of-speech", sortid="page/1812"
2467 )
2468 return
2470 def inflection_template_fn(
2471 name: str, ht: TemplateArgs
2472 ) -> Optional[str]:
2473 # print("decl_conj_template_fn", name, ht)
2474 if is_panel_template(wxr, name): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true
2475 return ""
2476 if name in ("is-u-mutation",): 2476 ↛ 2479line 2476 didn't jump to line 2479 because the condition on line 2476 was never true
2477 # These are not to be captured as an exception to the
2478 # generic code below
2479 return None
2480 m = re.search(
2481 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2482 r"declension|inflection|mut|mutation)($|-)",
2483 name,
2484 )
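# Illustrative examples (editorial addition; the template names are
# hypothetical but chosen so that they match the regex above): "fi-decl-valo",
# "de-conj-weak" and "sga-adecl" would all match, and their cleaned arguments
# would be recorded under "inflection_templates" below.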
2485 if m:
2486 args_ht = clean_template_args(wxr, ht)
2487 dt = {"name": name, "args": args_ht}
2488 data_append(pos_data, "inflection_templates", dt)
2490 return None
2492 # Convert the subtree back to Wikitext, then expand all and parse,
2493 # capturing templates in the process
2494 text = wxr.wtp.node_to_wikitext(node.children)
2496 # Split text into separate sections for each top-level template
2497 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2498 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2499 # The (?:...) creates a non-capturing regex group; if it was capturing,
2500 # like the group around it, it would create elements in brace_matches,
2501 # including None if it doesn't match.
2502 # 20250114: Added {| and |} into the regex because tables were being
2503 # cut into pieces by this code. Issue #973, introduction of two-part
2504 # book-end templates similar to trans-top and trans-bottom.
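# Illustrative sketch (editorial addition; the input text is hypothetical):
# on "intro {{foo|1=a}} outro" the split above yields
#   ['intro ', '{{', 'foo|1=a', '}}', ' outro']
# i.e. brace delimiters become their own list elements, which the loop below
# uses to track nesting and group each top-level template into its own
# section.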
2505 template_sections = []
2506 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2507 # Because there is the possibility of triple curly braces
2508 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2509 # count nesting depth using pairs of two brackets, but
2510 # instead use singular braces ("{ }").
2511 # Because template delimiters should be balanced, regardless
2512 # of whether {{ or {{{ is used, and because we only care
2513 # about the outer-most delimiters (the highest level template)
2514 # we can just count the single braces when those single
2515 # braces are part of a group.
2516 table_nesting = 0
2517 # However, a stray table ({| ... |}) should always be its own
2518 # section, and should prevent templates from cutting it
2519 # into sections.
2521 # print(f"Parse inflection: {text=}")
2522 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2523 if len(brace_matches) > 1:
2524 tsection: list[str] = []
2525 after_templates = False # kludge to keep any text
2526 # before first template
2527 # with the first template;
2528 # otherwise, text
2529 # goes with preceding template
2530 for m in brace_matches:
2531 if m.startswith("\n; ") and after_templates: 2531 ↛ 2532line 2531 didn't jump to line 2532 because the condition on line 2531 was never true
2532 after_templates = False
2533 template_sections.append(tsection)
2534 tsection = []
2535 tsection.append(m)
2536 elif m.startswith("{{") or m.endswith("{|"):
2537 if (
2538 template_nesting == 0
2539 and after_templates
2540 and table_nesting == 0
2541 ):
2542 template_sections.append(tsection)
2543 tsection = []
2544 # start new section
2545 after_templates = True
2546 if m.startswith("{{"):
2547 template_nesting += 1
2548 else:
2549 # m.endswith("{|")
2550 table_nesting += 1
2551 tsection.append(m)
2552 elif m.startswith("}}") or m.endswith("|}"):
2553 if m.startswith("}}"):
2554 template_nesting -= 1
2555 if template_nesting < 0: 2555 ↛ 2556line 2555 didn't jump to line 2556 because the condition on line 2555 was never true
2556 wxr.wtp.error(
2557 "Negatively nested braces, "
2558 "couldn't split inflection templates, "
2559 "{}/{} section {}".format(
2560 word, language, section
2561 ),
2562 sortid="page/1871",
2563 )
2564 template_sections = [] # use whole text
2565 break
2566 else:
2567 table_nesting -= 1
2568 if table_nesting < 0: 2568 ↛ 2569line 2568 didn't jump to line 2569 because the condition on line 2568 was never true
2569 wxr.wtp.error(
2570 "Negatively nested table braces, "
2571 "couldn't split inflection section, "
2572 "{}/{} section {}".format(
2573 word, language, section
2574 ),
2575 sortid="page/20250114",
2576 )
2577 template_sections = [] # use whole text
2578 break
2579 tsection.append(m)
2580 else:
2581 tsection.append(m)
2582 if tsection: # dangling tsection 2582 ↛ 2590line 2582 didn't jump to line 2590 because the condition on line 2582 was always true
2583 template_sections.append(tsection)
2584 # Why do it this way around? The parser has a preference
2585 # to associate bits outside of tables with the preceding
2586 # table (`after`-variable), so a new tsection begins
2587 # at {{ and everything before it belongs to the previous
2588 # template.
2590 texts = []
2591 if not template_sections:
2592 texts = [text]
2593 else:
2594 for tsection in template_sections:
2595 texts.append("".join(tsection))
2596 if template_nesting != 0: 2596 ↛ 2597line 2596 didn't jump to line 2597 because the condition on line 2596 was never true
2597 wxr.wtp.error(
2598 "Template nesting error: "
2599 "template_nesting = {} "
2600 "couldn't split inflection templates, "
2601 "{}/{} section {}".format(
2602 template_nesting, word, language, section
2603 ),
2604 sortid="page/1896",
2605 )
2606 texts = [text]
2607 for text in texts:
2608 tree = wxr.wtp.parse(
2609 text, expand_all=True, template_fn=inflection_template_fn
2610 )
2612 if not text.strip():
2613 continue
2615 # Parse inflection tables from the section. The data is stored
2616 # under "forms".
2617 if wxr.config.capture_inflections: 2617 ↛ 2607line 2617 didn't jump to line 2607 because the condition on line 2617 was always true
2618 tablecontext = None
2619 m = re.search(r"{{([^}{|]+)\|?", text)
2620 if m:
2621 template_name = m.group(1)
2622 tablecontext = TableContext(template_name)
2624 parse_inflection_section(
2625 wxr,
2626 pos_data,
2627 word,
2628 language,
2629 pos,
2630 section,
2631 tree,
2632 tablecontext=tablecontext,
2633 )
2635 def get_subpage_section(
2636 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2637 ) -> Optional[Union[WikiNode, str]]:
2638 """Loads a subpage of the given page, and finds the section
2639 for the given language, part-of-speech, and section title. This
2640 is used for finding translations and other sections on subpages."""
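# Illustrative call (editorial addition; the heading sequence is
# hypothetical): for a word whose translations live on a "/translations"
# subpage, a typical invocation looks like
#   get_subpage_section(title, "translations",
#                       [["English", "Etymology 1", "Noun", "Translations"]])
# and recurse() below walks the parsed subpage one level heading at a time,
# matching each title in the sequence case-insensitively.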
2641 assert isinstance(language, str)
2642 assert isinstance(title, str)
2643 assert isinstance(subtitle, str)
2644 assert isinstance(seqs, (list, tuple))
2645 for seq in seqs:
2646 for x in seq:
2647 assert isinstance(x, str)
2648 subpage_title = word + "/" + subtitle
2649 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2650 if subpage_content is None:
2651 wxr.wtp.error(
2652 "/translations not found despite "
2653 "{{see translation subpage|...}}",
2654 sortid="page/1934",
2655 )
2656 return None
2658 def recurse(
2659 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2660 ) -> Optional[Union[str, WikiNode]]:
2661 # print(f"seq: {seq}")
2662 if not seq:
2663 return node
2664 if not isinstance(node, WikiNode):
2665 return None
2666 # print(f"node.kind: {node.kind}")
2667 if node.kind in LEVEL_KINDS:
2668 t = clean_node(wxr, None, node.largs[0])
2669 # print(f"t: {t} == seq[0]: {seq[0]}?")
2670 if t.lower() == seq[0].lower():
2671 seq = seq[1:]
2672 if not seq:
2673 return node
2674 for n in node.children:
2675 ret = recurse(n, seq)
2676 if ret is not None:
2677 return ret
2678 return None
2680 tree = wxr.wtp.parse(
2681 subpage_content,
2682 pre_expand=True,
2683 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2684 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2685 )
2686 assert tree.kind == NodeKind.ROOT
2687 for seq in seqs:
2688 ret = recurse(tree, seq)
2689 if ret is None:
2690 wxr.wtp.debug(
2691 "Failed to find subpage section {}/{} seq {}".format(
2692 title, subtitle, seq
2693 ),
2694 sortid="page/1963",
2695 )
2696 return ret
2698 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2699 """Parses translations for a word. This may also pull in translations
2700 from separate translation subpages."""
2701 assert isinstance(data, dict)
2702 assert isinstance(xlatnode, WikiNode)
2703 # print("===== PARSE_TRANSLATIONS {} {} {}"
2704 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2705 # print("parse_translations xlatnode={}".format(xlatnode))
2706 if not wxr.config.capture_translations: 2706 ↛ 2707line 2706 didn't jump to line 2707 because the condition on line 2706 was never true
2707 return
2708 sense_parts: list[Union[WikiNode, str]] = []
2709 sense: Optional[str] = None
2711 def parse_translation_item(
2712 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2713 ) -> None:
2714 nonlocal sense
2715 assert isinstance(contents, list)
2716 assert lang is None or isinstance(lang, str)
2717 # print("PARSE_TRANSLATION_ITEM:", contents)
2719 langcode: Optional[str] = None
2720 if sense is None:
2721 sense = clean_node(wxr, data, sense_parts).strip()
2722 # print("sense <- clean_node: ", sense)
2723 idx = sense.find("See also translations at")
2724 if idx > 0: 2724 ↛ 2725line 2724 didn't jump to line 2725 because the condition on line 2724 was never true
2725 wxr.wtp.debug(
2726 "Skipping translation see also: {}".format(sense),
2727 sortid="page/2361",
2728 )
2729 sense = sense[:idx].strip()
2730 if sense.endswith(":"): 2730 ↛ 2731line 2730 didn't jump to line 2731 because the condition on line 2730 was never true
2731 sense = sense[:-1].strip()
2732 if sense.endswith("—"): 2732 ↛ 2733line 2732 didn't jump to line 2733 because the condition on line 2732 was never true
2733 sense = sense[:-1].strip()
2734 translations_from_template: list[str] = []
2736 def translation_item_template_fn(
2737 name: str, ht: TemplateArgs
2738 ) -> Optional[str]:
2739 nonlocal langcode
2740 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2741 if is_panel_template(wxr, name):
2742 return ""
2743 if name in ("t+check", "t-check", "t-needed"):
2744 # We ignore these templates. They seem to have outright
2745 # garbage in some entries, and widely varying formatting in
2746 # others. These should be transitory and unreliable
2747 # anyway.
2748 return "__IGNORE__"
2749 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2750 code = ht.get(1)
2751 if code: 2751 ↛ 2761line 2751 didn't jump to line 2761 because the condition on line 2751 was always true
2752 if langcode and code != langcode:
2753 wxr.wtp.debug(
2754 "inconsistent language codes {} vs "
2755 "{} in translation item: {!r} {}".format(
2756 langcode, code, name, ht
2757 ),
2758 sortid="page/2386",
2759 )
2760 langcode = code
2761 tr = ht.get(2)
2762 if tr:
2763 tr = clean_node(wxr, None, [tr])
2764 translations_from_template.append(tr)
2765 return None
2766 if name == "t-egy":
2767 langcode = "egy"
2768 return None
2769 if name == "ttbc":
2770 code = ht.get(1)
2771 if code: 2771 ↛ 2773line 2771 didn't jump to line 2773 because the condition on line 2771 was always true
2772 langcode = code
2773 return None
2774 if name == "trans-see": 2774 ↛ 2775line 2774 didn't jump to line 2775 because the condition on line 2774 was never true
2775 wxr.wtp.error(
2776 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2777 )
2778 return ""
2779 if name.endswith("-top"): 2779 ↛ 2780line 2779 didn't jump to line 2780 because the condition on line 2779 was never true
2780 return ""
2781 if name.endswith("-bottom"): 2781 ↛ 2782line 2781 didn't jump to line 2782 because the condition on line 2781 was never true
2782 return ""
2783 if name.endswith("-mid"): 2783 ↛ 2784line 2783 didn't jump to line 2784 because the condition on line 2783 was never true
2784 return ""
2785 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2786 # .format(name),
2787 # sortid="page/2414")
2788 return None
2790 sublists = list(
2791 x
2792 for x in contents
2793 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2794 )
2795 contents = list(
2796 x
2797 for x in contents
2798 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2799 )
2801 item = clean_node(
2802 wxr, data, contents, template_fn=translation_item_template_fn
2803 )
2804 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2806 # Parse the translation item.
2807 if item: 2807 ↛ exitline 2807 didn't return from function 'parse_translation_item' because the condition on line 2807 was always true
2808 lang = parse_translation_item_text(
2809 wxr,
2810 word,
2811 data,
2812 item,
2813 sense,
2814 lang,
2815 langcode,
2816 translations_from_template,
2817 is_reconstruction,
2818 )
2820 # Handle sublists. They are frequently used for different
2821 # scripts for the language and different variants of the
2822 # language. We will include the lower-level header as a
2823 # tag in those cases.
2824 for listnode in sublists:
2825 assert listnode.kind == NodeKind.LIST
2826 for node in listnode.children:
2827 if not isinstance(node, WikiNode): 2827 ↛ 2828line 2827 didn't jump to line 2828 because the condition on line 2827 was never true
2828 continue
2829 if node.kind == NodeKind.LIST_ITEM: 2829 ↛ 2826line 2829 didn't jump to line 2826 because the condition on line 2829 was always true
2830 parse_translation_item(node.children, lang=lang)
2832 def parse_translation_template(node: WikiNode) -> None:
2833 assert isinstance(node, WikiNode)
2835 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2836 nonlocal sense_parts
2837 nonlocal sense
2838 if is_panel_template(wxr, name):
2839 return ""
2840 if name == "see also":
2841 # XXX capture
2842 # XXX for example, "/" has top-level list containing
2843 # see also items. So also should parse those.
2844 return ""
2845 if name == "trans-see":
2846 # XXX capture
2847 return ""
2848 if name == "see translation subpage": 2848 ↛ 2849line 2848 didn't jump to line 2849 because the condition on line 2848 was never true
2849 sense_parts = []
2850 sense = None
2851 sub = ht.get(1, "")
2852 if sub:
2853 m = re.match(
2854 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2855 )
2856 else:
2857 m = None
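# Illustrative sketch (editorial addition; the argument is hypothetical):
# for sub = "Etymology 2: Noun" the regex above matches with groups
# ("Etymology 2", "Etymology ", "Noun"), so below etym_numbered becomes
# "Etymology 2", etym "Etymology " (stripped later) and pos "Noun", which
# select the corresponding subpage section sequence.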
2858 etym = ""
2859 etym_numbered = ""
2860 pos = ""
2861 if m:
2862 etym_numbered = m.group(1)
2863 etym = m.group(2)
2864 pos = m.group(3)
2865 if not sub:
2866 wxr.wtp.debug(
2867 "no part-of-speech in "
2868 "{{see translation subpage|...}}, "
2869 "defaulting to just wxr.wtp.section "
2870 "(= language)",
2871 sortid="page/2468",
2872 )
2873 # seq sent to get_subpage_section without sub and pos
2874 seq = [
2875 language,
2876 TRANSLATIONS_TITLE,
2877 ]
2878 elif (
2879 m
2880 and etym.lower().strip() in ETYMOLOGY_TITLES
2881 and pos.lower() in POS_TITLES
2882 ):
2883 seq = [
2884 language,
2885 etym_numbered,
2886 pos,
2887 TRANSLATIONS_TITLE,
2888 ]
2889 elif sub.lower() in POS_TITLES:
2890 # seq with sub but not pos
2891 seq = [
2892 language,
2893 sub,
2894 TRANSLATIONS_TITLE,
2895 ]
2896 else:
2897 # seq with sub and pos
2898 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2899 if pos.lower() not in POS_TITLES:
2900 wxr.wtp.debug(
2901 "unhandled see translation subpage: "
2902 "language={} sub={} "
2903 "wxr.wtp.subsection={}".format(
2904 language, sub, wxr.wtp.subsection
2905 ),
2906 sortid="page/2478",
2907 )
2908 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2909 subnode = get_subpage_section(
2910 wxr.wtp.title or "MISSING_TITLE",
2911 TRANSLATIONS_TITLE,
2912 [seq],
2913 )
2914 if subnode is None or not isinstance(subnode, WikiNode):
2915 # Failed to find the normal subpage section
2916 # seq with sub and pos
2917 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2918 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2919 seqs: list[list[str] | tuple[str, ...]] = [
2920 [TRANSLATIONS_TITLE],
2921 [language, pos],
2922 ]
2923 subnode = get_subpage_section(
2924 wxr.wtp.title or "MISSING_TITLE",
2925 TRANSLATIONS_TITLE,
2926 seqs,
2927 )
2928 if subnode is not None and isinstance(subnode, WikiNode):
2929 parse_translations(data, subnode)
2930 return ""
2931 if name in (
2932 "c",
2933 "C",
2934 "categorize",
2935 "cat",
2936 "catlangname",
2937 "topics",
2938 "top",
2939 "qualifier",
2940 "cln",
2941 ):
2942 # These are expanded in the default way
2943 return None
2944 if name in (
2945 "trans-top",
2946 "trans-top-see",
2947 ):
2948 # XXX capture id from trans-top? Capture sense here
2949 # instead of trying to parse it from expanded content?
2950 if ht.get(1):
2951 sense_parts = []
2952 sense = ht.get(1)
2953 else:
2954 sense_parts = []
2955 sense = None
2956 return None
2957 if name in (
2958 "trans-bottom",
2959 "trans-mid",
2960 "checktrans-mid",
2961 "checktrans-bottom",
2962 ):
2963 return None
2964 if name == "checktrans-top":
2965 sense_parts = []
2966 sense = None
2967 return ""
2968 if name == "trans-top-also":
2969 # XXX capture?
2970 sense_parts = []
2971 sense = None
2972 return ""
2973 wxr.wtp.error(
2974 "UNIMPLEMENTED parse_translation_template: {} {}".format(
2975 name, ht
2976 ),
2977 sortid="page/2517",
2978 )
2979 return ""
2981 wxr.wtp.expand(
2982 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
2983 )
2985 def parse_translation_recurse(xlatnode: WikiNode) -> None:
2986 nonlocal sense
2987 nonlocal sense_parts
2988 for node in xlatnode.children:
2989 # print(node)
2990 if isinstance(node, str):
2991 if sense:
2992 if not node.isspace():
2993 wxr.wtp.debug(
2994 "skipping string in the middle of "
2995 "translations: {}".format(node),
2996 sortid="page/2530",
2997 )
2998 continue
2999 # Add a part to the sense
3000 sense_parts.append(node)
3001 sense = None
3002 continue
3003 assert isinstance(node, WikiNode)
3004 kind = node.kind
3005 if kind == NodeKind.LIST:
3006 for item in node.children:
3007 if not isinstance(item, WikiNode): 3007 ↛ 3008line 3007 didn't jump to line 3008 because the condition on line 3007 was never true
3008 continue
3009 if item.kind != NodeKind.LIST_ITEM: 3009 ↛ 3010line 3009 didn't jump to line 3010 because the condition on line 3009 was never true
3010 continue
3011 if item.sarg == ":": 3011 ↛ 3012line 3011 didn't jump to line 3012 because the condition on line 3011 was never true
3012 continue
3013 parse_translation_item(item.children)
3014 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3014 ↛ 3018line 3014 didn't jump to line 3018 because the condition on line 3014 was never true
3015 # Silently skip list items that are just indented; these
3016 # are used for text between translations, such as indicating
3017 # translations that need to be checked.
3018 pass
3019 elif kind == NodeKind.TEMPLATE:
3020 parse_translation_template(node)
3021 elif kind in ( 3021 ↛ 3026line 3021 didn't jump to line 3026 because the condition on line 3021 was never true
3022 NodeKind.TABLE,
3023 NodeKind.TABLE_ROW,
3024 NodeKind.TABLE_CELL,
3025 ):
3026 parse_translation_recurse(node)
3027 elif kind == NodeKind.HTML:
3028 if node.attrs.get("class") == "NavFrame": 3028 ↛ 3034line 3028 didn't jump to line 3034 because the condition on line 3028 was never true
3029 # Reset ``sense_parts`` (and force recomputing
3030 # by clearing ``sense``) as each NavFrame specifies
3031 # its own sense. This helps eliminate garbage coming
3032 # from text at the beginning of the translations
3033 # section.
3034 sense_parts = []
3035 sense = None
3036 # for item in node.children:
3037 # if not isinstance(item, WikiNode):
3038 # continue
3039 # parse_translation_recurse(item)
3040 parse_translation_recurse(node)
3041 elif kind in LEVEL_KINDS: 3041 ↛ 3043line 3041 didn't jump to line 3043 because the condition on line 3041 was never true
3042 # Sub-levels will be recursed elsewhere
3043 pass
3044 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3045 parse_translation_recurse(node)
3046 elif kind == NodeKind.PREFORMATTED: 3046 ↛ 3047line 3046 didn't jump to line 3047 because the condition on line 3046 was never true
3047 print("parse_translation_recurse: PREFORMATTED:", node)
3048 elif kind == NodeKind.LINK: 3048 ↛ 3102line 3048 didn't jump to line 3102 because the condition on line 3048 was always true
3049 arg0 = node.largs[0]
3050 # Kludge: I've seen occasional normal links to translation
3051 # subpages from main pages (e.g., language/English/Noun
3052 # in July 2021) instead of the normal
3053 # {{see translation subpage|...}} template. This should
3054 # handle them. Note: must be careful not to read other
3055 # links, particularly things like in "human being":
3056 # "a human being -- see [[man/translations]]" (group title)
3057 if ( 3057 ↛ 3065line 3057 didn't jump to line 3065 because the condition on line 3057 was never true
3058 isinstance(arg0, (list, tuple))
3059 and arg0
3060 and isinstance(arg0[0], str)
3061 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3062 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3063 == wxr.wtp.title
3064 ):
3065 wxr.wtp.debug(
3066 "translations subpage link found on main "
3067 "page instead "
3068 "of normal {{see translation subpage|...}}",
3069 sortid="page/2595",
3070 )
3071 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3072 if sub.lower() in POS_TITLES:
3073 seq = [
3074 language,
3075 sub,
3076 TRANSLATIONS_TITLE,
3077 ]
3078 subnode = get_subpage_section(
3079 wxr.wtp.title,
3080 TRANSLATIONS_TITLE,
3081 [seq],
3082 )
3083 if subnode is not None and isinstance(
3084 subnode, WikiNode
3085 ):
3086 parse_translations(data, subnode)
3087 else:
3088 wxr.wtp.error(
3089 "/translations link outside part-of-speech"
3090 )
3092 if (
3093 len(arg0) >= 1
3094 and isinstance(arg0[0], str)
3095 and not arg0[0].lower().startswith("category:")
3096 ):
3097 for x in node.largs[-1]:
3098 if isinstance(x, str): 3098 ↛ 3101line 3098 didn't jump to line 3101 because the condition on line 3098 was always true
3099 sense_parts.append(x)
3100 else:
3101 parse_translation_recurse(x)
3102 elif not sense:
3103 sense_parts.append(node)
3104 else:
3105 wxr.wtp.debug(
3106 "skipping text between translation items/senses: "
3107 "{}".format(node),
3108 sortid="page/2621",
3109 )
3111 # Main code of parse_translations(). We want ``sense`` to be assigned
3112 # regardless of recursion levels, and thus the code is structured
3113 # to define it at this level and recurse in parse_translation_recurse().
3114 parse_translation_recurse(xlatnode)
3116 def parse_etymology(data: WordData, node: LevelNode) -> None:
3117 """Parses an etymology section."""
3118 assert isinstance(data, dict)
3119 assert isinstance(node, WikiNode)
3121 templates: list[TemplateData] = []
3123 # Counter for preventing the capture of etymology templates
3124 # when we are inside templates that we want to ignore (i.e.,
3125 # not capture).
3126 ignore_count = 0
3128 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3129 nonlocal ignore_count
3130 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3131 return ""
3132 if re.match(ignored_etymology_templates_re, name):
3133 ignore_count += 1
3134 return None
3136 # CONTINUE_HERE
3138 def etym_post_template_fn(
3139 name: str, ht: TemplateArgs, expansion: str
3140 ) -> None:
3141 nonlocal ignore_count
3142 if name in wikipedia_templates:
3143 parse_wikipedia_template(wxr, data, ht)
3144 return None
3145 if re.match(ignored_etymology_templates_re, name):
3146 ignore_count -= 1
3147 return None
3148 if ignore_count == 0: 3148 ↛ 3154line 3148 didn't jump to line 3154 because the condition on line 3148 was always true
3149 ht = clean_template_args(wxr, ht)
3150 expansion = clean_node(wxr, None, expansion)
3151 templates.append(
3152 {"name": name, "args": ht, "expansion": expansion}
3153 )
3154 return None
3156 # Remove any subsections
3157 contents = list(
3158 x
3159 for x in node.children
3160 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3161 )
3162 # Convert to text, also capturing templates using post_template_fn
3163 text = clean_node(
3164 wxr,
3165 None,
3166 contents,
3167 template_fn=etym_template_fn,
3168 post_template_fn=etym_post_template_fn,
3169 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3170 # Save the collected information.
3171 if len(text) > 0:
3172 data["etymology_text"] = text
3173 if len(templates) > 0:
3174 # Some etymology templates, like Template:root, do not generate
3175 # text, so they should be added here. Elsewhere, we check
3176 # for Template:root and add some text to the expansion to please
3177 # the validation.
3178 data["etymology_templates"] = templates
3180 for child_node in node.find_child_recursively( 3180 ↛ exitline 3180 didn't return from function 'parse_etymology' because the loop on line 3180 didn't complete
3181 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3182 ):
3183 if child_node.kind in LEVEL_KIND_FLAGS:
3184 break
3185 elif isinstance( 3185 ↛ 3188line 3185 didn't jump to line 3188 because the condition on line 3185 was never true
3186 child_node, TemplateNode
3187 ) and child_node.template_name in ["zh-x", "zh-q"]:
3188 if "etymology_examples" not in data:
3189 data["etymology_examples"] = []
3190 data["etymology_examples"].extend(
3191 extract_template_zh_x(
3192 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3193 )
3194 )
3196 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3197 """This recurses into a subtree in the parse tree for a page."""
3198 nonlocal etym_data
3199 nonlocal pos_data
3200 nonlocal inside_level_four
3202 redirect_list: list[str] = [] # for `zh-see` template
3204 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3205 """This is called for otherwise unprocessed parts of the page.
3206 We still expand them so that e.g. Category links get captured."""
3207 if name in wikipedia_templates:
3208 data = select_data()
3209 parse_wikipedia_template(wxr, data, ht)
3210 return None
3211 if is_panel_template(wxr, name):
3212 return ""
3213 return None
3215 for node in treenode.children:
3216 if not isinstance(node, WikiNode):
3217 # print(" X{}".format(repr(node)[:40]))
3218 continue
3219 if isinstance(node, TemplateNode):
3220 if process_soft_redirect_template(wxr, node, redirect_list):
3221 continue
3222 elif node.template_name == "zh-forms":
3223 extract_zh_forms_template(wxr, node, select_data())
3224 elif (
3225 node.template_name.endswith("-kanjitab")
3226 or node.template_name == "ja-kt"
3227 ):
3228 extract_ja_kanjitab_template(wxr, node, select_data())
3230 if not isinstance(node, LevelNode):
3231 # XXX handle e.g. wikipedia links at the top of a language
3232 # XXX should at least capture "also" at top of page
3233 if node.kind in (
3234 NodeKind.HLINE,
3235 NodeKind.LIST,
3236 NodeKind.LIST_ITEM,
3237 ):
3238 continue
3239 # print(" UNEXPECTED: {}".format(node))
3240 # Clean the node to collect category links
3241 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3242 continue
3243 t = clean_node(
3244 wxr, etym_data, node.sarg if node.sarg else node.largs
3245 )
3246 t = t.lower()
3247 # XXX these counts were never implemented fully, and even this
3248 # gets discarded: Search STATISTICS_IMPLEMENTATION
3249 wxr.config.section_counts[t] += 1
3250 # print("PROCESS_CHILDREN: T:", repr(t))
3251 if t in IGNORED_TITLES:
3252 pass
3253 elif t.startswith(PRONUNCIATION_TITLE):
3254 # Chinese Pronunciation section kludge; we demote these to
3255 # be level 4 instead of 3 so that they're part of a larger
3256 # etymology hierarchy; usually the data here is empty and
3257 # acts as an intermediate level between POS and Etymology data
3258 if lang_code in ("zh",):
3259 inside_level_four = True
3260 if t.startswith(PRONUNCIATION_TITLE + " "):
3261 # Pronunciation 1, etc, are used in Chinese Glyphs,
3262 # and each of them may have senses under Definition
3263 push_level_four_section(True)
3264 wxr.wtp.start_subsection(None)
3265 if wxr.config.capture_pronunciation: 3265 ↛ 3373line 3265 didn't jump to line 3373 because the condition on line 3265 was always true
3266 data = select_data()
3267 parse_pronunciation(
3268 wxr,
3269 node,
3270 data,
3271 etym_data,
3272 have_etym,
3273 base_data,
3274 lang_code,
3275 )
3276 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3277 push_etym()
3278 wxr.wtp.start_subsection(None)
3279 if wxr.config.capture_etymologies: 3279 ↛ 3373line 3279 didn't jump to line 3373 because the condition on line 3279 was always true
3280 m = re.search(r"\s(\d+)$", t)
3281 if m:
3282 etym_data["etymology_number"] = int(m.group(1))
3283 parse_etymology(etym_data, node)
3284 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3285 data = select_data()
3286 extract_descendant_section(wxr, data, node, False)
3287 elif (
3288 t in PROTO_ROOT_DERIVED_TITLES
3289 and pos == "root"
3290 and is_reconstruction
3291 and wxr.config.capture_descendants
3292 ):
3293 data = select_data()
3294 extract_descendant_section(wxr, data, node, True)
3295 elif t == TRANSLATIONS_TITLE:
3296 data = select_data()
3297 parse_translations(data, node)
3298 elif t in INFLECTION_TITLES:
3299 parse_inflection(node, t, pos)
3300 elif t == "alternative forms":
3301 extract_alt_form_section(wxr, select_data(), node)
3302 else:
3303 lst = t.split()
3304 while len(lst) > 1 and lst[-1].isdigit(): 3304 ↛ 3305line 3304 didn't jump to line 3305 because the condition on line 3304 was never true
3305 lst = lst[:-1]
3306 t_no_number = " ".join(lst).lower()
3307 if t_no_number in POS_TITLES:
3308 push_pos()
3309 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3310 pos = dt["pos"] or "MISSING_POS"
3311 wxr.wtp.start_subsection(t)
3312 if "debug" in dt:
3313 wxr.wtp.debug(
3314 "{} in section {}".format(dt["debug"], t),
3315 sortid="page/2755",
3316 )
3317 if "warning" in dt: 3317 ↛ 3318line 3317 didn't jump to line 3318 because the condition on line 3317 was never true
3318 wxr.wtp.wiki_notice(
3319 "{} in section {}".format(dt["warning"], t),
3320 sortid="page/2759",
3321 )
3322 if "error" in dt: 3322 ↛ 3323line 3322 didn't jump to line 3323 because the condition on line 3322 was never true
3323 wxr.wtp.error(
3324 "{} in section {}".format(dt["error"], t),
3325 sortid="page/2763",
3326 )
3327 if "note" in dt: 3327 ↛ 3328line 3327 didn't jump to line 3328 because the condition on line 3327 was never true
3328 wxr.wtp.note(
3329 "{} in section {}".format(dt["note"], t),
3330 sortid="page/20251017a",
3331 )
3332 if "wiki_notice" in dt:
3333 wxr.wtp.wiki_notice(
3334 "{} in section {}".format(dt["wiki_notice"], t),
3335 sortid="page/20251017b",
3336 )
3337 # Parse word senses for the part-of-speech
3338 parse_part_of_speech(node, pos)
3339 if "tags" in dt:
3340 for pdata in sense_datas:
3341 data_extend(pdata, "tags", dt["tags"])
3342 elif t_no_number in LINKAGE_TITLES:
3343 # print(f"LINKAGE_TITLES NODE {node=}")
3344 rel = LINKAGE_TITLES[t_no_number]
3345 data = select_data()
3346 parse_linkage(
3347 wxr,
3348 data,
3349 rel,
3350 node,
3351 word,
3352 sense_datas,
3353 is_reconstruction,
3354 )
3355 elif t_no_number == COMPOUNDS_TITLE:
3356 data = select_data()
3357 if wxr.config.capture_compounds:
3358 parse_linkage(
3359 wxr,
3360 data,
3361 "derived",
3362 node,
3363 word,
3364 sense_datas,
3365 is_reconstruction,
3366 )
3368 # XXX parse interesting templates also from other sections. E.g.,
3369 # {{Letter|...}} in ===See also===
3370 # Also <gallery>
3372 # Recurse to children of this node, processing subtitles therein
3373 stack.append(t)
3374 process_children(node, pos)
3375 stack.pop()
3377 if len(redirect_list) > 0:
3378 if len(pos_data) > 0:
3379 pos_data["redirects"] = redirect_list
3380 if "pos" not in pos_data:
3381 pos_data["pos"] = "soft-redirect"
3382 else:
3383 new_page_data = copy.deepcopy(base_data)
3384 new_page_data["redirects"] = redirect_list
3385 if "pos" not in new_page_data:
3386 new_page_data["pos"] = "soft-redirect"
3387 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3388 page_datas.append(new_page_data)
3390 def extract_examples(
3391 others: list[WikiNode], sense_base: SenseData
3392 ) -> list[ExampleData]:
3393 """Parses through a list of definitions and quotes to find examples.
3394 Returns a list of example dicts to be added to sense data. Adds
3395 meta-data, mostly categories, into sense_base."""
3396 assert isinstance(others, list)
3397 examples: list[ExampleData] = []
3399 for sub in others:
3400 if not sub.sarg.endswith((":", "*")):
3401 continue
3402 for item in sub.children:
3403 if not isinstance(item, WikiNode):
3404 continue
3405 if item.kind != NodeKind.LIST_ITEM:
3406 continue
3407 usex_type = None
3408 example_template_args = []
3409 example_template_names = []
3410 taxons = set()
3412 # Bypass the rest of this function when the item uses Chinese,
3413 # Japanese, or quotation templates; the call below handles those.
3414 new_example_lists = extract_example_list_item(
3415 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3416 )
3417 if len(new_example_lists) > 0:
3418 examples.extend(new_example_lists)
3419 continue
3421 def usex_template_fn(
3422 name: str, ht: TemplateArgs
3423 ) -> Optional[str]:
3424 nonlocal usex_type
3425 if is_panel_template(wxr, name):
3426 return ""
3427 if name in usex_templates:
3428 usex_type = "example"
3429 example_template_args.append(ht)
3430 example_template_names.append(name)
3431 elif name in quotation_templates:
3432 usex_type = "quotation"
3433 elif name in taxonomy_templates:
3434 taxons.update(ht.get(1, "").split())
3435 for prefix in template_linkages_to_ignore_in_examples:
3436 if re.search(
3437 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3438 ):
3439 return ""
3440 return None
3442 # bookmark
3443 ruby: list[tuple[str, str]] = []
3444 contents = item.children
3445 if lang_code == "ja":
3446 # Capture ruby contents if this is a Japanese language
3447 # example.
3448 # print(contents)
3449 if (
3450 contents
3451 and isinstance(contents[0], str)
3452 and re.match(r"\s*$", contents[0])
3453 ):
3454 contents = contents[1:]
3455 exp = wxr.wtp.parse(
3456 wxr.wtp.node_to_wikitext(contents),
3457 # post_template_fn=head_post_template_fn,
3458 expand_all=True,
3459 )
3460 rub, rest = extract_ruby(wxr, exp.children)
3461 if rub:
3462 for rtup in rub:
3463 ruby.append(rtup)
3464 contents = rest
3465 subtext = clean_node(
3466 wxr, sense_base, contents, template_fn=usex_template_fn
3467 )
3469 frozen_taxons = frozenset(taxons)
3470 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3472 # print(f"{subtext=}")
3473 subtext = re.sub(
3474 r"\s*\(please add an English "
3475 r"translation of this "
3476 r"(example|usage example|quote)\)",
3477 "",
3478 subtext,
3479 ).strip()
3480 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3481 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3482 # print("subtext:", repr(subtext))
3484 lines = subtext.splitlines()
3485 # print(lines)
3487 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3488 lines = list(
3489 x
3490 for x in lines
3491 if not re.match(
3492 r"(Synonyms: |Antonyms: |Hyponyms: |"
3493 r"Synonym: |Antonym: |Hyponym: |"
3494 r"Hypernyms: |Derived terms: |"
3495 r"Related terms: |"
3496 r"Hypernym: |Derived term: |"
3497 r"Coordinate terms:|"
3498 r"Related term: |"
3499 r"For more quotations using )",
3500 x,
3501 )
3502 )
3503 tr = ""
3504 ref = ""
3505 roman = ""
3506 # for line in lines:
3507 # print("LINE:", repr(line))
3508 # print(classify_desc(line))
3509 if len(lines) == 1 and lang_code != "en":
3510 parts = example_splitter_re.split(lines[0])
3511 if (
3512 len(parts) > 2
3513 and len(example_template_args) == 1
3514 and any(
3515 ("―" in s) or ("—" in s)
3516 for s in example_template_args[0].values()
3517 )
3518 ):
3519 if nparts := synch_splits_with_args(
3520 lines[0], example_template_args[0]
3521 ):
3522 parts = nparts
3523 if (
3524 len(example_template_args) == 1
3525 and "lit" in example_template_args[0]
3526 ):
3527 # ugly brute-force kludge in case there's a lit= arg
3528 literally = example_template_args[0].get("lit", "")
3529 if literally:
3530 literally = (
3531 " (literally, “"
3532 + clean_value(wxr, literally)
3533 + "”)"
3534 )
3535 else:
3536 literally = ""
3537 if (
3538 len(example_template_args) == 1
3539 and len(parts) == 2
3540 and len(example_template_args[0])
3541 - (
3542 # horrible kludge to ignore these arguments
3543 # when calculating how many there are
3544 sum(
3545 s in example_template_args[0]
3546 for s in (
3547 "lit", # generates text, but we handle it
3548 "inline",
3549 "noenum",
3550 "nocat",
3551 "sort",
3552 )
3553 )
3554 )
3555 == 3
3556 and clean_value(
3557 wxr, example_template_args[0].get(2, "")
3558 )
3559 == parts[0].strip()
3560 and clean_value(
3561 wxr,
3562 (
3563 example_template_args[0].get(3)
3564 or example_template_args[0].get("translation")
3565 or example_template_args[0].get("t", "")
3566 )
3567 + literally, # in case there's a lit= argument
3568 )
3569 == parts[1].strip()
3570 ):
3571 # {{exampletemplate|ex|Foo bar baz|English translation}}
3572 # is a pretty reliable 'heuristic', so we use it here
3573 # before the others. To be extra sure the template
3574 # doesn't do anything weird, we compare the arguments
3575 # and the output to each other.
3576 lines = [parts[0].strip()]
3577 tr = parts[1].strip()
3578 elif (
3579 len(parts) == 2
3580 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3581 ):
3582 # These other branches just do some simple heuristics w/
3583 # the expanded output of the template (if applicable).
3584 lines = [parts[0].strip()]
3585 tr = parts[1].strip()
3586 elif (
3587 len(parts) == 3
3588 and classify_desc2(parts[1])
3589 in ("romanization", "english")
3590 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3591 ):
3592 lines = [parts[0].strip()]
3593 roman = parts[1].strip()
3594 tr = parts[2].strip()
3595 else:
3596 parts = re.split(r"\s+-\s+", lines[0])
3597 if (
3598 len(parts) == 2
3599 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3600 ):
3601 lines = [parts[0].strip()]
3602 tr = parts[1].strip()
3603 elif len(lines) > 1:
3604 if any(
3605 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3606 ) and len(example_template_names) != 1:
3607 refs: list[str] = []
3608 for i in range(len(lines)):
3609 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
3610 break
3611 refs.append(lines[i].strip())
3612 if re.search(r"[]\d:)]\s*$", lines[i]):
3613 break
3614 ref = " ".join(refs)
3615 lines = lines[i + 1 :]
3616 if (
3617 lang_code != "en"
3618 and len(lines) >= 2
3619 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3620 ):
3621 i = len(lines) - 1
3622 while (
3623 i > 1
3624 and classify_desc2(lines[i - 1])
3625 in ENGLISH_TEXTS
3626 ):
3627 i -= 1
3628 tr = "\n".join(lines[i:])
3629 lines = lines[:i]
3630 if len(lines) >= 2:
3631 if classify_desc2(lines[-1]) == "romanization":
3632 roman = lines[-1].strip()
3633 lines = lines[:-1]
3635 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3636 ref = lines[0]
3637 lines = lines[1:]
3638 elif lang_code != "en" and len(lines) == 2:
3639 cls1 = classify_desc2(lines[0])
3640 cls2 = classify_desc2(lines[1])
3641 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3642 tr = lines[1]
3643 lines = [lines[0]]
3644 elif cls1 in ENGLISH_TEXTS and cls2 != "english":
3645 tr = lines[0]
3646 lines = [lines[1]]
3647 elif (
3648 re.match(r"^[#*]*:+", lines[1])
3649 and classify_desc2(
3650 re.sub(r"^[#*:]+\s*", "", lines[1])
3651 )
3652 in ENGLISH_TEXTS
3653 ):
3654 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3655 lines = [lines[0]]
3656 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3657 # Both were classified as English, but
3658 # presumably one is not. Assume first is
3659 # non-English, as that seems more common.
3660 tr = lines[1]
3661 lines = [lines[0]]
3662 elif (
3663 usex_type != "quotation"
3664 and lang_code != "en"
3665 and len(lines) == 3
3666 ):
3667 cls1 = classify_desc2(lines[0])
3668 cls2 = classify_desc2(lines[1])
3669 cls3 = classify_desc2(lines[2])
3670 if (
3671 cls3 == "english"
3672 and cls2 in ("english", "romanization")
3673 and cls1 != "english"
3674 ):
3675 tr = lines[2].strip()
3676 roman = lines[1].strip()
3677 lines = [lines[0].strip()]
3678 elif (
3679 usex_type == "quotation"
3680 and lang_code != "en"
3681 and len(lines) > 2
3682 ):
3683 # for x in lines:
3684 # print(" LINE: {}: {}"
3685 # .format(classify_desc2(x), x))
3686 if re.match(r"^[#*]*:+\s*$", lines[1]):
3687 ref = lines[0]
3688 lines = lines[2:]
3689 cls1 = classify_desc2(lines[-1])
3690 if cls1 == "english":
3691 i = len(lines) - 1
3692 while (
3693 i > 1
3694 and classify_desc2(lines[i - 1])
3695 in ENGLISH_TEXTS
3696 ):
3697 i -= 1
3698 tr = "\n".join(lines[i:])
3699 lines = lines[:i]
3701 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3702 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3703 tr = re.sub(r"^[#*:]+\s*", "", tr)
3704 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3705 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3706 ref = re.sub(r"^[#*:]+\s*", "", ref)
3707 ref = re.sub(
3708 r", (volume |number |page )?“?"
3709 r"\(please specify ([^)]|\(s\))*\)”?|"
3710 ", text here$",
3711 "",
3712 ref,
3713 )
3714 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3715 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3716 subtext = "\n".join(x for x in lines if x)
3717 if not tr and lang_code != "en":
3718 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3719 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
3720 tr = m.group(2)
3721 subtext = subtext[: m.start()] + m.group(1)
3722 elif lines:
3723 parts = re.split(r"\s*[―—]+\s*", lines[0])
3724 if (
3725 len(parts) == 2
3726 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3727 ):
3728 subtext = parts[0].strip()
3729 tr = parts[1].strip()
3730 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3731 subtext = re.sub(
3732 r"(please add an English translation of "
3733 r"this (quote|usage example))",
3734 "",
3735 subtext,
3736 )
3737 subtext = re.sub(
3738 r"\s*→New International Version " "translation$",
3739 "",
3740 subtext,
3741 ) # e.g. pis/Tok Pisin (Bible)
3742 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3743 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3744 note = None
3745 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3746 if (
3747 m is not None
3748 and lang_code != "en"
3749 and (
3750 m.group(1).startswith("with ")
3751 or classify_desc2(m.group(1)) == "english"
3752 )
3753 ):
3754 note = m.group(1)
3755 subtext = subtext[m.end() :]
3756 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3757 ref = re.sub(r",\s*→ISBN", "", ref)
3758 ref = ref.strip()
3759 if ref.endswith(":") or ref.endswith(","):
3760 ref = ref[:-1].strip()
3761 ref = re.sub(r"\s+,\s+", ", ", ref)
3762 ref = re.sub(r"\s+", " ", ref)
3763 if ref and not subtext:
3764 subtext = ref
3765 ref = ""
3766 if subtext:
3767 dt: ExampleData = {"text": subtext}
3768 if ref:
3769 dt["ref"] = ref
3770 if tr:
3771 dt["english"] = tr # DEPRECATED for "translation"
3772 dt["translation"] = tr
3773 if usex_type:
3774 dt["type"] = usex_type
3775 if note:
3776 dt["note"] = note
3777 if roman:
3778 dt["roman"] = roman
3779 if ruby:
3780 dt["ruby"] = ruby
3781 examples.append(dt)
3783 return examples
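# --- Illustrative note (editor addition, not part of page.py) ---
# For a non-English usage example rendered as a single line, such as
#   "mi amas vin ― I love you"
# the split on "―"/"—" above yields parts = ["mi amas vin", "I love you"];
# classify_desc2() recognizes the second part as English text, so the
# resulting example dict is roughly
#   {"text": "mi amas vin", "translation": "I love you"}
# (plus "type": "example" when a usage-example template produced it).
# Three-part lines additionally capture a romanization into "roman".
# Example values here are invented for illustration.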
3785 # Main code of parse_language()
3786 # Process the section
3787 stack.append(language)
3788 process_children(langnode, None)
3789 stack.pop()
3791 # Finalize word entries
3792 push_etym()
3793 ret = []
3794 for data in page_datas:
3795 merge_base(data, base_data)
3796 ret.append(data)
3798 # Copy all tags to word senses
3799 for data in ret:
3800 if "senses" not in data:
3801 continue
3802 # WordData should not have a 'tags' field, but if it does, it is
3803 # deleted here and its contents are copied into each sense;
3804 # that's why the type-ignore comments are needed.
3805 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3806 if "tags" in data:
3807 del data["tags"] # type: ignore[typeddict-item]
3808 for sense in data["senses"]:
3809 data_extend(sense, "tags", tags)
3811 return ret
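# --- Illustrative sketch (editor addition, not part of page.py) ---
# The final loop above moves a stray word-level "tags" list down into
# every sense. A minimal standalone version of that merge, with plain
# dicts standing in for WordData/SenseData:
def _distribute_word_tags(entry: dict) -> dict:
    """Hypothetical helper: push entry-level 'tags' into each sense."""
    tags = entry.pop("tags", [])
    for sense in entry.get("senses", []):
        sense.setdefault("tags", []).extend(tags)
    return entry

# >>> _distribute_word_tags(
# ...     {"word": "foo", "tags": ["obsolete"],
# ...      "senses": [{"glosses": ["bar"]}, {"glosses": ["baz"]}]}
# ... )
# -> both senses now carry tags == ["obsolete"] and the word-level key is gone.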
3814def parse_wikipedia_template(
3815 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
3816) -> None:
3817 """Helper function for parsing {{wikipedia|...}} and related templates."""
3818 assert isinstance(wxr, WiktextractContext)
3819 assert isinstance(data, dict)
3820 assert isinstance(ht, dict)
3821 langid = clean_node(wxr, data, ht.get("lang", ()))
3822 pagename = (
3823 clean_node(wxr, data, ht.get(1, ()))
3824 or wxr.wtp.title
3825 or "MISSING_PAGE_TITLE"
3826 )
3827 if langid:
3828 data_append(data, "wikipedia", langid + ":" + pagename)
3829 else:
3830 data_append(data, "wikipedia", pagename)
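# --- Illustrative sketch (editor addition, not part of page.py) ---
# parse_wikipedia_template() turns {{wikipedia|lang=fr|Chat}} into a
# "fr:Chat" entry under data["wikipedia"]; without lang= it falls back to
# the first positional argument or the page title. A rough standalone
# equivalent of that string handling (helper name and inputs assumed):
def _wikipedia_link(ht: dict, page_title: str) -> str:
    langid = str(ht.get("lang", "")).strip()
    pagename = str(ht.get(1, "")).strip() or page_title
    return f"{langid}:{pagename}" if langid else pagename

# >>> _wikipedia_link({"lang": "fr", 1: "Chat"}, "chat")
# 'fr:Chat'
# >>> _wikipedia_link({}, "chat")
# 'chat'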
3833def parse_top_template(
3834 wxr: WiktextractContext, node: WikiNode, data: WordData
3835) -> None:
3836 """Parses a template that occurs on the top-level in a page, before any
3837 language subtitles."""
3838 assert isinstance(wxr, WiktextractContext)
3839 assert isinstance(node, WikiNode)
3840 assert isinstance(data, dict)
3842 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3843 if name in wikipedia_templates:
3844 parse_wikipedia_template(wxr, data, ht)
3845 return None
3846 if is_panel_template(wxr, name):
3847 return ""
3848 if name in ("reconstruction",):
3849 return ""
3850 if name.lower() == "also" or name.lower().startswith("also/"):
3851 # XXX shows related words that might really have been the intended
3852 # word, capture them
3853 return ""
3854 if name == "see also":
3855 # XXX capture
3856 return ""
3857 if name == "cardinalbox":
3858 # XXX capture
3859 return ""
3860 if name == "character info":
3861 # XXX capture
3862 return ""
3863 if name == "commonscat":
3864 # XXX capture link to Wikimedia commons
3865 return ""
3866 if name == "wrongtitle":
3867 # XXX this should be captured to replace page title with the
3868 # correct title. E.g. ⿰亻革家
3869 return ""
3870 if name == "wikidata":
3871 arg = clean_node(wxr, data, ht.get(1, ()))
3872 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
3873 data_append(data, "wikidata", arg)
3874 return ""
3875 wxr.wtp.debug(
3876 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
3877 sortid="page/2870",
3878 )
3879 return ""
3881 clean_node(wxr, None, [node], template_fn=top_template_fn)
3884def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
3885 """Fix subtitle hierarchy to be strict Language -> Etymology ->
3886 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
3887 that are next to each other."""
3889 # Wiktextract issue #620: a Chinese Glyph Origin section before an
3890 # Etymology section gets overwritten. In this case, let's just combine the two.
3892 # In Chinese entries, Pronunciation can be preceded on the
3893 # same level 3 by its Etymology *and* Glyph Origin sections:
3894 # ===Glyph Origin===
3895 # ===Etymology===
3896 # ===Pronunciation===
3897 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
3898 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
3899 # are now level 6
3901 # Known lowercase PoS names are in part_of_speech_map
3902 # Known lowercase linkage section names are in linkage_map
3904 old = re.split(
3905 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
3906 )
3908 parts = []
3909 npar = 4 # Number of parentheses in above expression
3910 parts.append(old[0])
3911 prev_level = None
3912 level = None
3913 skip_level_title = False # When combining etymology sections
3914 for i in range(1, len(old), npar + 1):
3915 left = old[i]
3916 right = old[i + npar - 1]
3917 # remove Wikilinks in title
3918 title = re.sub(r"^\[\[", "", old[i + 1])
3919 title = re.sub(r"\]\]$", "", title)
3920 prev_level = level
3921 level = len(left)
3922 part = old[i + npar]
3923 if level != len(right):
3924 wxr.wtp.debug(
3925 "subtitle has unbalanced levels: "
3926 "{!r} has {} on the left and {} on the right".format(
3927 title, left, right
3928 ),
3929 sortid="page/2904",
3930 )
3931 lc = title.lower()
3932 if name_to_code(title, "en") != "":
3933 if level > 2:
3934 wxr.wtp.debug(
3935 "subtitle has language name {} at level {}".format(
3936 title, level
3937 ),
3938 sortid="page/2911",
3939 )
3940 level = 2
3941 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
3942 if level > 3:
3943 wxr.wtp.debug(
3944 "etymology section {} at level {}".format(title, level),
3945 sortid="page/2917",
3946 )
3947 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
3948 # sections cheek-to-cheek
3949 skip_level_title = True
3950 # Modify the title of previous ("Glyph Origin") section, in
3951 # case we have a meaningful title like "Etymology 1"
3952 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
3953 level = 3
3954 elif lc.startswith(PRONUNCIATION_TITLE):
3955 # Pronunciation is now a level between POS and Etymology, so
3956 # we need to shift everything down by one
3957 level = 4
3958 elif lc in POS_TITLES:
3959 level = 5
3960 elif lc == TRANSLATIONS_TITLE:
3961 level = 6
3962 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
3963 level = 6
3964 elif lc in INFLECTION_TITLES:
3965 level = 6
3966 elif lc == DESCENDANTS_TITLE:
3967 level = 6
3968 elif title in PROTO_ROOT_DERIVED_TITLES:
3969 level = 6
3970 elif lc in IGNORED_TITLES:
3971 level = 6
3972 else:
3973 level = 6
3974 if skip_level_title:
3975 skip_level_title = False
3976 parts.append(part)
3977 else:
3978 parts.append("{}{}{}".format("=" * level, title, "=" * level))
3979 parts.append(part)
3980 # print("=" * level, title)
3981 # if level != len(left):
3982 # print(" FIXED LEVEL OF {} {} -> {}"
3983 # .format(title, len(left), level))
3985 text = "".join(parts)
3986 # print(text)
3987 return text
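# --- Illustrative sketch (editor addition, not part of page.py) ---
# fix_subtitle_hierarchy() rewrites heading levels into a strict
# Language(2) -> Etymology(3) -> Pronunciation(4) -> POS(5) -> other(6)
# hierarchy. A simplified stand-in for the level mapping above, with a
# few hard-coded titles in place of ETYMOLOGY_TITLES/POS_TITLES (and the
# assumption that "Glyph origin" counts as an etymology-like title):
def _target_level(title: str, is_language: bool) -> int:
    lc = title.lower()
    if is_language:                            # e.g. "Chinese"
        return 2
    if lc.startswith("etymology") or lc == "glyph origin":
        return 3
    if lc.startswith("pronunciation"):
        return 4
    if lc in ("noun", "verb", "adjective"):    # stand-in for POS_TITLES
        return 5
    return 6                                   # linkage, translations, default

# >>> [_target_level(t, t == "Chinese")
# ...  for t in ("Chinese", "Etymology", "Pronunciation", "Noun", "Synonyms")]
# [2, 3, 4, 5, 6]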
3990def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
3991 # Skip translation pages
3992 if word.endswith("/" + TRANSLATIONS_TITLE):
3993 return []
3995 if wxr.config.verbose:
3996 logger.info(f"Parsing page: {word}")
3998 wxr.config.word = word
3999 wxr.wtp.start_page(word)
4001 # Remove <noinclude> and similar tags from main pages. They
4002 # should not appear there, but at least net/Elfdala has one and it
4003 # is probably not the only one.
4004 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4005 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4006 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4008 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4009 # pages that have, for example, Translations section under Linkage, or
4010 # Translations section on the same level as Noun. Enforce a proper
4011 # hierarchy by manipulating the subtitle levels in certain cases.
4012 text = fix_subtitle_hierarchy(wxr, text)
4014 # Parse the page, pre-expanding those templates that are likely to
4015 # influence parsing
4016 tree = wxr.wtp.parse(
4017 text,
4018 pre_expand=True,
4019 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4020 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4021 )
4022 # from wikitextprocessor.parser import print_tree
4023 # print("PAGE PARSE:", print_tree(tree))
4025 top_data: WordData = {}
4027 # Iterate over top-level titles, which should be languages for normal
4028 # pages
4029 by_lang = defaultdict(list)
4030 for langnode in tree.children:
4031 if not isinstance(langnode, WikiNode):
4032 continue
4033 if langnode.kind == NodeKind.TEMPLATE:
4034 parse_top_template(wxr, langnode, top_data)
4035 continue
4036 if langnode.kind == NodeKind.LINK:
4037 # Some pages have links at top level, e.g., "trees" in Wiktionary
4038 continue
4039 if langnode.kind != NodeKind.LEVEL2:
4040 wxr.wtp.debug(
4041 f"unexpected top-level node: {langnode}", sortid="page/3014"
4042 )
4043 continue
4044 lang = clean_node(
4045 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4046 )
4047 lang_code = name_to_code(lang, "en")
4048 if lang_code == "":
4049 wxr.wtp.debug(
4050 f"unrecognized language name: {lang}", sortid="page/3019"
4051 )
4052 if (
4053 wxr.config.capture_language_codes
4054 and lang_code not in wxr.config.capture_language_codes
4055 ):
4056 continue
4057 wxr.wtp.start_section(lang)
4059 # Collect all words from the page.
4060 # print(f"{langnode=}")
4061 datas = parse_language(wxr, langnode, lang, lang_code)
4063 # Propagate fields resulting from top-level templates to each
4064 # word entry of this language.
4065 for data in datas:
4066 if "lang" not in data:
4067 wxr.wtp.debug(
4068 "internal error -- no lang in data: {}".format(data),
4069 sortid="page/3034",
4070 )
4071 continue
4072 for k, v in top_data.items():
4073 assert isinstance(v, (list, tuple))
4074 data_extend(data, k, v)
4075 by_lang[data["lang"]].append(data)
4077 # XXX this code is clearly out of date. There is no longer a "conjugation"
4078 # field. FIX OR REMOVE.
4079 # Do some post-processing on the words. For example, we may distribute
4080 # conjugation information to all the words.
4081 ret = []
4082 for lang, lang_datas in by_lang.items():
4083 ret.extend(lang_datas)
4085 for x in ret:
4086 if x["word"] != word:
4087 if word.startswith("Unsupported titles/"):
4088 wxr.wtp.debug(
4089 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4090 sortid="20231101/3578page.py",
4091 )
4092 else:
4093 wxr.wtp.debug(
4094 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4095 sortid="20231101/3582page.py",
4096 )
4097 x["original_title"] = word
4098 # validate tag data
4099 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4100 return ret
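# --- Illustrative sketch (editor addition, not part of page.py) ---
# Inside parse_page(), fields collected from top-level templates
# (top_data, e.g. "wikipedia" or "wikidata" entries) are appended to
# every word entry parsed from the page. A standalone version of that
# propagation step, with plain dicts standing in for WordData:
def _propagate_top_data(top_data: dict, entries: list[dict]) -> None:
    for entry in entries:
        for key, values in top_data.items():
            entry.setdefault(key, []).extend(values)

# >>> entries = [{"word": "x", "lang": "English"}]
# >>> _propagate_top_data({"wikidata": ["Q42"]}, entries)
# >>> entries
# [{'word': 'x', 'lang': 'English', 'wikidata': ['Q42']}]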
4103def recursively_separate_raw_tags(
4104 wxr: WiktextractContext, data: dict[str, Any]
4105) -> None:
4106 if not isinstance(data, dict):
4107 wxr.wtp.error(
4108 "'data' is not dict; most probably "
4109 "data has a list that contains at least one dict and "
4110 "at least one non-dict item",
4111 sortid="en/page-4016/20240419",
4112 )
4113 return
4114 new_tags: list[str] = []
4115 raw_tags: list[str] = data.get("raw_tags", [])
4116 for field, val in data.items():
4117 if field == "tags":
4118 for tag in val:
4119 if tag not in valid_tags:
4120 raw_tags.append(tag)
4121 else:
4122 new_tags.append(tag)
4123 if isinstance(val, list):
4124 if len(val) > 0 and isinstance(val[0], dict):
4125 for d in val:
4126 recursively_separate_raw_tags(wxr, d)
4127 if "tags" in data and not new_tags:
4128 del data["tags"]
4129 elif new_tags:
4130 data["tags"] = new_tags
4131 if raw_tags:
4132 data["raw_tags"] = raw_tags
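# --- Illustrative sketch (editor addition, not part of page.py) ---
# recursively_separate_raw_tags() splits each "tags" list into entries
# found in valid_tags and everything else, which moves to "raw_tags".
# A non-recursive toy version with an explicit whitelist (the real
# function also walks nested lists of dicts):
def _separate(data: dict, valid: set) -> None:
    raw = data.setdefault("raw_tags", [])
    kept = [t for t in data.get("tags", []) if t in valid]
    raw += [t for t in data.get("tags", []) if t not in valid]
    if kept:
        data["tags"] = kept
    else:
        data.pop("tags", None)
    if not raw:
        data.pop("raw_tags", None)

# >>> d = {"tags": ["plural", "Hokkien-ish"]}
# >>> _separate(d, {"plural"})
# >>> d
# {'tags': ['plural'], 'raw_tags': ['Hokkien-ish']}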
4135def process_soft_redirect_template(
4136 wxr: WiktextractContext,
4137 template_node: TemplateNode,
4138 redirect_pages: list[str],
4139) -> bool:
4140 # Return `True` if the template is a soft-redirect template.
4141 if template_node.template_name == "zh-see":
4142 # https://en.wiktionary.org/wiki/Template:zh-see
4143 title = clean_node(
4144 wxr, None, template_node.template_parameters.get(1, "")
4145 )
4146 if title != "":
4147 redirect_pages.append(title)
4148 return True
4149 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4150 # https://en.wiktionary.org/wiki/Template:ja-see
4151 for key, value in template_node.template_parameters.items():
4152 if isinstance(key, int):
4153 title = clean_node(wxr, None, value)
4154 if title != "":
4155 redirect_pages.append(title)
4156 return True
4157 return False
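# --- Illustrative sketch (editor addition, not part of page.py) ---
# process_soft_redirect_template() collects redirect targets from
# {{zh-see|...}} (first positional argument) and {{ja-see|...}} /
# {{ja-see-kango|...}} (all positional arguments). With the template
# parameters already reduced to plain strings, the same decision logic is:
def _soft_redirect_targets(name: str, params: dict) -> list[str]:
    if name == "zh-see":
        return [params[1]] if params.get(1) else []
    if name in ("ja-see", "ja-see-kango"):
        return [v for k, v in params.items() if isinstance(k, int) and v]
    return []

# >>> _soft_redirect_targets("ja-see", {1: "猫", 2: "ネコ"})
# ['猫', 'ネコ']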
4160ZH_FORMS_TAGS = {
4161 "trad.": "Traditional-Chinese",
4162 "simp.": "Simplified-Chinese",
4163 "alternative forms": "alternative",
4164 "2nd round simp.": "Second-Round-Simplified-Chinese",
4165}
4168def extract_zh_forms_template(
4169 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4170):
4171 # https://en.wiktionary.org/wiki/Template:zh-forms
4172 lit_meaning = clean_node(
4173 wxr, None, t_node.template_parameters.get("lit", "")
4174 )
4175 if lit_meaning != "":
4176 base_data["literal_meaning"] = lit_meaning
4177 expanded_node = wxr.wtp.parse(
4178 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4179 )
4180 for table in expanded_node.find_child(NodeKind.TABLE):
4181 for row in table.find_child(NodeKind.TABLE_ROW):
4182 row_header = ""
4183 row_header_tags: list[str] = []
4184 header_has_span = False
4185 for cell in row.find_child(
4186 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4187 ):
4188 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4189 row_header, row_header_tags, header_has_span = (
4190 extract_zh_forms_header_cell(wxr, base_data, cell)
4191 )
4192 elif not header_has_span:
4193 extract_zh_forms_data_cell(
4194 wxr, base_data, cell, row_header, row_header_tags
4195 )
4197 if "forms" in base_data and len(base_data["forms"]) == 0:
4198 del base_data["forms"]
4201def extract_zh_forms_header_cell(
4202 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4203) -> tuple[str, list[str], bool]:
4204 row_header = ""
4205 row_header_tags = []
4206 header_has_span = False
4207 first_span_index = len(header_cell.children)
4208 for index, span_tag in header_cell.find_html("span", with_index=True):
4209 if index < first_span_index:
4210 first_span_index = index
4211 header_has_span = True
4212 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4213 for raw_tag in row_header.split(" and "):
4214 raw_tag = raw_tag.strip()
4215 if raw_tag != "":
4216 row_header_tags.append(raw_tag)
4217 for span_tag in header_cell.find_html_recursively("span"):
4218 span_lang = span_tag.attrs.get("lang", "")
4219 form_nodes = []
4220 sup_title = ""
4221 for node in span_tag.children:
4222 if isinstance(node, HTMLNode) and node.tag == "sup":
4223 for sup_span in node.find_html("span"):
4224 sup_title = sup_span.attrs.get("title", "")
4225 else:
4226 form_nodes.append(node)
4227 if span_lang in ["zh-Hant", "zh-Hans"]:
4228 for word in clean_node(wxr, None, form_nodes).split("/"):
4229 if word not in [wxr.wtp.title, ""]:
4230 form = {"form": word}
4231 for raw_tag in row_header_tags:
4232 if raw_tag in ZH_FORMS_TAGS:
4233 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4234 else:
4235 data_append(form, "raw_tags", raw_tag)
4236 if sup_title != "":
4237 data_append(form, "raw_tags", sup_title)
4238 data_append(base_data, "forms", form)
4239 return row_header, row_header_tags, header_has_span
4242TagLiteral = Literal["tags", "raw_tags"]
4243TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
4246def extract_zh_forms_data_cell(
4247 wxr: WiktextractContext,
4248 base_data: WordData,
4249 cell: WikiNode,
4250 row_header: str,
4251 row_header_tags: list[str],
4252) -> None:
4253 from .zh_pron_tags import ZH_PRON_TAGS
4255 forms: list[FormData] = []
4256 for top_span_tag in cell.find_html("span"):
4257 span_style = top_span_tag.attrs.get("style", "")
4258 span_lang = top_span_tag.attrs.get("lang", "")
4259 if span_style == "white-space:nowrap;":
4260 extract_zh_forms_data_cell(
4261 wxr, base_data, top_span_tag, row_header, row_header_tags
4262 )
4263 elif "font-size:80%" in span_style:
4264 raw_tag = clean_node(wxr, None, top_span_tag)
4265 if raw_tag != "":
4266 for form in forms:
4267 if raw_tag in ZH_PRON_TAGS:
4268 tr_tag = ZH_PRON_TAGS[raw_tag]
4269 if isinstance(tr_tag, list):
4270 data_extend(form, "tags", tr_tag)
4271 elif isinstance(tr_tag, str):
4272 data_append(form, "tags", tr_tag)
4273 elif raw_tag in valid_tags:
4274 data_append(form, "tags", raw_tag)
4275 else:
4276 data_append(form, "raw_tags", raw_tag)
4277 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]:
4278 word = clean_node(wxr, None, top_span_tag)
4279 if word not in ["", "/", wxr.wtp.title]:
4280 form = {"form": word}
4281 if row_header != "anagram":
4282 for raw_tag in row_header_tags:
4283 if raw_tag in ZH_FORMS_TAGS:
4284 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4285 else:
4286 data_append(form, "raw_tags", raw_tag)
4287 if span_lang == "zh-Hant":
4288 data_append(form, "tags", "Traditional-Chinese")
4289 elif span_lang == "zh-Hans":
4290 data_append(form, "tags", "Simplified-Chinese")
4291 forms.append(form)
4293 if row_header == "anagram":
4294 for form in forms:
4295 l_data: LinkageData = {"word": form["form"]}
4296 for key in TAG_LITERALS_TUPLE:
4297 if key in form:
4298 l_data[key] = form[key]
4299 data_append(base_data, "anagrams", l_data)
4300 else:
4301 data_extend(base_data, "forms", forms)
4304def extract_ja_kanjitab_template(
4305 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4306):
4307 # https://en.wiktionary.org/wiki/Template:ja-kanjitab
4308 expanded_node = wxr.wtp.parse(
4309 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4310 )
4311 for table in expanded_node.find_child(NodeKind.TABLE):
4312 is_alt_form_table = False
4313 for row in table.find_child(NodeKind.TABLE_ROW):
4314 for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
4315 header_text = clean_node(wxr, None, header_node)
4316 if header_text.startswith("Alternative spelling"):
4317 is_alt_form_table = True
4318 if not is_alt_form_table:
4319 continue
4320 forms = []
4321 for row in table.find_child(NodeKind.TABLE_ROW):
4322 for cell_node in row.find_child(NodeKind.TABLE_CELL):
4323 for child_node in cell_node.children:
4324 if isinstance(child_node, HTMLNode):
4325 if child_node.tag == "span":
4326 word = clean_node(wxr, None, child_node)
4327 if word != "":
4328 forms.append(
4329 {
4330 "form": word,
4331 "tags": ["alternative", "kanji"],
4332 }
4333 )
4334 elif child_node.tag == "small":
4335 raw_tag = clean_node(wxr, None, child_node).strip(
4336 "()"
4337 )
4338 if raw_tag != "" and len(forms) > 0:
4339 data_append(
4340 forms[-1],
4341 "tags"
4342 if raw_tag in valid_tags
4343 else "raw_tags",
4344 raw_tag,
4345 )
4346 data_extend(base_data, "forms", forms)
4347 for link_node in expanded_node.find_child(NodeKind.LINK):
4348 clean_node(wxr, base_data, link_node)
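# --- Illustrative sketch (editor addition, not part of page.py) ---
# extract_ja_kanjitab_template() only keeps tables whose header starts
# with "Alternative spelling"; each <span> in a data cell becomes a form
# tagged ["alternative", "kanji"], and a following <small>(...)</small>
# annotation attaches to the most recent form. A hypothetical reshaping
# of already-extracted (word, annotation) pairs (the real code routes
# annotations found in valid_tags into "tags" instead of "raw_tags"):
def _kanjitab_forms(spellings: list[tuple[str, str]]) -> list[dict]:
    forms = []
    for word, note in spellings:
        form = {"form": word, "tags": ["alternative", "kanji"]}
        if note:
            form["raw_tags"] = [note.strip("()")]
        forms.append(form)
    return forms

# >>> _kanjitab_forms([("氣", "(kyūjitai)")])
# [{'form': '氣', 'tags': ['alternative', 'kanji'], 'raw_tags': ['kyūjitai']}]
# (Values invented for illustration.)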