# Code for parsing information from a single Wiktionary page.
#
# Copyright (c) 2018-2022 Tatu Ylonen.  See file LICENSE and https://ylonen.org

import copy
import html
import re
from collections import defaultdict
from functools import partial
from typing import (
    TYPE_CHECKING,
    Any,
    Iterable,
    Literal,
    Optional,
    Set,
    Union,
    cast,
)

from mediawiki_langcodes import get_all_names, name_to_code
from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    GeneralNode,
    HTMLNode,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...clean import clean_template_args, clean_value
from ...datautils import (
    data_append,
    data_extend,
    ns_title_prefix_tuple,
)
from ...page import (
    LEVEL_KINDS,
    clean_node,
    is_panel_template,
    recursively_extract,
)
from ...tags import valid_tags
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from ..ruby import extract_ruby, parse_ruby
from ..share import strip_nodes
from .descendant import extract_descendant_section
from .example import extract_example_list_item, extract_template_zh_x
from .form_descriptions import (
    classify_desc,
    decode_tags,
    distw,
    parse_alt_or_inflection_of,
    parse_sense_qualifier,
    parse_word_head,
)
from .inflection import TableContext, parse_inflection_section
from .info_templates import (
    INFO_TEMPLATE_FUNCS,
    parse_info_template_arguments,
    parse_info_template_node,
)
from .linkages import (
    extract_alt_form_section,
    parse_linkage,
)
from .parts_of_speech import PARTS_OF_SPEECH
from .section_titles import (
    COMPOUNDS_TITLE,
    DESCENDANTS_TITLE,
    ETYMOLOGY_TITLES,
    IGNORED_TITLES,
    INFLECTION_TITLES,
    LINKAGE_TITLES,
    POS_TITLES,
    PRONUNCIATION_TITLE,
    PROTO_ROOT_DERIVED_TITLES,
    TRANSLATIONS_TITLE,
)
from .translations import parse_translation_item_text
from .type_utils import (
    AttestationData,
    ExampleData,
    FormData,
    LinkageData,
    ReferenceData,
    SenseData,
    SoundData,
    TemplateData,
    WordData,
)
from .unsupported_titles import unsupported_title_map

# When determining whether a string is 'english', classify_desc
# might return 'taxonomic', which is English text 99% of the time.
ENGLISH_TEXTS = ("english", "taxonomic")

# Matches head tag
HEAD_TAG_RE = re.compile(
    r"^(head|Han char|arabic-noun|arabic-noun-form|"
    r"hangul-symbol|syllable-hangul)$|"
    + r"^(latin|"
    + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
    + r")-("
    + "|".join(
        [
            "abbr",
            "adj",
            "adjective",
            "adjective form",
            "adjective-form",
            "adv",
            "adverb",
            "affix",
            "animal command",
            "art",
            "article",
            "aux",
            "bound pronoun",
            "bound-pronoun",
            "Buyla",
            "card num",
            "card-num",
            "cardinal",
            "chunom",
            "classifier",
            "clitic",
            "cls",
            "cmene",
            "cmavo",
            "colloq-verb",
            "colverbform",
            "combining form",
            "combining-form",
            "comparative",
            "con",
            "concord",
            "conj",
            "conjunction",
            "conjug",
            "cont",
            "contr",
            "converb",
            "daybox",
            "decl",
            "decl noun",
            "def",
            "dem",
            "det",
            "determ",
            "Deva",
            "ending",
            "entry",
            "form",
            "fuhivla",
            "gerund",
            "gismu",
            "hanja",
            "hantu",
            "hanzi",
            "head",
            "ideophone",
            "idiom",
            "inf",
            "indef",
            "infixed pronoun",
            "infixed-pronoun",
            "infl",
            "inflection",
            "initialism",
            "int",
            "interfix",
            "interj",
            "interjection",
            "jyut",
            "latin",
            "letter",
            "locative",
            "lujvo",
            "monthbox",
            "mutverb",
            "name",
            "nisba",
            "nom",
            "noun",
            "noun form",
            "noun-form",
            "noun plural",
            "noun-plural",
            "nounprefix",
            "num",
            "number",
            "numeral",
            "ord",
            "ordinal",
            "par",
            "part",
            "part form",
            "part-form",
            "participle",
            "particle",
            "past",
            "past neg",
            "past-neg",
            "past participle",
            "past-participle",
            "perfect participle",
            "perfect-participle",
            "personal pronoun",
            "personal-pronoun",
            "pref",
            "prefix",
            "phrase",
            "pinyin",
            "plural noun",
            "plural-noun",
            "pos",
            "poss-noun",
            "post",
            "postp",
            "postposition",
            "PP",
            "pp",
            "ppron",
            "pred",
            "predicative",
            "prep",
            "prep phrase",
            "prep-phrase",
            "preposition",
            "present participle",
            "present-participle",
            "pron",
            "prondem",
            "pronindef",
            "pronoun",
            "prop",
            "proper noun",
            "proper-noun",
            "proper noun form",
            "proper-noun form",
            "proper noun-form",
            "proper-noun-form",
            "prov",
            "proverb",
            "prpn",
            "prpr",
            "punctuation mark",
            "punctuation-mark",
            "regnoun",
            "rel",
            "rom",
            "romanji",
            "root",
            "sign",
            "suff",
            "suffix",
            "syllable",
            "symbol",
            "verb",
            "verb form",
            "verb-form",
            "verbal noun",
            "verbal-noun",
            "verbnec",
            "vform",
        ]
    )
    + r")(-|/|\+|$)"
)
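# Illustration (not part of the original file): assuming "fi" and "en" are
# among the language codes returned by get_all_names("en"), all of these
# match:
#   HEAD_TAG_RE.search("head")          # first alternative, exact name
#   HEAD_TAG_RE.search("fi-noun")       # <lang>-<pos>, anchored at the end
#   HEAD_TAG_RE.search("en-verb-form")  # <lang>-<pos> followed by "-"
# while a bare language code like "en" does not match.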
# Head templates causing problems (like newlines) that can be squashed into
# an empty string in the template handler while saving their template
# data for later.
WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}

FLOATING_TABLE_TEMPLATES: set[str] = {
    # az-suffix-form creates a style=floatright div that is otherwise
    # deleted; if it is not pre-expanded, we can intercept the template,
    # so we add this set into do_not_pre_expand and intercept the
    # templates in parse_part_of_speech
    "az-suffix-forms",
    "az-inf-p",
    "kk-suffix-forms",
    "ky-suffix-forms",
    "tr-inf-p",
    "tr-suffix-forms",
    "tt-suffix-forms",
    "uz-suffix-forms",
}
# These two should contain template names that should always be
# pre-expanded when *first* processing the tree, or not pre-expanded
# so that the templates are left in place with their identifying
# name intact for later filtering.

DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
    "multitrans",
    "multitrans-nowiki",
    "trans-top",
    "trans-top-also",
    "trans-bottom",
    "checktrans-top",
    "checktrans-bottom",
    "col",
    "col1",
    "col2",
    "col3",
    "col4",
    "col5",
    "col1-u",
    "col2-u",
    "col3-u",
    "col4-u",
    "col5-u",
    "check deprecated lang param usage",
    "deprecated code",
    "ru-verb-alt-ё",
    "ru-noun-alt-ё",
    "ru-adj-alt-ё",
    "ru-proper noun-alt-ё",
    "ru-pos-alt-ё",
    "ru-alt-ё",
    "inflection of",
    "no deprecated lang param usage",
    "transclude",  # these produce sense entries (or other lists)
    "tcl",
}

# Inverse linkage for those that have them
linkage_inverses: dict[str, str] = {
    # XXX this is not currently used; move to post-processing
    "synonyms": "synonyms",
    "hypernyms": "hyponyms",
    "hyponyms": "hypernyms",
    "holonyms": "meronyms",
    "meronyms": "holonyms",
    "derived": "derived_from",
    "coordinate_terms": "coordinate_terms",
    "troponyms": "hypernyms",
    "antonyms": "antonyms",
    "instances": "instance_of",
    "related": "related",
}

# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES: set[str] = {
    "Character info",
    "CJKV",
    "French personal pronouns",
    "French possessive adjectives",
    "French possessive pronouns",
    "Han etym",
    "Japanese demonstratives",
    "Latn-script",
    "LDL",
    "MW1913Abbr",
    "Number-encoding",
    "Nuttall",
    "Spanish possessive adjectives",
    "Spanish possessive pronouns",
    "USRegionDisputed",
    "Webster 1913",
    "ase-rfr",
    "attention",
    "attn",
    "beer",
    "broken ref",
    "ca-compass",
    "character info",
    "character info/var",
    "checksense",
    "compass-fi",
    "copyvio suspected",
    "delete",
    "dial syn",  # Currently ignored, but could be useful in Chinese/Korean
    "etystub",
    "examples",
    "hu-corr",
    "hu-suff-pron",
    "interwiktionary",
    "ja-kanjitab",
    "ja-kt",
    "ko-hanja-search",
    "look",
    "maintenance box",
    "maintenance line",
    "mediagenic terms",
    "merge",
    "missing template",
    "morse links",
    "move",
    "multiple images",
    "no inline",
    "picdic",
    "picdicimg",
    "picdiclabel",
    "polyominoes",
    "predidential nomics",
    "punctuation",  # This actually gets pre-expanded
    "reconstructed",
    "request box",
    "rf-sound example",
    "rfaccents",
    "rfap",
    "rfaspect",
    "rfc",
    "rfc-auto",
    "rfc-header",
    "rfc-level",
    "rfc-pron-n",
    "rfc-sense",
    "rfclarify",
    "rfd",
    "rfd-redundant",
    "rfd-sense",
    "rfdate",
    "rfdatek",
    "rfdef",
    "rfe",
    "rfe/dowork",
    "rfex",
    "rfexp",
    "rfform",
    "rfgender",
    "rfi",
    "rfinfl",
    "rfm",
    "rfm-sense",
    "rfp",
    "rfp-old",
    "rfquote",
    "rfquote-sense",
    "rfquotek",
    "rfref",
    "rfscript",
    "rft2",
    "rftaxon",
    "rftone",
    "rftranslit",
    "rfv",
    "rfv-etym",
    "rfv-pron",
    "rfv-quote",
    "rfv-sense",
    "selfref",
    "split",
    "stroke order",  # XXX consider capturing this?
    "stub entry",
    "t-needed",
    "tbot entry",
    "tea room",
    "tea room sense",
    # "ttbc",  # XXX needed in at least the on/Preposition/Translation page
    "unblock",
    "unsupportedpage",
    "video frames",
    "was wotd",
    "wrongtitle",
    "zh-forms",
    "zh-hanzi-box",
    "no entry",
}

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES: set[str] = {
    "list:compass points/",
    "list:Gregorian calendar months/",
    "RQ:",
}

# Templates used for Wikipedia links.
wikipedia_templates: set[str] = {
    "wikipedia",
    "slim-wikipedia",
    "w",
    "W",
    "swp",
    "wiki",
    "Wikipedia",
    "wtorw",
}
for x in PANEL_PREFIXES & wikipedia_templates:
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )

# Mapping from a template name (without language prefix) for the main word
# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
# it could validly occur.  This is used just as a sanity check to give
# warnings about probably incorrect coding in Wiktionary.
template_allowed_pos_map: dict[str, list[str]] = {
    "abbr": ["abbrev"],
    "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
    "plural noun": ["noun", "name"],
    "plural-noun": ["noun", "name"],
    "proper noun": ["noun", "name"],
    "proper-noun": ["name", "noun"],
    "prop": ["name", "noun"],
    "verb": ["verb", "phrase"],
    "gerund": ["verb"],
    "particle": ["adv", "particle"],
    "adj": ["adj", "adj_noun"],
    "pron": ["pron", "noun"],
    "name": ["name", "noun"],
    "adv": ["adv", "intj", "conj", "particle"],
    "phrase": ["phrase", "prep_phrase"],
    "noun phrase": ["phrase"],
    "ordinal": ["num"],
    "number": ["num"],
    "pos": ["affix", "name", "num"],
    "suffix": ["suffix", "affix"],
    "character": ["character"],
    "letter": ["character"],
    "kanji": ["character"],
    "cont": ["abbrev"],
    "interj": ["intj"],
    "con": ["conj"],
    "part": ["particle"],
    "prep": ["prep", "postp"],
    "postp": ["postp"],
    "misspelling": ["noun", "adj", "verb", "adv"],
    "part-form": ["verb"],
}
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            print(
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
            assert False

# Templates ignored during etymology extraction, i.e., these will not be
# listed in the extracted etymology templates.
ignored_etymology_templates: list[str] = [
    "...",
    "IPAchar",
    "ipachar",
    "ISBN",
    "isValidPageName",
    "redlink category",
    "deprecated code",
    "check deprecated lang param usage",
    "para",
    "p",
    "cite",
    "Cite news",
    "Cite newsgroup",
    "cite paper",
    "cite MLLM 1976",
    "cite journal",
    "cite news/documentation",
    "cite paper/documentation",
    "cite video game",
    "cite video game/documentation",
    "cite newsgroup",
    "cite newsgroup/documentation",
    "cite web/documentation",
    "cite news",
    "Cite book",
    "Cite-book",
    "cite book",
    "cite web",
    "cite-usenet",
    "cite-video/documentation",
    "Cite-journal",
    "rfe",
    "catlangname",
    "cln",
    "langname-lite",
    "no deprecated lang param usage",
    "mention",
    "m",
    "m-self",
    "link",
    "l",
    "ll",
    "l-self",
]
# Regexp for matching ignored etymology template names.  This adds certain
# prefixes to the names listed above.
ignored_etymology_templates_re = re.compile(
    r"^((cite-|R:|RQ:).*|"
    + r"|".join(re.escape(x) for x in ignored_etymology_templates)
    + r")$"
)
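# Illustration (not part of the original file): with the prefixes added by
# the regexp, all of these are ignored:
#   ignored_etymology_templates_re.match("m")          # listed name
#   ignored_etymology_templates_re.match("cite-song")  # "cite-" prefix
#   ignored_etymology_templates_re.match("R:OED")      # "R:" prefix
# while template names not covered by the list or prefixes do not match.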
# Regexp for matching ignored descendants template names.  Right now we just
# copy the ignored etymology templates.
ignored_descendants_templates_re = ignored_etymology_templates_re

# Set of template names that are used to define usage examples.  If a usage
# example contains one of these templates, then its type is set to
# "example".
usex_templates: set[str] = {
    "afex",
    "affixusex",
    "co",  # {{collocation}} acts like an example template, specifically for
    # pairs or combinations of words that are more common than you'd
    # expect them to be by chance; hlavní#Czech
    "coi",
    "collocation",
    "el-example",
    "el-x",
    "example",
    "examples",
    "he-usex",
    "he-x",
    "hi-usex",
    "hi-x",
    "ja-usex-inline",
    "ja-usex",
    "ja-x",
    "jbo-example",
    "jbo-x",
    "km-usex",
    "km-x",
    "ko-usex",
    "ko-x",
    "lo-usex",
    "lo-x",
    "ne-x",
    "ne-usex",
    "prefixusex",
    "ryu-usex",
    "ryu-x",
    "shn-usex",
    "shn-x",
    "suffixusex",
    "th-usex",
    "th-x",
    "ur-usex",
    "ur-x",
    "usex",
    "usex-suffix",
    "ux",
    "uxi",
}

stop_head_at_these_templates: set[str] = {
    "category",
    "cat",
    "topics",
    "catlangname",
    "c",
    "C",
    "top",
    "cln",
}

# Set of template names that are used to define quotation examples.  If the
# usage example contains one of these templates, then its type is set to
# "quotation".
quotation_templates: set[str] = {
    "collapse-quote",
    "quote-av",
    "quote-book",
    "quote-GYLD",
    "quote-hansard",
    "quotei",
    "quote-journal",
    "quotelite",
    "quote-mailing list",
    "quote-meta",
    "quote-newsgroup",
    "quote-song",
    "quote-text",
    "quote",
    "quote-us-patent",
    "quote-video game",
    "quote-web",
    "quote-wikipedia",
    "wikiquote",
    "Wikiquote",
    "Q",
}

taxonomy_templates = {
    # argument 1 should be the taxonomic name, e.g. "Lupus lupus"
    "taxfmt",
    "taxlink",
    "taxlink2",
    "taxlinknew",
    "taxlook",
}

# Template names; this was extracted from template_linkage_mappings,
# because the code using template_linkage_mappings was actually not used
# (but not removed).
template_linkages_to_ignore_in_examples: set[str] = {
    "syn",
    "synonyms",
    "ant",
    "antonyms",
    "hyp",
    "hyponyms",
    "der",
    "derived terms",
    "coordinate terms",
    "cot",
    "rel",
    "col",
    "inline alt forms",
    "alti",
    "comeronyms",
    "holonyms",
    "holo",
    "hypernyms",
    "hyper",
    "meronyms",
    "mero",
    "troponyms",
    "perfectives",
    "pf",
    "imperfectives",
    "impf",
    "syndiff",
    "synsee",
    # not linkage nor example templates
    "sense",
    "s",
    "color panel",
    "colour panel",
}

# Maps a template name used in a word sense to the linkage field that it adds.
sense_linkage_templates: dict[str, str] = {
    "syn": "synonyms",
    "synonyms": "synonyms",
    "synsee": "synonyms",
    "syndiff": "synonyms",
    "hyp": "hyponyms",
    "hyponyms": "hyponyms",
    "ant": "antonyms",
    "antonyms": "antonyms",
    "alti": "related",
    "inline alt forms": "related",
    "coordinate terms": "coordinate_terms",
    "cot": "coordinate_terms",
    "comeronyms": "related",
    "holonyms": "holonyms",
    "holo": "holonyms",
    "hypernyms": "hypernyms",
    "hyper": "hypernyms",
    "meronyms": "meronyms",
    "mero": "meronyms",
    "troponyms": "troponyms",
    "perfectives": "related",
    "pf": "related",
    "imperfectives": "related",
    "impf": "related",
    "parasynonyms": "synonyms",
    "par": "synonyms",
    "parasyn": "synonyms",
    "nearsyn": "synonyms",
    "near-syn": "synonyms",
}

sense_linkage_templates_tags: dict[str, list[str]] = {
    "alti": ["alternative"],
    "inline alt forms": ["alternative"],
    "comeronyms": ["comeronym"],
    "perfectives": ["perfective"],
    "pf": ["perfective"],
    "imperfectives": ["imperfective"],
    "impf": ["imperfective"],
}
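# Illustration (hypothetical wikitext, not from the original file): a sense
# line containing {{syn|en|dog|q1=informal}} is routed by these tables so
# that name="syn" selects the "synonyms" field, ht[2]="dog" becomes the
# linked word, and the "q1" argument is parsed as a qualifier in
# parse_sense_linkage() below.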

def decode_html_entities(v: Union[str, int]) -> str:
    """Decodes HTML entities from a value, converting them to the respective
    Unicode characters/strings."""
    if isinstance(v, int):
        # I changed this to return str(v) instead of v = str(v),
        # but there might have been the intention to have more logic
        # here.  html.unescape would not do anything special with an
        # integer; it needs HTML escape sequences (&xx;).
        return str(v)
    return html.unescape(v)
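# Example usage (sketch, not in the original file):
#   decode_html_entities("a &amp; b") == "a & b"
#   decode_html_entities(5) == "5"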

def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
    pos: str,
) -> None:
    """Parses a linkage (synonym, etc.) specified in a word sense."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    field_tags = sense_linkage_templates_tags.get(name, [])
    for i in range(2, 20):
        if i not in ht:
            break
        w = clean_node(wxr, data, ht[i])
        if "#" in w:
            w = w[: w.index("#")]
        if w in ["", "<"]:  # used in "hypernyms" template
            continue
        is_thesaurus = False
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                is_thesaurus = True
                w = w[len(alias) :]
                if w != wxr.wtp.title:
                    from ...thesaurus import search_thesaurus

                    lang_code = clean_node(wxr, None, ht.get(1, ""))
                    for t_data in search_thesaurus(
                        wxr.thesaurus_db_conn,  # type: ignore
                        w,
                        lang_code,
                        pos,
                        "synonyms",  # GH issue #1570
                    ):
                        l_data: LinkageData = {
                            "word": t_data.term,
                            "source": "Thesaurus:" + w,
                        }
                        if len(t_data.tags) > 0:
                            l_data["tags"] = t_data.tags
                        if len(t_data.raw_tags) > 0:
                            l_data["raw_tags"] = t_data.raw_tags
                        data_append(data, field, l_data)
                break
        if is_thesaurus:
            continue
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                if english:
                    english += "; " + q
                else:
                    english = q
        # Try to find an English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        dt = {"word": w}
        if field_tags:
            data_extend(dt, "tags", field_tags)
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english  # DEPRECATED for "translation"
            dt["translation"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)


EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
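# Illustration (not from the original file): the splitter separates the
# original-language text from its translation on one line, e.g.
#   example_splitter_re.split("koira ― dog") == ["koira", "dog"]
# where "koira" is a hypothetical Finnish example and "―" is U+2015.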

def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """If it looks like there's something weird with how a line of example
    text has been split, this function will do the splitting after counting
    occurrences of the splitting regex inside the two main template arguments
    containing the string data for the original-language example and the
    English translation.
    """
    # Previously, we split without capturing groups, but here we want to
    # keep the original splitting hyphen regex intact.
    fparts = captured_splitters_re.split(line)
    new_parts = []
    # ["First", " – ", "second", " – ", "third..."] from the OL argument
    first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
    new_parts.append("".join(fparts[:first]))
    # Translation argument
    tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
    # +2 = +1 to skip the "expected" hyphen, +1 as the `1 +` above.
    second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
    new_parts.append("".join(fparts[first + 1 : second]))

    if all(new_parts):  # no empty strings from the above spaghetti
        new_parts.extend(fparts[second + 1 :: 2])  # skip rest of the hyphens
        return new_parts
    else:
        return None
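# Worked example (assumed arguments, not from the original file): when the
# original-language argument itself contains a splitter,
#   synch_splits_with_args("a ― b ― c", {2: "a ― b", 3: "c"})
# counts one splitter inside targs[2], so the first three captured parts are
# joined back into "a ― b" and the result is ["a ― b", "c"] rather than a
# naive three-way split.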

QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
QUALIFIERS_RE = re.compile(QUALIFIERS)
# (...): ... or (...(...)...): ...
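# Illustration: QUALIFIERS_RE.match("(obsolete, dialectal): foo") captures
# "obsolete, dialectal" in group 1; one level of nested parentheses inside
# the qualifier, as in "(lit. (rare)): foo", is also tolerated.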

def parse_language(
    wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
) -> list[WordData]:
    """Iterates over the text of the page, returning words (parts-of-speech)
    defined on the page one at a time.  (Individual word senses for the
    same part-of-speech are typically encoded in the same entry.)"""
    # imported here to avoid circular import
    from .pronunciation import parse_pronunciation

    assert isinstance(wxr, WiktextractContext)
    assert isinstance(langnode, WikiNode)
    assert isinstance(language, str)
    assert isinstance(lang_code, str)
    # print("parse_language", language)

    is_reconstruction = False
    word: str = wxr.wtp.title  # type: ignore[assignment]
    unsupported_prefix = "Unsupported titles/"
    if word.startswith(unsupported_prefix):
        w = word[len(unsupported_prefix) :]
        if w in unsupported_title_map:
            word = unsupported_title_map[w]
        else:
            wxr.wtp.error(
                "Unimplemented unsupported title: {}".format(word),
                sortid="page/870",
            )
            word = w
    elif word.startswith("Reconstruction:"):
        word = word[word.find("/") + 1 :]
        is_reconstruction = True

    base_data: WordData = {
        "word": word,
        "lang": language,
        "lang_code": lang_code,
    }
    if is_reconstruction:
        data_append(base_data, "tags", "reconstruction")
    sense_data: SenseData = {}
    pos_data: WordData = {}  # For a current part-of-speech
    level_four_data: WordData = {}  # Chinese Pronunciation-sections in-between
    etym_data: WordData = {}  # For one etymology
    sense_datas: list[SenseData] = []
    sense_ordinal = 0  # The recursive sense parsing messes up the ordering.
    # Never reset; do not use as data.
    level_four_datas: list[WordData] = []
    etym_datas: list[WordData] = []
    page_datas: list[WordData] = []
    have_etym = False
    inside_level_four = False  # This is for checking whether the etymology
    # section or article has a Pronunciation section, mostly for Chinese;
    # because Chinese articles can have several level-three sections (two
    # etymology sections and a pronunciation section) one after another, we
    # need a kludge to better keep track of whether we're in a normal "etym"
    # or inside a "level four" (which is what we've turned the level-three
    # Pronunciation sections into in fix_subtitle_hierarchy(); all other
    # sections are demoted by a step).
    stack: list[str] = []  # names of items on the "stack"

    def merge_base(data: WordData, base: WordData) -> None:
        for k, v in base.items():
            # Copy the value to ensure that we don't share lists or
            # dicts between structures (even nested ones).
            v = copy.deepcopy(v)
            if k not in data:
                # The list was copied above, so this will not create shared ref
                data[k] = v  # type: ignore[literal-required]
                continue
            if data[k] == v:  # type: ignore[literal-required]
                continue
            if (
                isinstance(data[k], (list, tuple))  # type: ignore[literal-required]
                or isinstance(
                    v,
                    (list, tuple),  # Should this be "and"?
                )
            ):
                data[k] = list(data[k]) + list(v)  # type: ignore
            elif data[k] != v:  # type: ignore[literal-required]
                wxr.wtp.warning(
                    "conflicting values for {} in merge_base: "
                    "{!r} vs {!r}".format(k, data[k], v),  # type: ignore[literal-required]
                    sortid="page/904",
                )

        def complementary_pop(pron: SoundData, key: str) -> SoundData:
            """Remove unnecessary keys from dict values
            in a list comprehension..."""
            if key in pron:
                pron.pop(key)  # type: ignore
            return pron

        def sound_matches_pos(sound: SoundData, pos: str) -> bool:
            if "pos" not in sound:
                return True
            sound_pos = sound["pos"]  # type: ignore[typeddict-item]
            return pos in sound_pos

        def strip_sound_pos(sound: SoundData) -> SoundData:
            complementary_pop(sound, "pos")
            return sound

        # If the result has sounds, eliminate sounds that have a prefix that
        # does not match "word" or one of "forms"
        if "sounds" in data and "word" in data:
            accepted = [data["word"]]
            accepted.extend(f["form"] for f in data.get("forms", dict()))
            data["sounds"] = list(
                s
                for s in data["sounds"]
                if "form" not in s or s["form"] in accepted
            )
        # If the result has sounds, eliminate sounds that have a pos that
        # does not match "pos"
        if "sounds" in data and "pos" in data:
            data["sounds"] = list(
                strip_sound_pos(s)
                for s in data["sounds"]
                # "pos" is not a field of SoundData, correctly, so we're
                # removing it here.  It's a kludge on a kludge on a kludge.
                if sound_matches_pos(s, data["pos"])
            )
        elif "sounds" in data:
            data["sounds"] = [strip_sound_pos(s) for s in data["sounds"]]
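    # Sketch of merge_base's behavior (hypothetical values): merging
    # base={"word": "x", "tags": ["a"]} into data={"tags": ["b"]} leaves
    # data == {"tags": ["b", "a"], "word": "x"}; missing keys are deep-copied
    # in, equal values are left alone, and list values are concatenated.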
    def push_sense(sorting_ordinal: int | None = None) -> bool:
        """Starts collecting data for a new word sense.  This returns True
        if a sense was added."""
        nonlocal sense_data
        if sorting_ordinal is None:
            sorting_ordinal = sense_ordinal
        tags = sense_data.get("tags", ())
        if (
            not sense_data.get("glosses")
            and "translation-hub" not in tags
            and "no-gloss" not in tags
        ):
            return False

        if (
            (
                "participle" in sense_data.get("tags", ())
                or "infinitive" in sense_data.get("tags", ())
            )
            and "alt_of" not in sense_data
            and "form_of" not in sense_data
            and "etymology_text" in etym_data
            and etym_data["etymology_text"] != ""
        ):
            etym = etym_data["etymology_text"]
            etym = etym.split(". ")[0]
            ret = parse_alt_or_inflection_of(wxr, etym, set())
            if ret is not None:
                tags, lst = ret
                assert isinstance(lst, (list, tuple))
                if "form-of" in tags:
                    data_extend(sense_data, "form_of", lst)
                    data_extend(sense_data, "tags", tags)
                elif "alt-of" in tags:
                    data_extend(sense_data, "alt_of", lst)
                    data_extend(sense_data, "tags", tags)

        if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
            "tags", ()
        ):
            data_append(sense_data, "tags", "no-gloss")

        sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal  # type: ignore
        sense_datas.append(sense_data)
        sense_data = {}
        return True

    def push_pos(sorting_ordinal: int | None = None) -> None:
        """Starts collecting data for a new part-of-speech."""
        nonlocal pos_data
        nonlocal sense_datas
        push_sense(sorting_ordinal)
        if wxr.wtp.subsection:
            data: WordData = {"senses": sense_datas}
            merge_base(data, pos_data)
            level_four_datas.append(data)
        pos_data = {}
        sense_datas = []
        wxr.wtp.start_subsection(None)

    def push_level_four_section(clear_sound_data: bool) -> None:
        """Starts collecting data for a new level-four section, which
        is usually virtual and empty, unless the article has Chinese
        'Pronunciation' sections that are etymology-section-like but
        under etymology, and at the same level in the source.  We modify
        the source to demote Pronunciation sections like that to level
        4, and other sections one step lower."""
        nonlocal level_four_data
        nonlocal level_four_datas
        nonlocal etym_datas
        push_pos()
        # print(f"======\n{etym_data=}")
        # print(f"======\n{etym_datas=}")
        # print(f"======\n{level_four_data=}")
        # print(f"======\n{level_four_datas=}")
        for data in level_four_datas:
            merge_base(data, level_four_data)
            etym_datas.append(data)
        for data in etym_datas:
            merge_base(data, etym_data)
            page_datas.append(data)
        if clear_sound_data:
            level_four_data = {}
        level_four_datas = []
        etym_datas = []

    def push_etym() -> None:
        """Starts collecting data for a new etymology."""
        nonlocal etym_data
        nonlocal etym_datas
        nonlocal have_etym
        nonlocal inside_level_four
        have_etym = True
        push_level_four_section(False)
        inside_level_four = False
        # the etymology section could be under a pronunciation section
        etym_data = (
            copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
        )

    def select_data() -> WordData:
        """Selects where to store data (pos or etym) based on whether we
        are inside a pos (part-of-speech)."""
        # print(f"{wxr.wtp.subsection=}")
        # print(f"{stack=}")
        if wxr.wtp.subsection is not None:
            return pos_data
        if inside_level_four:
            return level_four_data
        if stack[-1] == language:
            return base_data
        return etym_data

    term_label_templates: list[TemplateData] = []

    def head_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> Optional[str]:
        """Handles special templates in the head section of a word.  The head
        section is the text after the part-of-speech subtitle and before the
        word sense list.  Typically it generates the bold line for the word,
        but may also contain other useful information that often ends up in
        side boxes.  We want to capture some of that additional information."""
        # print("HEAD_POST_TEMPLATE_FN", name, ht)
        if is_panel_template(wxr, name):
            # Completely ignore these templates (not even recorded in
            # head_templates)
            return ""
        if name == "head":
            # XXX are these also captured in forms?  Should this special case
            # be removed?
            t = ht.get(2, "")
            if t == "pinyin":
                data_append(pos_data, "tags", "Pinyin")
            elif t == "romanization":
                data_append(pos_data, "tags", "romanization")
        if (
            HEAD_TAG_RE.search(name) is not None
            or name in WORD_LEVEL_HEAD_TEMPLATES
        ):
            args_ht = clean_template_args(wxr, ht)
            cleaned_expansion = clean_node(wxr, None, expansion)
            dt: TemplateData = {
                "name": name,
                "args": args_ht,
                "expansion": cleaned_expansion,
            }
            data_append(pos_data, "head_templates", dt)
            if name in WORD_LEVEL_HEAD_TEMPLATES:
                term_label_templates.append(dt)
                # Squash these; their tags are applied to the whole word,
                # and some cause problems like "term-label"
                return ""

        # The following are both captured in head_templates and parsed
        # separately

        if name in wikipedia_templates:
            # Note: various places expect to have content from wikipedia
            # templates, so cannot convert this to empty
            parse_wikipedia_template(wxr, pos_data, ht)
            return None

        if name == "number box":
            # XXX extract numeric value?
            return ""
        if name == "enum":
            # XXX extract?
            return ""
        if name == "cardinalbox":
            # XXX extract similar to enum?
            # XXX this can also occur in top-level under language
            return ""
        if name == "Han simplified forms":
            # XXX extract?
            return ""
        # if name == "ja-kanji forms":
        #     # XXX extract?
        #     return ""
        # if name == "vi-readings":
        #     # XXX extract?
        #     return ""
        # if name == "ja-kanji":
        #     # XXX extract?
        #     return ""
        if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
            # XXX extract?
            return ""
        if name == "defdate":
            # the one example I saw of this was weird.
            return ""

        return None
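    # Illustration (hypothetical template call): for an English noun head
    # like {{en-noun}}, HEAD_TAG_RE matches the name "en-noun", so
    # head_post_template_fn records {"name": "en-noun", "args": ...,
    # "expansion": ...} under pos_data["head_templates"] and leaves the
    # expansion in place for the later head-text parsing to analyze.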
    def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
        """Parses the subsection for a part-of-speech under a language on
        a page."""
        assert isinstance(posnode, WikiNode)
        assert isinstance(pos, str)
        # print("parse_part_of_speech", pos)
        pos_data["pos"] = pos
        pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
        lists: list[list[WikiNode]] = [[]]  # list of lists
        first_para = True
        first_head_tmplt = True
        collecting_head = True
        start_of_paragraph = True

        # XXX extract templates from posnode with recursively_extract
        # that break stuff, like ja-kanji or az-suffix-form.
        # Do the extraction with a list of template names, combined from
        # different lists, then separate them out into different lists
        # that are handled at different points of the POS section.
        # First, extract az-suffix-form, put it in `inflection`,
        # and parse `inflection`'s content when appropriate later.
        # The contents of az-suffix-form (and ja-kanji) that generate
        # divs with "floatright" in their style get deleted by
        # clean_value, so templates that slip through from here won't
        # break anything.
        # XXX bookmark
        # print("===================")
        # print(posnode.children)

        floaters, poschildren = recursively_extract(
            posnode.children,
            lambda x: (
                isinstance(x, WikiNode)
                and (
                    (
                        isinstance(x, TemplateNode)
                        and x.template_name in FLOATING_TABLE_TEMPLATES
                    )
                    or (
                        x.kind == NodeKind.LINK
                        # Need to check for stringiness because some links are
                        # broken; for example, if a template is missing an
                        # argument, a link might look like `[[{{{1}}}...]]`
                        and len(x.largs) > 0
                        and len(x.largs[0]) > 0
                        and isinstance(x.largs[0][0], str)
                        and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                    )
                )
            ),
        )
        tempnode = WikiNode(NodeKind.LEVEL6, 0)
        tempnode.largs = [["Inflection"]]
        tempnode.children = floaters
        parse_inflection(tempnode, "Floating Div", pos)
        # print(poschildren)
        # XXX new above

        if not poschildren:
            if not floaters:
                wxr.wtp.debug(
                    "PoS section without contents",
                    sortid="en/page/1051/20230612",
                )
            else:
                wxr.wtp.debug(
                    "PoS section without contents except for a floating table",
                    sortid="en/page/1056/20230612",
                )
            return

        for node in poschildren:
            if isinstance(node, str):
                for m in re.finditer(r"\n+|[^\n]+", node):
                    p = m.group(0)
                    if p.startswith("\n\n") and pre:
                        first_para = False
                        start_of_paragraph = True
                        break
                    if p and collecting_head:
                        pre[-1].append(p)
                continue
            assert isinstance(node, WikiNode)
            kind = node.kind
            if kind == NodeKind.LIST:
                lists[-1].append(node)
                collecting_head = False
                start_of_paragraph = True
                continue
            elif kind in LEVEL_KINDS:
                # Stop parsing the section if encountering any kind of
                # level header (like ===Noun=== or ====Further Reading====).
                # At a quick glance, this should be the default behavior,
                # but if some kinds of source articles have sub-sub-sections
                # that should be parsed, XXX it should be handled by changing
                # this break.
                break
            elif collecting_head and kind == NodeKind.LINK:
                # We might collect relevant links as they are often pictures
                # relating to the word
                if len(node.largs[0]) >= 1 and isinstance(
                    node.largs[0][0], str
                ):
                    if node.largs[0][0].startswith(
                        ns_title_prefix_tuple(wxr, "Category")
                    ):
                        # [[Category:...]]
                        # We're at the end of the file, probably, so stop
                        # here.  Otherwise the head will get garbage.
                        break
                    if node.largs[0][0].startswith(
                        ns_title_prefix_tuple(wxr, "File")
                    ):
                        # Skips file links
                        continue
                start_of_paragraph = False
                pre[-1].append(node)
            elif kind == NodeKind.HTML:
                if node.sarg == "br":
                    if pre[-1]:
                        pre.append([])  # Switch to next head
                        lists.append([])  # Lists parallel pre
                        collecting_head = True
                        start_of_paragraph = True
                elif collecting_head and node.sarg not in (
                    "gallery",
                    "ref",
                    "cite",
                    "caption",
                ):
                    start_of_paragraph = False
                    pre[-1].append(node)
                else:
                    start_of_paragraph = False
            elif isinstance(node, TemplateNode):
                # XXX Insert code here that disambiguates between
                # templates that generate word heads and templates
                # that don't.
                # There's head_tag_re that seems like a regex meant
                # to identify head templates.  Too bad it's None.

                # ignore {{category}}, {{cat}}... etc.
                if node.template_name in stop_head_at_these_templates:
                    # we've reached a template that should be at the end,
                    continue

                # skip these templates; panel_templates is already used
                # to skip certain templates elsewhere, but it also applies to
                # head parsing quite well.
                # node.largs[0][0] should always be str, but can't type-check
                # that.
                if is_panel_template(wxr, node.template_name):
                    continue
                # skip these templates
                # if node.largs[0][0] in skip_these_templates_in_head:
                #     first_head_tmplt = False  # no first_head_tmplt at all
                #     start_of_paragraph = False
                #     continue

                if first_head_tmplt and pre[-1]:
                    first_head_tmplt = False
                    start_of_paragraph = False
                    pre[-1].append(node)
                elif pre[-1] and start_of_paragraph:
                    pre.append([])  # Switch to the next head
                    lists.append([])  # lists parallel pre
                    collecting_head = True
                    start_of_paragraph = False
                    pre[-1].append(node)
                else:
                    pre[-1].append(node)
            elif first_para:
                start_of_paragraph = False
                if collecting_head:
                    pre[-1].append(node)
        # XXX use template_fn in clean_node to check that the head macro
        # is compatible with the current part-of-speech and generate a
        # warning if not.  Use template_allowed_pos_map.

        # Clean up empty pairs, and fix messes with extra newlines that
        # separate templates that are followed by lists (wiktextract
        # issue #314)

        cleaned_pre: list[list[Union[str, WikiNode]]] = []
        cleaned_lists: list[list[WikiNode]] = []
        pairless_pre_index = None

        for pre1, ls in zip(pre, lists):
            if pre1 and not ls:
                pairless_pre_index = len(cleaned_pre)
            if not pre1 and not ls:
                # skip [] + []
                continue
            if not ls and all(
                (isinstance(x, str) and not x.strip()) for x in pre1
            ):
                # skip ["\n", " "] + []
                continue
            if ls and not pre1:
                if pairless_pre_index is not None:
                    cleaned_lists[pairless_pre_index] = ls
                    pairless_pre_index = None
                    continue
            cleaned_pre.append(pre1)
            cleaned_lists.append(ls)

        pre = cleaned_pre
        lists = cleaned_lists

        there_are_many_heads = len(pre) > 1
        header_tags: list[str] = []
        header_topics: list[str] = []
        previous_head_had_list = False

        if not any(g for g in lists):
            process_gloss_without_list(
                poschildren, pos, pos_data, header_tags, header_topics
            )
        else:
            for i, (pre1, ls) in enumerate(zip(pre, lists)):
                # if len(ls) == 0:
                #     # don't have gloss list
                #     # XXX add code here to filter out 'garbage', like text
                #     # that isn't a head template or head.
                #     continue

                if all(not sl for sl in lists[i:]):
                    if i == 0:
                        if isinstance(node, str):
                            wxr.wtp.debug(
                                "first head without list of senses,"
                                "string: '{}[...]', {}/{}".format(
                                    node[:20], word, language
                                ),
                                sortid="page/1689/20221215",
                            )
                        if isinstance(node, WikiNode):
                            if node.largs and node.largs[0][0] in [
                                "Han char",
                            ]:
                                # just ignore these templates
                                pass
                            else:
                                wxr.wtp.debug(
                                    "first head without "
                                    "list of senses, "
                                    "template node "
                                    "{}, {}/{}".format(
                                        node.largs, word, language
                                    ),
                                    sortid="page/1694/20221215",
                                )
                        else:
                            wxr.wtp.debug(
                                "first head without list of senses, "
                                "{}/{}".format(word, language),
                                sortid="page/1700/20221215",
                            )
                        # no break here so that the first head always
                        # gets processed.
                    else:
                        if isinstance(node, str):
                            wxr.wtp.debug(
                                "later head without list of senses,"
                                "string: '{}[...]', {}/{}".format(
                                    node[:20], word, language
                                ),
                                sortid="page/1708/20221215",
                            )
                        if isinstance(node, WikiNode):
                            wxr.wtp.debug(
                                "later head without list of senses,"
                                "template node "
                                "{}, {}/{}".format(
                                    node.sarg if node.sarg else node.largs,
                                    word,
                                    language,
                                ),
                                sortid="page/1713/20221215",
                            )
                        else:
                            wxr.wtp.debug(
                                "later head without list of senses, "
                                "{}/{}".format(word, language),
                                sortid="page/1719/20221215",
                            )
                        break
                head_group = i + 1 if there_are_many_heads else None
                # print("parse_part_of_speech: {}: {}: pre={}"
                #       .format(wxr.wtp.section, wxr.wtp.subsection, pre1))

                if previous_head_had_list:
                    # We use a boolean flag here because we want to be able
                    # to let the header_tags data pass through after the loop
                    # is over without accidentally emptying it, if there are
                    # no pos_datas and we need a dummy data.
                    header_tags.clear()
                    header_topics.clear()

                # print(f"{pre1=}")
                process_gloss_header(
                    pre1, pos, head_group, pos_data, header_tags, header_topics
                )
                for ln in ls:
                    # Parse each list associated with this head.
                    for node in ln.children:
                        # Parse nodes in l.children recursively.
                        # The recursion function uses push_sense() to
                        # add stuff into sense_datas, and returns True or
                        # False if something is added, which bubbles upward.
                        # If the bubble is "True", then higher levels of
                        # the recursion will not push_sense(), because
                        # the data is already pushed into a sub-gloss
                        # downstream, unless the higher level has examples
                        # that need to be put somewhere.
                        common_data: SenseData = {
                            "tags": list(header_tags),
                            "topics": list(header_topics),
                        }
                        if head_group:
                            common_data["head_nr"] = head_group
                        parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]
                if len(ls) > 0:
                    previous_head_had_list = True
                else:
                    previous_head_had_list = False

        # If there are no senses extracted, add a dummy sense.  We want to
        # keep tags extracted from the head for the dummy sense.
        push_sense()  # Make sure unfinished data is pushed; start a clean sense
        if len(sense_datas) == 0:
            data_extend(sense_data, "tags", header_tags)
            data_extend(sense_data, "topics", header_topics)
            data_append(sense_data, "tags", "no-gloss")
            push_sense()

        sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))  # type: ignore

        for sd in sense_datas:
            if "__temp_sense_sorting_ordinal" in sd:
                del sd["__temp_sense_sorting_ordinal"]  # type: ignore

    def process_gloss_header(
        header_nodes: list[Union[WikiNode, str]],
        pos_type: str,
        header_group: Optional[int],
        pos_data: WordData,
        header_tags: list[str],
        header_topics: list[str],
    ) -> None:
        ruby = []

        # process template parse nodes here
        new_nodes = []
        info_template_data = []
        for node in header_nodes:
            # print(f"{node=}")
            info_data, info_out = parse_info_template_node(wxr, node, "head")
            if info_data or info_out:
                if info_data:
                    info_template_data.append(info_data)
                if info_out:  # including just the original node
                    new_nodes.append(info_out)
            else:
                new_nodes.append(node)
        header_nodes = new_nodes

        if info_template_data:
            if "info_templates" not in pos_data:
                pos_data["info_templates"] = info_template_data
            else:
                pos_data["info_templates"].extend(info_template_data)

        if lang_code == "ja":
            exp = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
            )
            rub, _ = recursively_extract(
                exp.children,
                lambda x: isinstance(x, WikiNode)
                and x.kind == NodeKind.HTML
                and x.sarg == "ruby",
            )
            if rub is not None:
                for r in rub:
                    if TYPE_CHECKING:
                        # we know the lambda above in recursively_extract
                        # returns only WikiNodes in rub
                        assert isinstance(r, WikiNode)
                    rt = parse_ruby(wxr, r)
                    if rt is not None:
                        ruby.append(rt)
        elif lang_code == "vi":
            # Handle vi-readings templates that have a weird structure for
            # Chữ Nôm Vietnamese character heads
            # https://en.wiktionary.org/wiki/Template:vi-readings
            new_header_nodes = []
            related_readings: list[LinkageData] = []
            for node in header_nodes:
                if (
                    isinstance(node, TemplateNode)
                    and node.template_name == "vi-readings"
                ):
                    # print(node.template_parameters)
                    for parameter, tag in (
                        ("hanviet", "han-viet-reading"),
                        ("nom", "nom-reading"),
                        # we ignore the fanqie parameter "phienthiet"
                    ):
                        arg = node.template_parameters.get(parameter)
                        if arg is not None:
                            text = clean_node(wxr, None, arg)
                            for w in text.split(","):
                                # ignore "-"-separated references
                                if "-" in w:
                                    w = w[: w.index("-")]
                                w = w.strip()
                                related_readings.append(
                                    LinkageData(word=w, tags=[tag])
                                )
                    continue

                # Skip the vi-readings template for the rest of the head parsing
                new_header_nodes.append(node)
            if len(related_readings) > 0:
                data_extend(pos_data, "related", related_readings)
            header_nodes = new_header_nodes

        header_text = clean_node(
            wxr,
            pos_data,
            header_nodes,
            post_template_fn=head_post_template_fn,
            collect_links=True,
            remove_anchors_from_links=True,
        )
        if "links" in pos_data:
            # WordData doesn't use `links`, so we can use `collect_links`
            # here without special handling and smuggle link data.
            extracted_links = pos_data["links"]  # type: ignore
            del pos_data["links"]  # type: ignore
        else:
            extracted_links = None
        # print(f"{header_text=}, {extracted_links=}")

        header_text = re.sub(r"\s+", " ", header_text).strip()

        if not header_text:
            return

        term_label_tags: list[str] = []
        term_label_topics: list[str] = []
        if len(term_label_templates) > 0:
            # parse term label templates; if there are other similar kinds
            # of templates in headers that you want to squash and apply as
            # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
            for templ_data in term_label_templates:
                # print(templ_data)
                expan = templ_data.get("expansion", "").strip("().,; ")
                if not expan:
                    continue
                tlb_tagsets, tlb_topics = decode_tags(expan)
                for tlb_tags in tlb_tagsets:
                    if len(tlb_tags) > 0 and not any(
                        t.startswith("error-") for t in tlb_tags
                    ):
                        term_label_tags.extend(tlb_tags)
                term_label_topics.extend(tlb_topics)
            # print(f"{tlb_tagsets=}, {tlb_topicsets=}")

        # print(f"{header_text=}")
        parse_word_head(
            wxr,
            word,
            pos_type,
            header_text,
            pos_data,
            is_reconstruction,
            header_group,
            header_nodes,
            ruby=ruby,
            links=extracted_links,
        )
        if "tags" in pos_data:
            # pos_data can get "tags" data from some source; type-checkers
            # don't like it, so let's ignore it.
            header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
            del pos_data["tags"]  # type: ignore[typeddict-item]
        if len(term_label_tags) > 0:
            header_tags.extend(term_label_tags)
        if len(term_label_topics) > 0:
            header_topics.extend(term_label_topics)

    def process_gloss_without_list(
        nodes: list[Union[WikiNode, str]],
        pos_type: str,
        pos_data: WordData,
        header_tags: list[str],
        header_topics: list[str],
    ) -> None:
        # the gloss text might not be inside a list
        header_nodes: list[Union[str, WikiNode]] = []
        gloss_nodes: list[Union[str, WikiNode]] = []
        for node in strip_nodes(nodes):
            if isinstance(node, WikiNode):
                if isinstance(node, TemplateNode):
                    if node.template_name in (
                        "zh-see",
                        "ja-see",
                        "ja-see-kango",
                    ):
                        continue  # soft redirect
                    elif (
                        node.template_name == "head"
                        or node.template_name.startswith(f"{lang_code}-")
                    ):
                        header_nodes.append(node)
                        continue
                elif node.kind in LEVEL_KINDS:  # following nodes are not gloss
                    break
            gloss_nodes.append(node)

        if len(header_nodes) > 0:
            process_gloss_header(
                header_nodes,
                pos_type,
                None,
                pos_data,
                header_tags,
                header_topics,
            )
        if len(gloss_nodes) > 0:
            process_gloss_contents(
                gloss_nodes,
                pos_type,
                {"tags": list(header_tags), "topics": list(header_topics)},
            )

    def parse_sense_node(
        node: Union[str, WikiNode],  # never receives str
        sense_base: SenseData,
        pos: str,
    ) -> bool:
        """Recursively (depth first) parse LIST_ITEM nodes for sense data.
        Uses push_sense() to attempt adding data to pos_data in the scope
        of parse_language() when it reaches deep in the recursion.
        push_sense() returns True if it succeeds, and that is bubbled up the
        stack; if a sense was added downstream, the higher levels (whose
        shared data was already added by a subsense) do not push_sense(),
        unless they have examples that need to be put somewhere.
        """
        assert isinstance(sense_base, dict)  # Added to every sense deeper in

        nonlocal sense_ordinal
        my_ordinal = sense_ordinal  # copied value, not a reference
        sense_ordinal += 1  # used only for sorting

        if not isinstance(node, WikiNode):
            # This doesn't seem to ever happen in practice.
            wxr.wtp.debug(
                "{}: parse_sense_node called with"
                "something that isn't a WikiNode".format(pos),
                sortid="page/1287/20230119",
            )
            return False

        if node.kind != NodeKind.LIST_ITEM:
            wxr.wtp.debug(
                "{}: non-list-item inside list".format(pos), sortid="page/1678"
            )
            return False

        if node.sarg == ":":
            # Skip example entries at the highest level, ones without
            # a sense ("...#") above them.
            # If node.sarg is exactly and only ":", then it's at
            # the highest level; lower levels would have more
            # "indentation", like "#:" or "##:"
            return False

        # If a recursion call succeeds in push_sense(), bubble it up with
        # `added`.
        # added |= push_sense() or added |= parse_sense_node(...) to OR.
        added = False

        gloss_template_args: set[str] = set()

        # For LISTs and LIST_ITEMs, their argument is something like
        # "##" or "##:", and using that we can rudimentarily determine
        # list 'depth' if need be, and also what kind of list or
        # entry it is; # is for normal glosses, : for examples (indent)
        # and * is used for quotations on wiktionary.
1843 current_depth = node.sarg
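# Illustrative sketch (added commentary, not in the original source):
# hypothetical sarg values under the convention described above:
#   "#"   top-level gloss item          "##"  nested subgloss item
#   "#:"  example under a gloss         "#*"  quotation under a gloss
# If a numeric depth were ever needed, len(node.sarg) would do:
#   assert len("##:") > len("#")  # the "##:" item is nested deeper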
1845 children = node.children
1847 # subentries, (presumably) a list
1848 # of subglosses below this. The list's
1849 # argument ends with #, and its depth should
1850 # be bigger than the parent node's.
1851 subentries = [
1852 x
1853 for x in children
1854 if isinstance(x, WikiNode)
1855 and x.kind == NodeKind.LIST
1856 and x.sarg == current_depth + "#"
1857 ]
1859 # sublists of examples and quotations. .sarg
1860 # does not end with "#".
1861 others = [
1862 x
1863 for x in children
1864 if isinstance(x, WikiNode)
1865 and x.kind == NodeKind.LIST
1866 and x.sarg != current_depth + "#"
1867 ]
1869 # the actual contents of this particular node.
1870 # can be a gloss (or a template that expands into
1871 # many glosses which we can't easily pre-expand)
1872 # or could be an "outer gloss" with more specific
1873 # subglosses, or could be a qualifier for the subglosses.
1874 contents = [
1875 x
1876 for x in children
1877 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1878 ]
1879 # If this entry has sublists of entries, we should combine
1880 # gloss information from both the "outer" and sublist content.
1881 # Sometimes the outer gloss
1882 # is more of a non-gloss description or tags, sometimes it is a coarse sense
1883 # and the inner glosses are more specific. The outer one
1884 # does not seem to have qualifiers.
1886 # If we have one sublist with one element, treat it
1887 # specially as it may be a Wiktionary error; raise
1888 # that nested element to the same level.
1889 # XXX If need be, this block can be easily removed in
1890 # the current recursive logic, and the result is one sense entry
1891 # with both glosses in the glosses list, as you would
1892 # expect. If the higher entry has examples, there will
1893 # be a higher entry with some duplicated data.
1894 if len(subentries) == 1:
1895 slc = subentries[0].children
1896 if len(slc) == 1:
1897 # copy current node and modify it so it doesn't
1898 # loop infinitely.
1899 cropped_node = copy.copy(node)
1900 cropped_node.children = [
1901 x
1902 for x in children
1903 if not (
1904 isinstance(x, WikiNode)
1905 and x.kind == NodeKind.LIST
1906 and x.sarg == current_depth + "#"
1907 )
1908 ]
1909 added |= parse_sense_node(cropped_node, sense_base, pos)
1910 nonlocal sense_data # this kludge causes duplicated raw_
1911 # glosses data if this is not done;
1912 # if the top-level (cropped_node)
1913 # does not push_sense() properly or
1914 # parse_sense_node() returns early,
1915 # sense_data is not reset. This happens
1916 # for example when you have a no-gloss
1917 # string like "(intransitive)":
1918 # no gloss, push_sense() returns early
1919 # and sense_data has duplicate data with
1920 # sense_base
1921 sense_data = {}
1922 added |= parse_sense_node(slc[0], sense_base, pos)
1923 return added
1925 return process_gloss_contents(
1926 contents,
1927 pos,
1928 sense_base,
1929 subentries,
1930 others,
1931 gloss_template_args,
1932 added,
1933 my_ordinal,
1934 )
1936 def process_gloss_contents(
1937 contents: list[Union[str, WikiNode]],
1938 pos: str,
1939 sense_base: SenseData,
1940 subentries: list[WikiNode] = [],
1941 others: list[WikiNode] = [],
1942 gloss_template_args: Set[str] = set(),
1943 added: bool = False,
1944 sorting_ordinal: int | None = None,
1945 ) -> bool:
1946 def sense_template_fn(
1947 name: str, ht: TemplateArgs, is_gloss: bool = False
1948 ) -> Optional[str]:
1949 # print(f"sense_template_fn: {name}, {ht}")
1950 if name in wikipedia_templates:
1951 # parse_wikipedia_template(wxr, pos_data, ht)
1952 return None
1953 if is_panel_template(wxr, name):
1954 return ""
1955 if name in INFO_TEMPLATE_FUNCS:
1956 info_data, info_exp = parse_info_template_arguments(
1957 wxr, name, ht, "sense"
1958 )
1959 if info_data or info_exp: 1959 ↛ 1965
1960 if info_data: 1960 ↛ 1962
1961 data_append(sense_base, "info_templates", info_data)
1962 if info_exp and isinstance(info_exp, str): 1962 ↛ 1964
1963 return info_exp
1964 return ""
1965 if name in ("defdate",):
1966 date = clean_node(wxr, None, ht.get(1, ()))
1967 if part_two := ht.get(2): 1967 ↛ 1969
1968 # Unicode en dash, not '-'
1969 date += "–" + clean_node(wxr, None, part_two)
1970 refs: dict[str, ReferenceData] = {}
1971 # ref, refn, ref2, ref2n, ref3, ref3n
1972 # ref1 not valid
1973 for k, v in sorted(
1974 (k, v) for k, v in ht.items() if isinstance(k, str)
1975 ):
1976 if m := re.match(r"ref(\d?)(n?)", k): 1976 ↛ 1973
1977 ref_v = clean_node(wxr, None, v)
1978 if m.group(1) not in refs: # empty string or digit
1979 refs[m.group(1)] = ReferenceData()
1980 if m.group(2):
1981 refs[m.group(1)]["refn"] = ref_v
1982 else:
1983 refs[m.group(1)]["text"] = ref_v
1984 data_append(
1985 sense_base,
1986 "attestations",
1987 AttestationData(date=date, references=list(refs.values())),
1988 )
1989 return ""
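# Illustrative sketch (hypothetical template arguments, not from the
# original source): {{defdate|from 15th c.|ref=Smith 2001|ref2n=note}}
# gives date == "from 15th c."; the ref(\d?)(n?) loop then builds
# refs == {"": {"text": "Smith 2001"}, "2": {"refn": "note"}} and
# stores AttestationData(date=..., references=list(refs.values())).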
1990 if name == "senseid":
1991 langid = clean_node(wxr, None, ht.get(1, ()))
1992 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1993 if re.match(r"Q\d+$", arg):
1994 data_append(sense_base, "wikidata", arg)
1995 data_append(sense_base, "senseid", langid + ":" + arg)
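# Illustrative sketch (hypothetical arguments): {{senseid|en|Q42}}
# yields langid == "en" and arg == "Q42"; since arg matches r"Q\d+$",
# "Q42" goes into sense_base["wikidata"] and "en:Q42" into
# sense_base["senseid"].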
1996 if name in sense_linkage_templates:
1997 # print(f"SENSE_TEMPLATE_FN: {name}")
1998 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1999 return ""
2000 if name == "†" or name == "zh-obsolete":
2001 data_append(sense_base, "tags", "obsolete")
2002 return ""
2003 if name in {
2004 "ux",
2005 "uxi",
2006 "usex",
2007 "afex",
2008 "prefixusex",
2009 "ko-usex",
2010 "ko-x",
2011 "hi-x",
2012 "ja-usex-inline",
2013 "ja-x",
2014 "quotei",
2015 "he-x",
2017 "km-x",
2018 "ne-x",
2019 "shn-x",
2020 "th-x",
2021 "ur-x",
2022 }:
2023 # Usage examples are captured separately below. We don't
2024 # want to expand them into glosses even when unusual coding
2025 # is used in the entry.
2026 # These templates may slip through inside another item, but
2027 # currently we're separating out example entries (..#:)
2028 # well enough that there seems to be very little contamination.
2029 if is_gloss:
2030 wxr.wtp.wiki_notice(
2031 "Example template is used for gloss text",
2032 sortid="extractor.en.page.sense_template_fn/1415",
2033 )
2034 else:
2035 return ""
2036 if name == "w": 2036 ↛ 2037
2037 if ht.get(2) == "Wp":
2038 return ""
2039 for v in ht.values():
2040 v = v.strip()
2041 if v and "<" not in v:
2042 gloss_template_args.add(v)
2043 return None
2045 def extract_link_texts(item: GeneralNode) -> None:
2046 """Recursively extracts link texts from the gloss source. This
2047 information is used to select whether to remove final "." from
2048 form_of/alt_of (e.g., ihm/Hunsrik)."""
2049 if isinstance(item, (list, tuple)):
2050 for x in item:
2051 extract_link_texts(x)
2052 return
2053 if isinstance(item, str):
2054 # There seem to be HTML sections that may further contain
2055 # unparsed links.
2056 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2056 ↛ 2057
2057 print("ITER:", m.group(0))
2058 v = m.group(1).split("|")[-1].strip()
2059 if v:
2060 gloss_template_args.add(v)
2061 return
2062 if not isinstance(item, WikiNode): 2062 ↛ 2063
2063 return
2064 if item.kind == NodeKind.LINK:
2065 v = item.largs[-1]
2066 if ( 2066 ↛ 2072
2067 isinstance(v, list)
2068 and len(v) == 1
2069 and isinstance(v[0], str)
2070 ):
2071 gloss_template_args.add(v[0].strip())
2072 for x in item.children:
2073 extract_link_texts(x)
2075 extract_link_texts(contents)
2077 # get the raw text of non-list contents of this node, and other stuff
2078 # like tag and category data added to sense_base
2079 # cast = no-op type-setter for the type-checker
2080 partial_template_fn = cast(
2081 TemplateFnCallable,
2082 partial(sense_template_fn, is_gloss=True),
2083 )
2084 rawgloss = clean_node(
2085 wxr,
2086 sense_base,
2087 contents,
2088 template_fn=partial_template_fn,
2089 collect_links=True,
2090 )
2092 if not rawgloss: 2092 ↛ 2093
2093 return False
2095 # remove manually typed ordered list text at the start ("1. ")
2096 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2098 # get stuff like synonyms and categories from "others",
2099 # maybe examples and quotations
2100 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2102 # The gloss could contain templates that produce more list items.
2103 # This happens commonly with, e.g., {{inflection of|...}}. Split
2104 # to parts. However, e.g. Interlingua generates multiple glosses
2105 # in HTML directly without Wikitext markup, so we must also split
2106 # by just newlines.
2107 subglosses = rawgloss.splitlines()
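# Illustrative sketch (hypothetical input, not from the original
# source): a template-expanded rawgloss such as
#   "first-person singular present of foo\nthird-person ... of foo"
# splits into one subgloss per line here, even though no "#" list
# markup is present.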
2109 if len(subglosses) == 0: 2109 ↛ 2110
2110 return False
2112 if any(s.startswith("#") for s in subglosses):
2113 subtree = wxr.wtp.parse(rawgloss)
2114 # from wikitextprocessor.parser import print_tree
2115 # print("SUBTREE GENERATED BY TEMPLATE:")
2116 # print_tree(subtree)
2117 new_subentries = [
2118 x
2119 for x in subtree.children
2120 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2121 ]
2123 new_others = [
2124 x
2125 for x in subtree.children
2126 if isinstance(x, WikiNode)
2127 and x.kind == NodeKind.LIST
2128 and not x.sarg.endswith("#")
2129 ]
2131 new_contents = [
2132 clean_node(wxr, [], x)
2133 for x in subtree.children
2134 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2135 ]
2137 subentries = subentries or new_subentries
2138 others = others or new_others
2139 subglosses = new_contents
2140 rawgloss = "".join(subglosses)
2141 # Generate no gloss for translation hub pages, but add the
2142 # "translation-hub" tag for them
2143 if rawgloss == "(This entry is a translation hub.)": 2143 ↛ 2144
2144 data_append(sense_data, "tags", "translation-hub")
2145 return push_sense(sorting_ordinal)
2147 # Remove certain substrings specific to outer glosses
2148 strip_ends = [", particularly:"]
2149 for x in strip_ends:
2150 if rawgloss.endswith(x):
2151 rawgloss = rawgloss[: -len(x)].strip()
2152 break
2154 # A single gloss, or possibly an outer gloss.
2155 # Check if the possible outer gloss starts with
2156 # parenthesized tags/topics
2158 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2159 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2160 m = QUALIFIERS_RE.match(rawgloss)
2161 # (...): ... or (...(...)...): ...
2162 if m:
2163 q = m.group(1)
2164 rawgloss = rawgloss[m.end() :].strip()
2165 parse_sense_qualifier(wxr, q, sense_base)
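# Illustrative sketch (assuming QUALIFIERS_RE, defined elsewhere in
# this module, captures a leading parenthesized qualifier as group 1):
#   rawgloss = "(transitive, slang) to swipe something"
# would give q == "transitive, slang" and leave
#   rawgloss == "to swipe something",
# with the qualifier turned into tags/topics by parse_sense_qualifier().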
2166 if rawgloss == "A pejorative:": 2166 ↛ 2167
2167 data_append(sense_base, "tags", "pejorative")
2168 rawgloss = ""
2169 elif rawgloss == "Short forms.": 2169 ↛ 2170
2170 data_append(sense_base, "tags", "abbreviation")
2171 rawgloss = ""
2172 elif rawgloss == "Technical or specialized senses.": 2172 ↛ 2173
2173 rawgloss = ""
2174 elif rawgloss.startswith("inflection of "):
2175 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2176 if parsed is not None: 2176 ↛ 2185
2177 tags, origins = parsed
2178 if origins is not None: 2178 ↛ 2180
2179 data_extend(sense_base, "form_of", origins)
2180 if tags is not None: 2180 ↛ 2183
2181 data_extend(sense_base, "tags", tags)
2182 else:
2183 data_append(sense_base, "tags", "form-of")
2184 else:
2185 data_append(sense_base, "tags", "form-of")
2186 if rawgloss: 2186 ↛ 2217
2187 # Code duplicating a lot of clean-up operations from later in
2188 # this block. We want to clean up the "supergloss" as much as
2189 # possible, in almost the same way as a normal gloss.
2190 supergloss = rawgloss
2192 if supergloss.startswith("; "): 2192 ↛ 2193
2193 supergloss = supergloss[1:].strip()
2195 if supergloss.startswith(("^†", "†")):
2196 data_append(sense_base, "tags", "obsolete")
2197 supergloss = supergloss[2:].strip()
2198 elif supergloss.startswith("^‡"): 2198 ↛ 2199
2199 data_extend(sense_base, "tags", ["obsolete", "historical"])
2200 supergloss = supergloss[2:].strip()
2202 # remove [14th century...] style brackets at the end
2203 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2205 if supergloss.startswith((",", ":")):
2206 supergloss = supergloss[1:]
2207 supergloss = supergloss.strip()
2208 if supergloss.startswith("N. of "): 2208 ↛ 2209
2209 supergloss = "Name of " + supergloss[6:]
2210 supergloss = supergloss[2:]
2211 data_append(sense_base, "glosses", supergloss)
2212 if supergloss in ("A person:",):
2213 data_append(sense_base, "tags", "g-person")
2215 # The main recursive call (except for the exceptions at the
2216 # start of this function).
2217 for sublist in subentries:
2218 if not ( 2218 ↛ 2221
2219 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2220 ):
2221 wxr.wtp.debug(
2222 f"'{repr(rawgloss[:20])}.' gloss has `subentries` "
2223 f"with items that are not LISTs",
2224 sortid="page/1511/20230119",
2225 )
2226 continue
2227 for item in sublist.children:
2228 if not ( 2228 ↛ 2232
2229 isinstance(item, WikiNode)
2230 and item.kind == NodeKind.LIST_ITEM
2231 ):
2232 continue
2233 # copy sense_base to prevent cross-contamination between
2234 # subglosses and other subglosses and superglosses
2235 sense_base2 = copy.deepcopy(sense_base)
2236 if parse_sense_node(item, sense_base2, pos): 2236 ↛ 2227
2237 added = True
2239 # Capture examples.
2240 # This is called after the recursive calls above so that
2241 # sense_base is not contaminated with meta-data from
2242 # example entries for *this* gloss.
2243 examples = []
2244 if wxr.config.capture_examples: 2244 ↛ 2248
2245 examples = extract_examples(others, sense_base)
2247 # push_sense() succeeded somewhere down-river, so skip this level
2248 if added:
2249 if examples:
2250 # this higher-up gloss has examples that we do not want to skip
2251 wxr.wtp.debug(
2252 "'{}[...]' gloss has examples we want to keep, "
2253 "but there are subglosses.".format(repr(rawgloss[:30])),
2254 sortid="page/1498/20230118",
2255 )
2256 else:
2257 return True
2259 # Some entries, e.g., "iacebam", have weird sentences in quotes
2260 # after the gloss, but these sentences don't seem to be intended
2261 # as glosses. Skip them.
2262 indexed_subglosses = list(
2263 (i, gl)
2264 for i, gl in enumerate(subglosses)
2265 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2266 )
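# Illustrative sketch (hypothetical input): with
#   subglosses = ["I lay.", '(with "diu") "I was lying down."']
# the filter above keeps index 0 and drops the quoted sentence, which
# matches r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$'.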
2268 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2268 ↛ 2269
2269 gl = indexed_subglosses[0][1].strip()
2270 if gl.endswith(":"):
2271 gl = gl[:-1].strip()
2272 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2273 if parsed is not None:
2274 infl_tags, infl_dts = parsed
2275 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2276 # Interpret others as a particular form under
2277 # "inflection of"
2278 data_extend(sense_base, "tags", infl_tags)
2279 data_extend(sense_base, "form_of", infl_dts)
2280 indexed_subglosses = indexed_subglosses[1:]
2281 elif not infl_dts:
2282 data_extend(sense_base, "tags", infl_tags)
2283 indexed_subglosses = indexed_subglosses[1:]
2285 # Create senses for remaining subglosses
2286 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2287 gloss = gloss.strip()
2288 if not gloss and len(indexed_subglosses) > 1: 2288 ↛ 2289
2289 continue
2290 # Push a new sense (if the last one is not empty)
2291 if push_sense(sorting_ordinal): 2291 ↛ 2292
2292 added = True
2293 # if gloss not in sense_data.get("raw_glosses", ()):
2294 # data_append(sense_data, "raw_glosses", gloss)
2295 if i == 0 and examples:
2296 # In a multi-line gloss, associate examples
2297 # with only one of them.
2298 # XXX or you could use gloss_i == len(indexed_subglosses)
2299 # to associate examples with the *last* one.
2300 data_extend(sense_data, "examples", examples)
2301 if gloss.startswith("; ") and gloss_i > 0: 2301 ↛ 2302
2302 gloss = gloss[1:].strip()
2303 # If the gloss starts with †, mark as obsolete
2304 if gloss.startswith("^†"): 2304 ↛ 2305
2305 data_append(sense_data, "tags", "obsolete")
2306 gloss = gloss[2:].strip()
2307 elif gloss.startswith("^‡"): 2307 ↛ 2308
2308 data_extend(sense_data, "tags", ["obsolete", "historical"])
2309 gloss = gloss[2:].strip()
2310 # Copy data for all senses to this sense
2311 for k, v in sense_base.items():
2312 if isinstance(v, (list, tuple)):
2313 if k != "tags":
2314 # Tags handled below (countable/uncountable special)
2315 data_extend(sense_data, k, v)
2316 else:
2317 assert k not in ("tags", "categories", "topics")
2318 sense_data[k] = v # type:ignore[literal-required]
2319 # Parse the gloss for this particular sense
2320 m = QUALIFIERS_RE.match(gloss)
2321 # (...): ... or (...(...)...): ...
2322 if m:
2323 parse_sense_qualifier(wxr, m.group(1), sense_data)
2324 gloss = gloss[m.end() :].strip()
2326 # Remove common suffix "[from 14th c.]" and similar
2327 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
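# Illustrative sketch (hypothetical gloss): the substitution above
# turns "a ship [from 14th c.]" into "a ship".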
2329 # Check to make sure we don't have unhandled list items in gloss
2330 ofs = max(gloss.find("#"), gloss.find("* "))
2331 if ofs > 10 and "(#)" not in gloss:
2332 wxr.wtp.debug(
2333 "gloss may contain unhandled list items: {}".format(gloss),
2334 sortid="page/1412",
2335 )
2336 elif "\n" in gloss: 2336 ↛ 2337
2337 wxr.wtp.debug(
2338 "gloss contains newline: {}".format(gloss),
2339 sortid="page/1416",
2340 )
2342 # Kludge: some glosses have a comma after initial qualifiers in
2343 # parentheses
2344 if gloss.startswith((",", ":")):
2345 gloss = gloss[1:]
2346 gloss = gloss.strip()
2347 if gloss.endswith(":"):
2348 gloss = gloss[:-1].strip()
2349 if gloss.startswith("N. of "): 2349 ↛ 2350
2350 gloss = "Name of " + gloss[6:]
2351 if gloss.startswith("†"): 2351 ↛ 2352
2352 data_append(sense_data, "tags", "obsolete")
2353 gloss = gloss[1:]
2354 elif gloss.startswith("^†"): 2354 ↛ 2355
2355 data_append(sense_data, "tags", "obsolete")
2356 gloss = gloss[2:]
2358 # Copy tags from sense_base if any. This will not copy
2359 # countable/uncountable if either was specified in the sense,
2360 # as sometimes both are specified in word head but only one
2361 # in individual senses.
2362 countability_tags = []
2363 base_tags = sense_base.get("tags", ())
2364 sense_tags = sense_data.get("tags", ())
2365 for tag in base_tags:
2366 if tag in ("countable", "uncountable"):
2367 if tag not in countability_tags: 2367 ↛ 2369
2368 countability_tags.append(tag)
2369 continue
2370 if tag not in sense_tags:
2371 data_append(sense_data, "tags", tag)
2372 if countability_tags:
2373 if ( 2373 ↛ 2382
2374 "countable" not in sense_tags
2375 and "uncountable" not in sense_tags
2376 ):
2377 data_extend(sense_data, "tags", countability_tags)
2379 # If outer gloss specifies a form-of ("inflection of", see
2380 # aquamarine/German), try to parse the inner glosses as
2381 # tags for an inflected form.
2382 if "form-of" in sense_base.get("tags", ()):
2383 parsed = parse_alt_or_inflection_of(
2384 wxr, gloss, gloss_template_args
2385 )
2386 if parsed is not None: 2386 ↛ 2392
2387 infl_tags, infl_dts = parsed
2388 if not infl_dts and infl_tags: 2388 ↛ 2392
2389 # Interpret as a particular form under "inflection of"
2390 data_extend(sense_data, "tags", infl_tags)
2392 if not gloss: 2392 ↛ 2393
2393 data_append(sense_data, "tags", "empty-gloss")
2394 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2395 if ( 2395 ↛ 2406
2396 gloss_i == 0
2397 and len(sense_data.get("glosses", tuple())) >= 1
2398 ):
2399 # If we added a "high-level gloss" from rawgloss, but this
2400 # is that same gloss_i, add this instead of the raw_gloss
2401 # from before if they're different: the rawgloss was not
2402 # cleaned exactly the same as this later gloss
2403 sense_data["glosses"][-1] = gloss
2404 else:
2405 # Add the gloss for the sense.
2406 data_append(sense_data, "glosses", gloss)
2408 # Kludge: there are cases (e.g., etc./Swedish) where there are
2409 # two abbreviations in the same sense, both generated by the
2410 # {{abbreviation of|...}} template. Handle these with some magic.
2411 position = 0
2412 split_glosses = []
2413 for m in re.finditer(r"Abbreviation of ", gloss):
2414 if m.start() != position: 2414 ↛ 2413line 2414 didn't jump to line 2413 because the condition on line 2414 was always true
2415 split_glosses.append(gloss[position : m.start()])
2416 position = m.start()
2417 split_glosses.append(gloss[position:])
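# Illustrative sketch (hypothetical gloss): for
#   gloss = "Abbreviation of foo. Abbreviation of bar."
# the loop above produces
#   split_glosses == ["Abbreviation of foo. ", "Abbreviation of bar."]
# so each abbreviation is parsed separately as alt-of/form-of below.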
2418 for gloss in split_glosses:
2419 # Check if this gloss describes an alt-of or inflection-of
2420 if (
2421 lang_code != "en"
2422 and " " not in gloss
2423 and distw([word], gloss) < 0.3
2424 ):
2425 # Don't try to parse gloss if it is one word
2426 # that is close to the word itself for non-English words
2427 # (probable translations of a tag/form name)
2428 continue
2429 parsed = parse_alt_or_inflection_of(
2430 wxr, gloss, gloss_template_args
2431 )
2432 if parsed is None:
2433 continue
2434 tags, dts = parsed
2435 if not dts and tags:
2436 data_extend(sense_data, "tags", tags)
2437 continue
2438 for dt in dts: # type:ignore[union-attr]
2439 ftags = list(tag for tag in tags if tag != "form-of")
2440 if "alt-of" in tags:
2441 data_extend(sense_data, "tags", ftags)
2442 data_append(sense_data, "alt_of", dt)
2443 elif "compound-of" in tags: 2443 ↛ 2444
2444 data_extend(sense_data, "tags", ftags)
2445 data_append(sense_data, "compound_of", dt)
2446 elif "synonym-of" in tags: 2446 ↛ 2447
2447 data_extend(dt, "tags", ftags)
2448 data_append(sense_data, "synonyms", dt)
2449 elif tags and dt.get("word", "").startswith("of "): 2449 ↛ 2450
2450 dt["word"] = dt["word"][3:]
2451 data_append(sense_data, "tags", "form-of")
2452 data_extend(sense_data, "tags", ftags)
2453 data_append(sense_data, "form_of", dt)
2454 elif "form-of" in tags: 2454 ↛ 2438
2455 data_extend(sense_data, "tags", tags)
2456 data_append(sense_data, "form_of", dt)
2458 if len(sense_data) == 0:
2459 if len(sense_base.get("tags", [])) == 0: 2459 ↛ 2461
2460 del sense_base["tags"]
2461 sense_data.update(sense_base)
2462 if push_sense(sorting_ordinal): 2462 ↛ 2466
2463 # push_sense succeeded in adding a sense to pos_data
2464 added = True
2465 # print("PARSE_SENSE DONE:", pos_datas[-1])
2466 return added
2468 def parse_inflection(
2469 node: WikiNode, section: str, pos: Optional[str]
2470 ) -> None:
2471 """Parses inflection data (declension, conjugation) from the given
2472 page. This retrieves the actual inflection template
2473 parameters, which are very useful for applications that need
2474 to learn the inflection classes and generate inflected
2475 forms."""
2476 assert isinstance(node, WikiNode)
2477 assert isinstance(section, str)
2478 assert pos is None or isinstance(pos, str)
2479 # print("parse_inflection:", node)
2481 if pos is None: 2481 ↛ 2482
2482 wxr.wtp.debug(
2483 "inflection table outside part-of-speech", sortid="page/1812"
2484 )
2485 return
2487 def inflection_template_fn(
2488 name: str, ht: TemplateArgs
2489 ) -> Optional[str]:
2490 # print("decl_conj_template_fn", name, ht)
2491 if is_panel_template(wxr, name): 2491 ↛ 2492
2492 return ""
2493 if name in ("is-u-mutation",): 2493 ↛ 2496
2494 # These are not to be captured as an exception to the
2495 # generic code below
2496 return None
2497 m = re.search(
2498 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2499 r"declension|inflection|mut|mutation)($|-)",
2500 name,
2501 )
2502 if m:
2503 args_ht = clean_template_args(wxr, ht)
2504 dt = {"name": name, "args": args_ht}
2505 data_append(pos_data, "inflection_templates", dt)
2507 return None
2509 # Convert the subtree back to Wikitext, then expand all and parse,
2510 # capturing templates in the process
2511 text = wxr.wtp.node_to_wikitext(node.children)
2513 # Split text into separate sections for each top-level template
2514 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2515 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2516 # The (?:...) creates a non-capturing regex group; if it was capturing,
2517 # like the group around it, it would create elements in brace_matches,
2518 # including None if it doesn't match.
2519 # 20250114: Added {| and |} into the regex because tables were being
2520 # cut into pieces by this code. Issue #973, introduction of two-part
2521 # book-end templates similar to trans-top and trans-bottom.
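# Illustrative sketch (hypothetical wikitext): re.split with the
# capturing group above keeps the delimiters, so
#   "intro {{foo|x}} tail\n{| table\n|}\n"
# becomes
#   ["intro ", "{{", "foo|x", "}}", " tail", "\n{|", " table",
#    "\n|}", "\n"]
# which the loop below stitches back into per-template sections.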
2522 template_sections = []
2523 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2524 # Because there is the possibility of triple curly braces
2525 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2526 # count nesting depth using pairs of two brackets, but
2527 # instead use singular braces ("{ }").
2528 # Because template delimiters should be balanced, regardless
2529 # of whether {{ or {{{ is used, and because we only care
2530 # about the outer-most delimiters (the highest level template)
2531 # we can just count the single braces when those single
2532 # braces are part of a group.
2533 table_nesting = 0
2534 # However, a stray table ({| ... |}) should always be its own
2535 # section, and should prevent templates from cutting it into
2536 # sections.
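# Illustrative sketch (hypothetical tokens): the regex groups brace
# runs, so "{{{" and "}}}" each arrive as a single token and the
# nesting counters stay balanced:
#   depth = 0
#   for tok in ("{{{", "}}}"):  # e.g. from a "{{{param}}}" default
#       depth += 1 if tok.startswith("{") else -1
#   # depth == 0 -> back at the top level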
2538 # print(f"Parse inflection: {text=}")
2539 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2540 if len(brace_matches) > 1:
2541 tsection: list[str] = []
2542 after_templates = False # kludge to keep any text
2543 # before first template
2544 # with the first template;
2545 # otherwise, text
2546 # goes with preceding template
2547 for m in brace_matches:
2548 if m.startswith("\n; ") and after_templates: 2548 ↛ 2549
2549 after_templates = False
2550 template_sections.append(tsection)
2551 tsection = []
2552 tsection.append(m)
2553 elif m.startswith("{{") or m.endswith("{|"):
2554 if (
2555 template_nesting == 0
2556 and after_templates
2557 and table_nesting == 0
2558 ):
2559 template_sections.append(tsection)
2560 tsection = []
2561 # start new section
2562 after_templates = True
2563 if m.startswith("{{"):
2564 template_nesting += 1
2565 else:
2566 # m.endswith("{|")
2567 table_nesting += 1
2568 tsection.append(m)
2569 elif m.startswith("}}") or m.endswith("|}"):
2570 if m.startswith("}}"):
2571 template_nesting -= 1
2572 if template_nesting < 0: 2572 ↛ 2573
2573 wxr.wtp.error(
2574 "Negatively nested braces, "
2575 "couldn't split inflection templates, "
2576 "{}/{} section {}".format(
2577 word, language, section
2578 ),
2579 sortid="page/1871",
2580 )
2581 template_sections = [] # use whole text
2582 break
2583 else:
2584 table_nesting -= 1
2585 if table_nesting < 0: 2585 ↛ 2586
2586 wxr.wtp.error(
2587 "Negatively nested table braces, "
2588 "couldn't split inflection section, "
2589 "{}/{} section {}".format(
2590 word, language, section
2591 ),
2592 sortid="page/20250114",
2593 )
2594 template_sections = [] # use whole text
2595 break
2596 tsection.append(m)
2597 else:
2598 tsection.append(m)
2599 if tsection: # dangling tsection 2599 ↛ 2607
2600 template_sections.append(tsection)
2601 # Why do it this way around? The parser has a preference
2602 # to associate bits outside of tables with the preceding
2603 # table (`after`-variable), so a new tsection begins
2604 # at {{ and everything before it belongs to the previous
2605 # template.
2607 texts = []
2608 if not template_sections:
2609 texts = [text]
2610 else:
2611 for tsection in template_sections:
2612 texts.append("".join(tsection))
2613 if template_nesting != 0: 2613 ↛ 2614
2614 wxr.wtp.error(
2615 "Template nesting error: "
2616 "template_nesting = {} "
2617 "couldn't split inflection templates, "
2618 "{}/{} section {}".format(
2619 template_nesting, word, language, section
2620 ),
2621 sortid="page/1896",
2622 )
2623 texts = [text]
2624 for text in texts:
2625 tree = wxr.wtp.parse(
2626 text, expand_all=True, template_fn=inflection_template_fn
2627 )
2629 if not text.strip():
2630 continue
2632 # Parse inflection tables from the section. The data is stored
2633 # under "forms".
2634 if wxr.config.capture_inflections: 2634 ↛ 2624
2635 tablecontext = None
2636 m = re.search(r"{{([^}{|]+)\|?", text)
2637 if m:
2638 template_name = m.group(1).strip()
2639 tablecontext = TableContext(template_name)
2641 parse_inflection_section(
2642 wxr,
2643 pos_data,
2644 word,
2645 language,
2646 pos,
2647 section,
2648 tree,
2649 tablecontext=tablecontext,
2650 )
2652 def get_subpage_section(
2653 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2654 ) -> Optional[Union[WikiNode, str]]:
2655 """Loads a subpage of the given page, and finds the section
2656 for the given language, part-of-speech, and section title. This
2657 is used for finding translations and other sections on subpages."""
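# Illustrative sketch (hypothetical arguments): for the page "foo",
#   get_subpage_section("foo", "translations",
#                       [["English", "Noun", "Translations"]])
# loads "foo/translations" and walks its heading tree, matching each
# title in the sequence case-insensitively until the Translations
# level node is reached.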
2658 assert isinstance(language, str)
2659 assert isinstance(title, str)
2660 assert isinstance(subtitle, str)
2661 assert isinstance(seqs, (list, tuple))
2662 for seq in seqs:
2663 for x in seq:
2664 assert isinstance(x, str)
2665 subpage_title = word + "/" + subtitle
2666 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2667 if subpage_content is None:
2668 wxr.wtp.error(
2669 "/translations not found despite "
2670 "{{see translation subpage|...}}",
2671 sortid="page/1934",
2672 )
2673 return None
2675 def recurse(
2676 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2677 ) -> Optional[Union[str, WikiNode]]:
2678 # print(f"seq: {seq}")
2679 if not seq:
2680 return node
2681 if not isinstance(node, WikiNode):
2682 return None
2683 # print(f"node.kind: {node.kind}")
2684 if node.kind in LEVEL_KINDS:
2685 t = clean_node(wxr, None, node.largs[0])
2686 # print(f"t: {t} == seq[0]: {seq[0]}?")
2687 if t.lower() == seq[0].lower():
2688 seq = seq[1:]
2689 if not seq:
2690 return node
2691 for n in node.children:
2692 ret = recurse(n, seq)
2693 if ret is not None:
2694 return ret
2695 return None
2697 tree = wxr.wtp.parse(
2698 subpage_content,
2699 pre_expand=True,
2700 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2701 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2702 )
2703 assert tree.kind == NodeKind.ROOT
2704 for seq in seqs:
2705 ret = recurse(tree, seq)
2706 if ret is None:
2707 wxr.wtp.debug(
2708 "Failed to find subpage section {}/{} seq {}".format(
2709 title, subtitle, seq
2710 ),
2711 sortid="page/1963",
2712 )
2713 return ret
2715 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2716 """Parses translations for a word. This may also pull in translations
2717 from separate translation subpages."""
2718 assert isinstance(data, dict)
2719 assert isinstance(xlatnode, WikiNode)
2720 # print("===== PARSE_TRANSLATIONS {} {} {}"
2721 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2722 # print("parse_translations xlatnode={}".format(xlatnode))
2723 if not wxr.config.capture_translations: 2723 ↛ 2724
2724 return
2725 sense_parts: list[Union[WikiNode, str]] = []
2726 sense: Optional[str] = None
2728 def parse_translation_item(
2729 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2730 ) -> None:
2731 nonlocal sense
2732 assert isinstance(contents, list)
2733 assert lang is None or isinstance(lang, str)
2734 # print("PARSE_TRANSLATION_ITEM:", contents)
2736 langcode: Optional[str] = None
2737 if sense is None:
2738 sense = clean_node(wxr, data, sense_parts).strip()
2739 # print("sense <- clean_node: ", sense)
2740 idx = sense.find("See also translations at")
2741 if idx > 0: 2741 ↛ 2742
2742 wxr.wtp.debug(
2743 "Skipping translation see also: {}".format(sense),
2744 sortid="page/2361",
2745 )
2746 sense = sense[:idx].strip()
2747 if sense.endswith(":"): 2747 ↛ 2748
2748 sense = sense[:-1].strip()
2749 if sense.endswith("—"): 2749 ↛ 2750
2750 sense = sense[:-1].strip()
2751 translations_from_template: list[str] = []
2753 def translation_item_template_fn(
2754 name: str, ht: TemplateArgs
2755 ) -> Optional[str]:
2756 nonlocal langcode
2757 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2758 if is_panel_template(wxr, name):
2759 return ""
2760 if name in ("t+check", "t-check", "t-needed"):
2761 # We ignore these templates. They seem to have outright
2762 # garbage in some entries, and widely varying formatting in
2763 # others. These should be transitory and unreliable
2764 # anyway.
2765 return "__IGNORE__"
2766 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2767 code = ht.get(1)
2768 if code: 2768 ↛ 2778
2769 if langcode and code != langcode:
2770 wxr.wtp.debug(
2771 "inconsistent language codes {} vs "
2772 "{} in translation item: {!r} {}".format(
2773 langcode, code, name, ht
2774 ),
2775 sortid="page/2386",
2776 )
2777 langcode = code
2778 tr = ht.get(2)
2779 if tr:
2780 tr = clean_node(wxr, None, [tr])
2781 translations_from_template.append(tr)
2782 return None
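# Illustrative sketch (hypothetical template): a list item containing
#   {{t+|fr|chien|m}}
# passes through here with langcode set to "fr" and
# translations_from_template == ["chien"]; the cleaned item text is
# later handed to parse_translation_item_text().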
2783 if name == "t-egy":
2784 langcode = "egy"
2785 return None
2786 if name == "ttbc":
2787 code = ht.get(1)
2788 if code: 2788 ↛ 2790
2789 langcode = code
2790 return None
2791 if name == "trans-see": 2791 ↛ 2792
2792 wxr.wtp.error(
2793 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2794 )
2795 return ""
2796 if name.endswith("-top"): 2796 ↛ 2797
2797 return ""
2798 if name.endswith("-bottom"): 2798 ↛ 2799
2799 return ""
2800 if name.endswith("-mid"): 2800 ↛ 2801
2801 return ""
2802 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2803 # .format(name),
2804 # sortid="page/2414")
2805 return None
2807 sublists = list(
2808 x
2809 for x in contents
2810 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2811 )
2812 contents = list(
2813 x
2814 for x in contents
2815 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2816 )
2818 item = clean_node(
2819 wxr, data, contents, template_fn=translation_item_template_fn
2820 )
2821 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2823 # Parse the translation item.
2824 if item: 2824 ↛ exit
2825 lang = parse_translation_item_text(
2826 wxr,
2827 word,
2828 data,
2829 item,
2830 sense,
2831 lang,
2832 langcode,
2833 translations_from_template,
2834 is_reconstruction,
2835 )
2837 # Handle sublists. They are frequently used for different
2838 # scripts for the language and different variants of the
2839 # language. We will include the lower-level header as a
2840 # tag in those cases.
2841 for listnode in sublists:
2842 assert listnode.kind == NodeKind.LIST
2843 for node in listnode.children:
2844 if not isinstance(node, WikiNode): 2844 ↛ 2845
2845 continue
2846 if node.kind == NodeKind.LIST_ITEM: 2846 ↛ 2843
2847 parse_translation_item(node.children, lang=lang)
2849 def parse_translation_template(node: WikiNode) -> None:
2850 assert isinstance(node, WikiNode)
2852 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2853 nonlocal sense_parts
2854 nonlocal sense
2855 if is_panel_template(wxr, name):
2856 return ""
2857 if name == "see also":
2858 # XXX capture
2859 # XXX for example, "/" has top-level list containing
2860 # see also items. So we should also parse those.
2861 return ""
2862 if name == "trans-see":
2863 # XXX capture
2864 return ""
2865 if name == "see translation subpage": 2865 ↛ 2866
2866 sense_parts = []
2867 sense = None
2868 sub = ht.get(1, "")
2869 if sub:
2870 m = re.match(
2871 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2872 )
2873 else:
2874 m = None
2875 etym = ""
2876 etym_numbered = ""
2877 pos = ""
2878 if m:
2879 etym_numbered = m.group(1)
2880 etym = m.group(2)
2881 pos = m.group(3)
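# Illustrative sketch (hypothetical argument): with
#   {{see translation subpage|Etymology 2: Noun}}
# the regex above yields etym_numbered == "Etymology 2",
# etym == "Etymology " (trailing space, stripped below) and
# pos == "Noun", selecting a lookup sequence like
#   [language, "Etymology 2", "Noun", TRANSLATIONS_TITLE].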
2882 if not sub:
2883 wxr.wtp.debug(
2884 "no part-of-speech in "
2885 "{{see translation subpage|...}}, "
2886 "defaulting to just wxr.wtp.section "
2887 "(= language)",
2888 sortid="page/2468",
2889 )
2890 # seq sent to get_subpage_section without sub and pos
2891 seq = [
2892 language,
2893 TRANSLATIONS_TITLE,
2894 ]
2895 elif (
2896 m
2897 and etym.lower().strip() in ETYMOLOGY_TITLES
2898 and pos.lower() in POS_TITLES
2899 ):
2900 seq = [
2901 language,
2902 etym_numbered,
2903 pos,
2904 TRANSLATIONS_TITLE,
2905 ]
2906 elif sub.lower() in POS_TITLES:
2907 # seq with sub but not pos
2908 seq = [
2909 language,
2910 sub,
2911 TRANSLATIONS_TITLE,
2912 ]
2913 else:
2914 # seq with sub and pos
2915 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2916 if pos.lower() not in POS_TITLES:
2917 wxr.wtp.debug(
2918 "unhandled see translation subpage: "
2919 "language={} sub={} "
2920 "wxr.wtp.subsection={}".format(
2921 language, sub, wxr.wtp.subsection
2922 ),
2923 sortid="page/2478",
2924 )
2925 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2926 subnode = get_subpage_section(
2927 wxr.wtp.title or "MISSING_TITLE",
2928 TRANSLATIONS_TITLE,
2929 [seq],
2930 )
2931 if subnode is None or not isinstance(subnode, WikiNode):
2932 # Failed to find the normal subpage section
2933 # seq with sub and pos
2934 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2935 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2936 seqs: list[list[str] | tuple[str, ...]] = [
2937 [TRANSLATIONS_TITLE],
2938 [language, pos],
2939 ]
2940 subnode = get_subpage_section(
2941 wxr.wtp.title or "MISSING_TITLE",
2942 TRANSLATIONS_TITLE,
2943 seqs,
2944 )
2945 if subnode is not None and isinstance(subnode, WikiNode):
2946 parse_translations(data, subnode)
2947 return ""
2948 if name in (
2949 "c",
2950 "C",
2951 "categorize",
2952 "cat",
2953 "catlangname",
2954 "topics",
2955 "top",
2956 "qualifier",
2957 "cln",
2958 ):
2959 # These are expanded in the default way
2960 return None
2961 if name in (
2962 "trans-top",
2963 "trans-top-see",
2964 ):
2965 # XXX capture id from trans-top? Capture sense here
2966 # instead of trying to parse it from expanded content?
2967 if ht.get(1):
2968 sense_parts = []
2969 sense = ht.get(1)
2970 else:
2971 sense_parts = []
2972 sense = None
2973 return None
2974 if name in (
2975 "trans-bottom",
2976 "trans-mid",
2977 "checktrans-mid",
2978 "checktrans-bottom",
2979 ):
2980 return None
2981 if name == "checktrans-top":
2982 sense_parts = []
2983 sense = None
2984 return ""
2985 if name == "trans-top-also":
2986 # XXX capture?
2987 sense_parts = []
2988 sense = None
2989 return ""
2990 wxr.wtp.error(
2991 "UNIMPLEMENTED parse_translation_template: {} {}".format(
2992 name, ht
2993 ),
2994 sortid="page/2517",
2995 )
2996 return ""
2998 wxr.wtp.expand(
2999 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3000 )
3002 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3003 nonlocal sense
3004 nonlocal sense_parts
3005 for node in xlatnode.children:
3006 # print(node)
3007 if isinstance(node, str):
3008 if sense:
3009 if not node.isspace():
3010 wxr.wtp.debug(
3011 "skipping string in the middle of "
3012 "translations: {}".format(node),
3013 sortid="page/2530",
3014 )
3015 continue
3016 # Add a part to the sense
3017 sense_parts.append(node)
3018 sense = None
3019 continue
3020 assert isinstance(node, WikiNode)
3021 kind = node.kind
3022 if kind == NodeKind.LIST:
3023 for item in node.children:
3024 if not isinstance(item, WikiNode): 3024 ↛ 3025
3025 continue
3026 if item.kind != NodeKind.LIST_ITEM: 3026 ↛ 3027
3027 continue
3028 if item.sarg == ":": 3028 ↛ 3029
3029 continue
3030 parse_translation_item(item.children)
3031 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3031 ↛ 3035
3032 # Silently skip list items that are just indented; these
3033 # are used for text between translations, such as indicating
3034 # translations that need to be checked.
3035 pass
3036 elif kind == NodeKind.TEMPLATE:
3037 parse_translation_template(node)
3038 elif kind in ( 3038 ↛ 3043
3039 NodeKind.TABLE,
3040 NodeKind.TABLE_ROW,
3041 NodeKind.TABLE_CELL,
3042 ):
3043 parse_translation_recurse(node)
3044 elif kind == NodeKind.HTML:
3045 if node.attrs.get("class") == "NavFrame": 3045 ↛ 3051
3046 # Reset ``sense_parts`` (and force recomputing
3047 # by clearing ``sense``) as each NavFrame specifies
3048 # its own sense. This helps eliminate garbage coming
3049 # from text at the beginning at the translations
3050 # section.
3051 sense_parts = []
3052 sense = None
3053 # for item in node.children:
3054 # if not isinstance(item, WikiNode):
3055 # continue
3056 # parse_translation_recurse(item)
3057 parse_translation_recurse(node)
3058 elif kind in LEVEL_KINDS: 3058 ↛ 3060
3059 # Sub-levels will be recursed elsewhere
3060 pass
3061 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3062 parse_translation_recurse(node)
3063 elif kind == NodeKind.PREFORMATTED: 3063 ↛ 3064
3064 print("parse_translation_recurse: PREFORMATTED:", node)
3065 elif kind == NodeKind.LINK: 3065 ↛ 3119
3066 arg0 = node.largs[0]
3067 # Kludge: I've seen occasional normal links to translation
3068 # subpages from main pages (e.g., language/English/Noun
3069 # in July 2021) instead of the normal
3070 # {{see translation subpage|...}} template. This should
3071 # handle them. Note: must be careful not to read other
3072 # links, particularly things like in "human being":
3073 # "a human being -- see [[man/translations]]" (group title)
3074 if ( 3074 ↛ 3082
3075 isinstance(arg0, (list, tuple))
3076 and arg0
3077 and isinstance(arg0[0], str)
3078 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3079 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3080 == wxr.wtp.title
3081 ):
3082 wxr.wtp.debug(
3083 "translations subpage link found on main "
3084 "page instead "
3085 "of normal {{see translation subpage|...}}",
3086 sortid="page/2595",
3087 )
3088 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3089 if sub.lower() in POS_TITLES:
3090 seq = [
3091 language,
3092 sub,
3093 TRANSLATIONS_TITLE,
3094 ]
3095 subnode = get_subpage_section(
3096 wxr.wtp.title,
3097 TRANSLATIONS_TITLE,
3098 [seq],
3099 )
3100 if subnode is not None and isinstance(
3101 subnode, WikiNode
3102 ):
3103 parse_translations(data, subnode)
3104 else:
3105 wxr.wtp.error(
3106 "/translations link outside part-of-speech"
3107 )
3109 if (
3110 len(arg0) >= 1
3111 and isinstance(arg0[0], str)
3112 and not arg0[0].lower().startswith("category:")
3113 ):
3114 for x in node.largs[-1]:
3115 if isinstance(x, str): 3115 ↛ 3118
3116 sense_parts.append(x)
3117 else:
3118 parse_translation_recurse(x)
3119 elif not sense:
3120 sense_parts.append(node)
3121 else:
3122 wxr.wtp.debug(
3123 "skipping text between translation items/senses: "
3124 "{}".format(node),
3125 sortid="page/2621",
3126 )
3128 # Main code of parse_translations(). We want ``sense`` to be assigned
3129 # regardless of recursion levels, and thus the code is structured
3130 # to define at this level and recurse in parse_translation_recurse().
3131 parse_translation_recurse(xlatnode)
3133 def parse_etymology(data: WordData, node: LevelNode) -> None:
3134 """Parses an etymology section."""
3135 assert isinstance(data, dict)
3136 assert isinstance(node, WikiNode)
3138 templates: list[TemplateData] = []
3140 # Counter for preventing the capture of etymology templates
3141 # when we are inside templates that we want to ignore (i.e.,
3142 # not capture).
3143 ignore_count = 0
3145 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3146 nonlocal ignore_count
3147 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3148 return ""
3149 if re.match(ignored_etymology_templates_re, name):
3150 ignore_count += 1
3151 return None
3153 # CONTINUE_HERE
3155 def etym_post_template_fn(
3156 name: str, ht: TemplateArgs, expansion: str
3157 ) -> None:
3158 nonlocal ignore_count
3159 if name in wikipedia_templates:
3160 parse_wikipedia_template(wxr, data, ht)
3161 return None
3162 if re.match(ignored_etymology_templates_re, name):
3163 ignore_count -= 1
3164 return None
3165 if ignore_count == 0: 3165 ↛ 3171
3166 ht = clean_template_args(wxr, ht)
3167 expansion = clean_node(wxr, None, expansion)
3168 templates.append(
3169 {"name": name, "args": ht, "expansion": expansion}
3170 )
3171 return None
3173 # Remove any subsections
3174 contents = list(
3175 x
3176 for x in node.children
3177 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3178 )
3179 # Convert to text, also capturing templates using post_template_fn
3180 text = clean_node(
3181 wxr,
3182 None,
3183 contents,
3184 template_fn=etym_template_fn,
3185 post_template_fn=etym_post_template_fn,
3186 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3187 # Save the collected information.
3188 if len(text) > 0:
3189 data["etymology_text"] = text
3190 if len(templates) > 0:
3191 # Some etymology templates, like Template:root do not generate
3192 # text, so they should be added here. Elsewhere, we check
3193 # for Template:root and add some text to the expansion to please
3194 # the validation.
3195 data["etymology_templates"] = templates
3197 for child_node in node.find_child_recursively( 3197 ↛ exit
3198 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3199 ):
3200 if child_node.kind in LEVEL_KIND_FLAGS:
3201 break
3202 elif isinstance( 3202 ↛ 3205
3203 child_node, TemplateNode
3204 ) and child_node.template_name in ["zh-x", "zh-q"]:
3205 if "etymology_examples" not in data:
3206 data["etymology_examples"] = []
3207 data["etymology_examples"].extend(
3208 extract_template_zh_x(
3209 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3210 )
3211 )
3213 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3214 """This recurses into a subtree in the parse tree for a page."""
3215 nonlocal etym_data
3216 nonlocal pos_data
3217 nonlocal inside_level_four
3219 redirect_list: list[str] = [] # for `zh-see` template
3221 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3222 """This is called for otherwise unprocessed parts of the page.
3223 We still expand them so that e.g. Category links get captured."""
3224 if name in wikipedia_templates:
3225 data = select_data()
3226 parse_wikipedia_template(wxr, data, ht)
3227 return None
3228 if is_panel_template(wxr, name):
3229 return ""
3230 return None
3232 for node in treenode.children:
3233 if not isinstance(node, WikiNode):
3234 # print(" X{}".format(repr(node)[:40]))
3235 continue
3236 if isinstance(node, TemplateNode):
3237 if process_soft_redirect_template(wxr, node, redirect_list):
3238 continue
3239 elif node.template_name == "zh-forms":
3240 extract_zh_forms_template(wxr, node, select_data())
3241 elif (
3242 node.template_name.endswith("-kanjitab")
3243 or node.template_name == "ja-kt"
3244 ):
3245 extract_ja_kanjitab_template(wxr, node, select_data())
3247 if not isinstance(node, LevelNode):
3248 # XXX handle e.g. wikipedia links at the top of a language
3249 # XXX should at least capture "also" at top of page
3250 if node.kind in (
3251 NodeKind.HLINE,
3252 NodeKind.LIST,
3253 NodeKind.LIST_ITEM,
3254 ):
3255 continue
3256 # print(" UNEXPECTED: {}".format(node))
3257 # Clean the node to collect category links
3258 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3259 continue
3260 t = clean_node(
3261 wxr, etym_data, node.sarg if node.sarg else node.largs
3262 )
3263 t = t.lower()
3264 # XXX these counts were never implemented fully, and even this
3265 # gets discarded: Search STATISTICS_IMPLEMENTATION
3266 wxr.config.section_counts[t] += 1
3267 # print("PROCESS_CHILDREN: T:", repr(t))
3268 if t in IGNORED_TITLES:
3269 pass
3270 elif t.startswith(PRONUNCIATION_TITLE):
3271 # Chinese Pronunciation section kludge; we demote these to
3272 # be level 4 instead of 3 so that they're part of a larger
3273 # etymology hierarchy; usually the data here is empty and
3274 # acts as an in-between layer between POS and Etymology data
3275 if lang_code in ("zh",):
3276 inside_level_four = True
3277 if t.startswith(PRONUNCIATION_TITLE + " "):
3278 # Pronunciation 1, etc, are used in Chinese Glyphs,
3279 # and each of them may have senses under Definition
3280 push_level_four_section(True)
3281 wxr.wtp.start_subsection(None)
3282 if wxr.config.capture_pronunciation: 3282 ↛ 3390
3283 data = select_data()
3284 parse_pronunciation(
3285 wxr,
3286 node,
3287 data,
3288 etym_data,
3289 have_etym,
3290 base_data,
3291 lang_code,
3292 )
3293 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3294 push_etym()
3295 wxr.wtp.start_subsection(None)
3296 if wxr.config.capture_etymologies: 3296 ↛ 3390
3297 m = re.search(r"\s(\d+)$", t)
3298 if m:
3299 etym_data["etymology_number"] = int(m.group(1))
3300 parse_etymology(etym_data, node)
3301 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3302 data = select_data()
3303 extract_descendant_section(wxr, data, node, False)
3304 elif (
3305 t in PROTO_ROOT_DERIVED_TITLES
3306 and pos == "root"
3307 and is_reconstruction
3308 and wxr.config.capture_descendants
3309 ):
3310 data = select_data()
3311 extract_descendant_section(wxr, data, node, True)
3312 elif t == TRANSLATIONS_TITLE:
3313 data = select_data()
3314 parse_translations(data, node)
3315 elif t in INFLECTION_TITLES:
3316 parse_inflection(node, t, pos)
3317 elif t == "alternative forms":
3318 extract_alt_form_section(wxr, select_data(), node)
3319 else:
3320 lst = t.split()
3321 while len(lst) > 1 and lst[-1].isdigit(): 3321 ↛ 3322
3322 lst = lst[:-1]
3323 t_no_number = " ".join(lst).lower()
3324 if t_no_number in POS_TITLES:
3325 push_pos()
3326 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3327 pos = dt["pos"] or "MISSING_POS"
3328 wxr.wtp.start_subsection(t)
3329 if "debug" in dt:
3330 wxr.wtp.debug(
3331 "{} in section {}".format(dt["debug"], t),
3332 sortid="page/2755",
3333 )
3334 if "warning" in dt: 3334 ↛ 3335
3335 wxr.wtp.wiki_notice(
3336 "{} in section {}".format(dt["warning"], t),
3337 sortid="page/2759",
3338 )
3339 if "error" in dt: 3339 ↛ 3340
3340 wxr.wtp.error(
3341 "{} in section {}".format(dt["error"], t),
3342 sortid="page/2763",
3343 )
3344 if "note" in dt: 3344 ↛ 3345line 3344 didn't jump to line 3345 because the condition on line 3344 was never true
3345 wxr.wtp.note(
3346 "{} in section {}".format(dt["note"], t),
3347 sortid="page/20251017a",
3348 )
3349 if "wiki_notice" in dt: 3349 ↛ 3350line 3349 didn't jump to line 3350 because the condition on line 3349 was never true
3350 wxr.wtp.wiki_notice(
3351 "{} in section {}".format(dt["wiki_notices"], t),
3352 sortid="page/20251017b",
3353 )
3354 # Parse word senses for the part-of-speech
3355 parse_part_of_speech(node, pos)
3356 if "tags" in dt:
3357 for pdata in sense_datas:
3358 data_extend(pdata, "tags", dt["tags"])
3359 elif t_no_number in LINKAGE_TITLES:
3360 # print(f"LINKAGE_TITLES NODE {node=}")
3361 rel = LINKAGE_TITLES[t_no_number]
3362 data = select_data()
3363 parse_linkage(
3364 wxr,
3365 data,
3366 rel,
3367 node,
3368 word,
3369 sense_datas,
3370 is_reconstruction,
3371 )
3372 elif t_no_number == COMPOUNDS_TITLE:
3373 data = select_data()
3374 if wxr.config.capture_compounds:
3375 parse_linkage(
3376 wxr,
3377 data,
3378 "derived",
3379 node,
3380 word,
3381 sense_datas,
3382 is_reconstruction,
3383 )
3385 # XXX parse interesting templates also from other sections. E.g.,
3386 # {{Letter|...}} in ===See also===
3387 # Also <gallery>
3389 # Recurse to children of this node, processing subtitles therein
3390 stack.append(t)
3391 process_children(node, pos)
3392 stack.pop()
3394 if len(redirect_list) > 0:
3395 if len(pos_data) > 0:
3396 pos_data["redirects"] = redirect_list
3397 if "pos" not in pos_data: 3397 ↛ 3398line 3397 didn't jump to line 3398 because the condition on line 3397 was never true
3398 pos_data["pos"] = "soft-redirect"
3399 else:
3400 new_page_data = copy.deepcopy(base_data)
3401 new_page_data["redirects"] = redirect_list
3402 if "pos" not in new_page_data: 3402 ↛ 3404line 3402 didn't jump to line 3404 because the condition on line 3402 was always true
3403 new_page_data["pos"] = "soft-redirect"
3404 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3405 page_datas.append(new_page_data)
3407 def extract_examples(
3408 others: list[WikiNode], sense_base: SenseData
3409 ) -> list[ExampleData]:
3410 """Parses through a list of definitions and quotes to find examples.
3411 Returns a list of example dicts to be added to sense data. Adds
3412 meta-data, mostly categories, into sense_base."""
3413 assert isinstance(others, list)
3414 examples: list[ExampleData] = []
3416 for sub in others:
3417 if not sub.sarg.endswith((":", "*")):
3418 continue
3419 for item in sub.children:
3420 if not isinstance(item, WikiNode):
3421 continue
3422 if item.kind != NodeKind.LIST_ITEM:
3423 continue
3424 usex_type = None
3425 example_template_args = []
3426 example_template_names = []
3427 taxons = set()
3429 # Chinese/Japanese examples and quotation templates are handled
3430 # in extract_example_list_item(), which bypasses the rest of this function.
3431 new_example_lists = extract_example_list_item(
3432 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3433 )
3434 if len(new_example_lists) > 0:
3435 examples.extend(new_example_lists)
3436 continue
3438 def usex_template_fn(
3439 name: str, ht: TemplateArgs
3440 ) -> Optional[str]:
3441 nonlocal usex_type
3442 if is_panel_template(wxr, name):
3443 return ""
3444 if name in usex_templates:
3445 usex_type = "example"
3446 example_template_args.append(ht)
3447 example_template_names.append(name)
3448 elif name in quotation_templates:
3449 usex_type = "quotation"
3450 elif name in taxonomy_templates:
3451 taxons.update(ht.get(1, "").split())
3452 for prefix in template_linkages_to_ignore_in_examples:
3453 if re.search(
3454 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3455 ):
3456 return ""
3457 return None
3460 ruby: list[tuple[str, str]] = []
3461 contents = item.children
3462 if lang_code == "ja":
3463 # Capture ruby contents if this is a Japanese language
3464 # example.
3465 # print(contents)
3466 if (
3467 contents
3468 and isinstance(contents[0], str)
3469 and re.match(r"\s*$", contents[0])
3470 ):
3471 contents = contents[1:]
3472 exp = wxr.wtp.parse(
3473 wxr.wtp.node_to_wikitext(contents),
3474 # post_template_fn=head_post_template_fn,
3475 expand_all=True,
3476 )
3477 rub, rest = extract_ruby(wxr, exp.children)
3478 if rub:
3479 for rtup in rub:
3480 ruby.append(rtup)
3481 contents = rest
3482 subtext = clean_node(
3483 wxr, sense_base, contents, template_fn=usex_template_fn
3484 )
3486 frozen_taxons = frozenset(taxons)
3487 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3489 # print(f"{subtext=}")
3490 subtext = re.sub(
3491 r"\s*\(please add an English "
3492 r"translation of this "
3493 r"(example|usage example|quote)\)",
3494 "",
3495 subtext,
3496 ).strip()
3497 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3498 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3499 # print("subtext:", repr(subtext))
3501 lines = subtext.splitlines()
3502 # print(lines)
3504 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3505 lines = list(
3506 x
3507 for x in lines
3508 if not re.match(
3509 r"(Synonyms: |Antonyms: |Hyponyms: |"
3510 r"Synonym: |Antonym: |Hyponym: |"
3511 r"Hypernyms: |Derived terms: |"
3512 r"Related terms: |"
3513 r"Hypernym: |Derived term: |"
3514 r"Coordinate terms:|"
3515 r"Related term: |"
3516 r"For more quotations using )",
3517 x,
3518 )
3519 )
3520 tr = ""
3521 ref = ""
3522 roman = ""
3523 # for line in lines:
3524 # print("LINE:", repr(line))
3525 # print(classify_desc(line))
3526 if len(lines) == 1 and lang_code != "en":
3527 parts = example_splitter_re.split(lines[0])
3528 if (
3529 len(parts) > 2
3530 and len(example_template_args) == 1
3531 and any(
3532 ("―" in s) or ("—" in s)
3533 for s in example_template_args[0].values()
3534 )
3535 ):
3536 if nparts := synch_splits_with_args(
3537 lines[0], example_template_args[0]
3538 ):
3539 parts = nparts
3540 if (
3541 len(example_template_args) == 1
3542 and "lit" in example_template_args[0]
3543 ):
3544 # ugly brute-force kludge in case there's a lit= arg
3545 literally = example_template_args[0].get("lit", "")
3546 if literally:
3547 literally = (
3548 " (literally, “"
3549 + clean_value(wxr, literally)
3550 + "”)"
3551 )
3552 else:
3553 literally = ""
3554 if (
3555 len(example_template_args) == 1
3556 and len(parts) == 2
3557 and len(example_template_args[0])
3558 - (
3559 # horrible kludge to ignore these arguments
3560 # when calculating how many there are
3561 sum(
3562 s in example_template_args[0]
3563 for s in (
3564 "lit", # generates text, but we handle it
3565 "inline",
3566 "noenum",
3567 "nocat",
3568 "sort",
3569 )
3570 )
3571 )
3572 == 3
3573 and clean_value(
3574 wxr, example_template_args[0].get(2, "")
3575 )
3576 == parts[0].strip()
3577 and clean_value(
3578 wxr,
3579 (
3580 example_template_args[0].get(3)
3581 or example_template_args[0].get("translation")
3582 or example_template_args[0].get("t", "")
3583 )
3584 + literally, # in case there's a lit= argument
3585 )
3586 == parts[1].strip()
3587 ):
3588 # {{exampletemplate|ex|Foo bar baz|English translation}}
3589 # is a pretty reliable 'heuristic', so we use it here
3590 # before the others. To be extra sure the template
3591 # doesn't do anything weird, we compare the arguments
3592 # and the output to each other.
3593 lines = [parts[0].strip()]
3594 tr = parts[1].strip()
3595 elif (
3596 len(parts) == 2
3597 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3598 ):
3599 # These other branches just do some simple heuristics w/
3600 # the expanded output of the template (if applicable).
3601 lines = [parts[0].strip()]
3602 tr = parts[1].strip()
3603 elif (
3604 len(parts) == 3
3605 and classify_desc2(parts[1])
3606 in ("romanization", "english")
3607 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3608 ):
3609 lines = [parts[0].strip()]
3610 roman = parts[1].strip()
3611 tr = parts[2].strip()
3612 else:
3613 parts = re.split(r"\s+-\s+", lines[0])
3614 if (
3615 len(parts) == 2
3616 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3617 ):
3618 lines = [parts[0].strip()]
3619 tr = parts[1].strip()
3620 elif len(lines) > 1:
3621 if any(
3622 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3623 ) and not (len(example_template_names) == 1):
3624 refs: list[str] = []
3625 for i in range(len(lines)):
3626 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
3627 break
3628 refs.append(lines[i].strip())
3629 if re.search(r"[]\d:)]\s*$", lines[i]):
3630 break
3631 ref = " ".join(refs)
3632 lines = lines[i + 1 :]
3633 if (
3634 lang_code != "en"
3635 and len(lines) >= 2
3636 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3637 ):
3638 i = len(lines) - 1
3639 while (
3640 i > 1
3641 and classify_desc2(lines[i - 1])
3642 in ENGLISH_TEXTS
3643 ):
3644 i -= 1
3645 tr = "\n".join(lines[i:])
3646 lines = lines[:i]
3647 if len(lines) >= 2:
3648 if classify_desc2(lines[-1]) == "romanization":
3649 roman = lines[-1].strip()
3650 lines = lines[:-1]
3652 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3653 ref = lines[0]
3654 lines = lines[1:]
3655 elif lang_code != "en" and len(lines) == 2:
3656 cls1 = classify_desc2(lines[0])
3657 cls2 = classify_desc2(lines[1])
3658 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3659 tr = lines[1]
3660 lines = [lines[0]]
3661 elif cls1 in ENGLISH_TEXTS and cls2 != "english":
3662 tr = lines[0]
3663 lines = [lines[1]]
3664 elif (
3665 re.match(r"^[#*]*:+", lines[1])
3666 and classify_desc2(
3667 re.sub(r"^[#*:]+\s*", "", lines[1])
3668 )
3669 in ENGLISH_TEXTS
3670 ):
3671 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3672 lines = [lines[0]]
3673 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3674 # Both were classified as English, but
3675 # presumably one is not. Assume first is
3676 # non-English, as that seems more common.
3677 tr = lines[1]
3678 lines = [lines[0]]
3679 elif (
3680 usex_type != "quotation"
3681 and lang_code != "en"
3682 and len(lines) == 3
3683 ):
3684 cls1 = classify_desc2(lines[0])
3685 cls2 = classify_desc2(lines[1])
3686 cls3 = classify_desc2(lines[2])
3687 if (
3688 cls3 == "english"
3689 and cls2 in ("english", "romanization")
3690 and cls1 != "english"
3691 ):
3692 tr = lines[2].strip()
3693 roman = lines[1].strip()
3694 lines = [lines[0].strip()]
3695 elif (
3696 usex_type == "quotation"
3697 and lang_code != "en"
3698 and len(lines) > 2
3699 ):
3700 # for x in lines:
3701 # print(" LINE: {}: {}"
3702 # .format(classify_desc2(x), x))
3703 if re.match(r"^[#*]*:+\s*$", lines[1]):
3704 ref = lines[0]
3705 lines = lines[2:]
3706 cls1 = classify_desc2(lines[-1])
3707 if cls1 == "english":
3708 i = len(lines) - 1
3709 while (
3710 i > 1
3711 and classify_desc2(lines[i - 1])
3712 in ENGLISH_TEXTS
3713 ):
3714 i -= 1
3715 tr = "\n".join(lines[i:])
3716 lines = lines[:i]
3718 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3719 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3720 tr = re.sub(r"^[#*:]+\s*", "", tr)
3721 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3722 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3723 ref = re.sub(r"^[#*:]+\s*", "", ref)
3724 ref = re.sub(
3725 r", (volume |number |page )?“?"
3726 r"\(please specify ([^)]|\(s\))*\)”?|"
3727 ", text here$",
3728 "",
3729 ref,
3730 )
3731 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3732 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3733 subtext = "\n".join(x for x in lines if x)
3734 if not tr and lang_code != "en":
3735 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3736 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
3737 tr = m.group(2)
3738 subtext = subtext[: m.start()] + m.group(1)
3739 elif lines:
3740 parts = re.split(r"\s*[―—]+\s*", lines[0])
3741 if (
3742 len(parts) == 2
3743 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3744 ):
3745 subtext = parts[0].strip()
3746 tr = parts[1].strip()
3747 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3748 subtext = re.sub(
3749 r"(please add an English translation of "
3750 r"this (quote|usage example))",
3751 "",
3752 subtext,
3753 )
3754 subtext = re.sub(
3755 r"\s*→New International Version " "translation$",
3756 "",
3757 subtext,
3758 ) # e.g. pis/Tok Pisin (Bible)
3759 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3760 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3761 note = None
3762 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3763 if (
3764 m is not None
3765 and lang_code != "en"
3766 and (
3767 m.group(1).startswith("with ")
3768 or classify_desc2(m.group(1)) == "english"
3769 )
3770 ):
3771 note = m.group(1)
3772 subtext = subtext[m.end() :]
3773 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3774 ref = re.sub(r",\s*→ISBN", "", ref)
3775 ref = ref.strip()
3776 if ref.endswith(":") or ref.endswith(","):
3777 ref = ref[:-1].strip()
3778 ref = re.sub(r"\s+,\s+", ", ", ref)
3779 ref = re.sub(r"\s+", " ", ref)
3780 if ref and not subtext:
3781 subtext = ref
3782 ref = ""
3783 if subtext:
3784 dt: ExampleData = {"text": subtext}
3785 if ref:
3786 dt["ref"] = ref
3787 if tr:
3788 dt["english"] = tr # DEPRECATED for "translation"
3789 dt["translation"] = tr
3790 if usex_type:
3791 dt["type"] = usex_type
3792 if note:
3793 dt["note"] = note
3794 if roman:
3795 dt["roman"] = roman
3796 if ruby:
3797 dt["ruby"] = ruby
3798 examples.append(dt)
3800 return examples
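    # Illustrative sketch of extract_examples() output (hypothetical input,
    # not taken from the original source): a definition sub-list item such as
    #   #: {{ux|de|Das ist ein Beispiel.|t=This is an example.}}
    # would typically come back as an ExampleData dict along the lines of
    #   {"text": "Das ist ein Beispiel.",
    #    "translation": "This is an example.",
    #    "english": "This is an example.",
    #    "type": "example"}
    # The exact fields depend on template expansion and classify_desc(),
    # so treat this as an approximation rather than guaranteed output.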
3802 # Main code of parse_language()
3803 # Process the section
3804 stack.append(language)
3805 process_children(langnode, None)
3806 stack.pop()
3808 # Finalize word entries
3809 push_etym()
3810 ret = []
3811 for data in page_datas:
3812 merge_base(data, base_data)
3813 ret.append(data)
3815 # Copy all tags to word senses
3816 for data in ret:
3817 if "senses" not in data: 3817 ↛ 3818line 3817 didn't jump to line 3818 because the condition on line 3817 was never true
3818 continue
3819 # WordData should not have a 'tags' field, but if it does, it is
3820 # deleted and its contents are moved into each sense; that's why
3821 # the type ignores.
3822 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3823 if "tags" in data:
3824 del data["tags"] # type: ignore[typeddict-item]
3825 for sense in data["senses"]:
3826 data_extend(sense, "tags", tags)
3828 return ret
3831def parse_wikipedia_template(
3832 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
3833) -> None:
3834 """Helper function for parsing {{wikipedia|...}} and related templates."""
3835 assert isinstance(wxr, WiktextractContext)
3836 assert isinstance(data, dict)
3837 assert isinstance(ht, dict)
3838 langid = clean_node(wxr, data, ht.get("lang", ()))
3839 pagename = (
3840 clean_node(wxr, data, ht.get(1, ()))
3841 or wxr.wtp.title
3842 or "MISSING_PAGE_TITLE"
3843 )
3844 if langid:
3845 data_append(data, "wikipedia", langid + ":" + pagename)
3846 else:
3847 data_append(data, "wikipedia", pagename)
3850def parse_top_template(
3851 wxr: WiktextractContext, node: WikiNode, data: WordData
3852) -> None:
3853 """Parses a template that occurs on the top-level in a page, before any
3854 language subtitles."""
3855 assert isinstance(wxr, WiktextractContext)
3856 assert isinstance(node, WikiNode)
3857 assert isinstance(data, dict)
3859 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3860 if name in wikipedia_templates:
3861 parse_wikipedia_template(wxr, data, ht)
3862 return None
3863 if is_panel_template(wxr, name):
3864 return ""
3865 if name in ("reconstruction",): 3865 ↛ 3866line 3865 didn't jump to line 3866 because the condition on line 3865 was never true
3866 return ""
3867 if name.lower() == "also" or name.lower().startswith("also/"):
3868 # XXX shows related words that might really have been the intended
3869 # word, capture them
3870 return ""
3871 if name == "see also": 3871 ↛ 3873line 3871 didn't jump to line 3873 because the condition on line 3871 was never true
3872 # XXX capture
3873 return ""
3874 if name == "cardinalbox": 3874 ↛ 3876line 3874 didn't jump to line 3876 because the condition on line 3874 was never true
3875 # XXX capture
3876 return ""
3877 if name == "character info": 3877 ↛ 3879line 3877 didn't jump to line 3879 because the condition on line 3877 was never true
3878 # XXX capture
3879 return ""
3880 if name == "commonscat": 3880 ↛ 3882line 3880 didn't jump to line 3882 because the condition on line 3880 was never true
3881 # XXX capture link to Wikimedia commons
3882 return ""
3883 if name == "wrongtitle": 3883 ↛ 3886line 3883 didn't jump to line 3886 because the condition on line 3883 was never true
3884 # XXX this should be captured to replace page title with the
3885 # correct title. E.g. ⿰亻革家
3886 return ""
3887 if name == "wikidata": 3887 ↛ 3888line 3887 didn't jump to line 3888 because the condition on line 3887 was never true
3888 arg = clean_node(wxr, data, ht.get(1, ()))
3889 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
3890 data_append(data, "wikidata", arg)
3891 return ""
3892 wxr.wtp.debug(
3893 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
3894 sortid="page/2870",
3895 )
3896 return ""
3898 clean_node(wxr, None, [node], template_fn=top_template_fn)
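# Illustrative sketch (hypothetical input, not from the original source):
# a top-level {{wikidata|Q42}} would reach top_template_fn with
# name="wikidata" and ht={1: "Q42"}; since the argument starts with "Q",
# "Q42" is appended to data["wikidata"] and the template expands to "".
# Templates not handled above are reported via wxr.wtp.debug() and dropped.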
3901def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
3902 """Fix subtitle hierarchy to be strict Language -> Etymology ->
3903 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
3904 that are next to each other."""
3906 # Wiktextract issue #620: a Chinese Glyph Origin section placed before an
3907 # etymology section gets overwritten. In this case, let's just combine the two.
3909 # In Chinese entries, Pronunciation can be preceded on the
3910 # same level 3 by its Etymology *and* Glyph Origin sections:
3911 # ===Glyph Origin===
3912 # ===Etymology===
3913 # ===Pronunciation===
3914 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
3915 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
3916 # are now level 6
3918 # Known lowercase PoS names are in part_of_speech_map
3919 # Known lowercase linkage section names are in linkage_map
3921 old = re.split(
3922 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
3923 )
3925 parts = []
3926 npar = 4 # Number of capturing groups in the above regexp
3927 parts.append(old[0])
3928 prev_level = None
3929 level = None
3930 skip_level_title = False # When combining etymology sections
3931 for i in range(1, len(old), npar + 1):
3932 left = old[i]
3933 right = old[i + npar - 1]
3934 # remove Wikilinks in title
3935 title = re.sub(r"^\[\[", "", old[i + 1])
3936 title = re.sub(r"\]\]$", "", title)
3937 prev_level = level
3938 level = len(left)
3939 part = old[i + npar]
3940 if level != len(right):
3941 wxr.wtp.debug(
3942 "subtitle has unbalanced levels: "
3943 "{!r} has {} on the left and {} on the right".format(
3944 title, left, right
3945 ),
3946 sortid="page/2904",
3947 )
3948 lc = title.lower()
3949 if name_to_code(title, "en") != "":
3950 if level > 2:
3951 wxr.wtp.debug(
3952 "subtitle has language name {} at level {}".format(
3953 title, level
3954 ),
3955 sortid="page/2911",
3956 )
3957 level = 2
3958 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
3959 if level > 3:
3960 wxr.wtp.debug(
3961 "etymology section {} at level {}".format(title, level),
3962 sortid="page/2917",
3963 )
3964 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
3965 # sections cheek-to-cheek
3966 skip_level_title = True
3967 # Modify the title of previous ("Glyph Origin") section, in
3968 # case we have a meaningful title like "Etymology 1"
3969 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
3970 level = 3
3971 elif lc.startswith(PRONUNCIATION_TITLE):
3972 # Pronunciation is now a level between POS and Etymology, so
3973 # we need to shift everything down by one
3974 level = 4
3975 elif lc in POS_TITLES:
3976 level = 5
3977 elif lc == TRANSLATIONS_TITLE:
3978 level = 6
3979 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
3980 level = 6
3981 elif lc in INFLECTION_TITLES:
3982 level = 6
3983 elif lc == DESCENDANTS_TITLE:
3984 level = 6
3985 elif title in PROTO_ROOT_DERIVED_TITLES:
3986 level = 6
3987 elif lc in IGNORED_TITLES:
3988 level = 6
3989 else:
3990 level = 6
3991 if skip_level_title:
3992 skip_level_title = False
3993 parts.append(part)
3994 else:
3995 parts.append("{}{}{}".format("=" * level, title, "=" * level))
3996 parts.append(part)
3997 # print("=" * level, title)
3998 # if level != len(left):
3999 # print(" FIXED LEVEL OF {} {} -> {}"
4000 # .format(title, len(left), level))
4002 text = "".join(parts)
4003 # print(text)
4004 return text
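# Illustrative sketch (hypothetical wikitext, not from the original source)
# of how fix_subtitle_hierarchy() remaps heading levels:
#   ==Finnish==             stays level 2 (language)
#   ===Etymology===         stays level 3
#   ===Pronunciation===     becomes ====Pronunciation====    (level 4)
#   ====Noun====            becomes =====Noun=====           (level 5)
#   ===Translations===      becomes ======Translations====== (level 6)
# After this rewrite the parse tree nests strictly as
# Language > Etymology > Pronunciation > POS > Translations/Linkage/etc.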
4007def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4008 # Skip translation pages
4009 if word.endswith("/" + TRANSLATIONS_TITLE):
4010 return []
4012 if wxr.config.verbose:
4013 logger.info(f"Parsing page: {word}")
4015 wxr.config.word = word
4016 wxr.wtp.start_page(word)
4018 # Remove <noinclude> and similar tags from main pages. They
4019 # should not appear there, but at least net/Elfdala has one and it
4020 # is probably not the only one.
4021 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4022 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4023 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4025 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4026 # pages that have, for example, Translations section under Linkage, or
4027 # Translations section on the same level as Noun. Enforce a proper
4028 # hierarchy by manipulating the subtitle levels in certain cases.
4029 text = fix_subtitle_hierarchy(wxr, text)
4031 # Parse the page, pre-expanding those templates that are likely to
4032 # influence parsing
4033 tree = wxr.wtp.parse(
4034 text,
4035 pre_expand=True,
4036 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4037 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4038 )
4039 # from wikitextprocessor.parser import print_tree
4040 # print("PAGE PARSE:", print_tree(tree))
4042 top_data: WordData = {}
4044 # Iterate over top-level titles, which should be languages for normal
4045 # pages
4046 by_lang = defaultdict(list)
4047 for langnode in tree.children:
4048 if not isinstance(langnode, WikiNode):
4049 continue
4050 if langnode.kind == NodeKind.TEMPLATE:
4051 parse_top_template(wxr, langnode, top_data)
4052 continue
4053 if langnode.kind == NodeKind.LINK:
4054 # Some pages have links at top level, e.g., "trees" in Wiktionary
4055 continue
4056 if langnode.kind != NodeKind.LEVEL2:
4057 wxr.wtp.debug(
4058 f"unexpected top-level node: {langnode}", sortid="page/3014"
4059 )
4060 continue
4061 lang = clean_node(
4062 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4063 )
4064 lang_code = name_to_code(lang, "en")
4065 if lang_code == "": 4065 ↛ 4066line 4065 didn't jump to line 4066 because the condition on line 4065 was never true
4066 wxr.wtp.debug(
4067 f"unrecognized language name: {lang}", sortid="page/3019"
4068 )
4069 if (
4070 wxr.config.capture_language_codes
4071 and lang_code not in wxr.config.capture_language_codes
4072 ):
4073 continue
4074 wxr.wtp.start_section(lang)
4076 # Collect all words from the page.
4077 # print(f"{langnode=}")
4078 datas = parse_language(wxr, langnode, lang, lang_code)
4080 # Propagate fields resulting from top-level templates to this
4081 # part-of-speech.
4082 for data in datas:
4083 if "lang" not in data: 4083 ↛ 4084line 4083 didn't jump to line 4084 because the condition on line 4083 was never true
4084 wxr.wtp.debug(
4085 "internal error -- no lang in data: {}".format(data),
4086 sortid="page/3034",
4087 )
4088 continue
4089 for k, v in top_data.items():
4090 assert isinstance(v, (list, tuple))
4091 data_extend(data, k, v)
4092 by_lang[data["lang"]].append(data)
4094 # XXX this code is clearly out of date. There is no longer a "conjugation"
4095 # field. FIX OR REMOVE.
4096 # Do some post-processing on the words. For example, we may distribute
4097 # conjugation information to all the words.
4098 ret = []
4099 for lang, lang_datas in by_lang.items():
4100 ret.extend(lang_datas)
4102 for x in ret:
4103 if x["word"] != word:
4104 if word.startswith("Unsupported titles/"):
4105 wxr.wtp.debug(
4106 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4107 sortid="20231101/3578page.py",
4108 )
4109 else:
4110 wxr.wtp.debug(
4111 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4112 sortid="20231101/3582page.py",
4113 )
4114 x["original_title"] = word
4115 # validate tag data
4116 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4117 return ret
4120def recursively_separate_raw_tags(
4121 wxr: WiktextractContext, data: dict[str, Any]
4122) -> None:
4123 if not isinstance(data, dict):
4124 wxr.wtp.error(
4125 "'data' is not dict; most probably "
4126 "data has a list that contains at least one dict and "
4127 "at least one non-dict item",
4128 sortid="en/page-4016/20240419",
4129 )
4130 return
4131 new_tags: list[str] = []
4132 raw_tags: list[str] = data.get("raw_tags", [])
4133 for field, val in data.items():
4134 if field == "tags":
4135 for tag in val:
4136 if tag not in valid_tags:
4137 raw_tags.append(tag)
4138 else:
4139 new_tags.append(tag)
4140 if isinstance(val, list):
4141 if len(val) > 0 and isinstance(val[0], dict):
4142 for d in val:
4143 recursively_separate_raw_tags(wxr, d)
4144 if "tags" in data and not new_tags:
4145 del data["tags"]
4146 elif new_tags:
4147 data["tags"] = new_tags
4148 if raw_tags:
4149 data["raw_tags"] = raw_tags
4152def process_soft_redirect_template(
4153 wxr: WiktextractContext,
4154 template_node: TemplateNode,
4155 redirect_pages: list[str],
4156) -> bool:
4157 # Return `True` if the template is a soft-redirect template.
4158 if template_node.template_name == "zh-see":
4159 # https://en.wiktionary.org/wiki/Template:zh-see
4160 title = clean_node(
4161 wxr, None, template_node.template_parameters.get(1, "")
4162 )
4163 if title != "": 4163 ↛ 4165line 4163 didn't jump to line 4165 because the condition on line 4163 was always true
4164 redirect_pages.append(title)
4165 return True
4166 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4167 # https://en.wiktionary.org/wiki/Template:ja-see
4168 for key, value in template_node.template_parameters.items():
4169 if isinstance(key, int):
4170 title = clean_node(wxr, None, value)
4171 if title != "":
4172 redirect_pages.append(title)
4173 return True
4174 return False
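# Illustrative sketch (hypothetical input, not from the original source):
# for a {{zh-see|例}} node, process_soft_redirect_template() appends "例" to
# redirect_pages and returns True; {{ja-see|...}} collects every positional
# parameter as a redirect target. Any other template returns False and is
# left to the normal parsing code.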
4177ZH_FORMS_TAGS = {
4178 "trad.": "Traditional-Chinese",
4179 "simp.": "Simplified-Chinese",
4180 "alternative forms": "alternative",
4181 "2nd round simp.": "Second-Round-Simplified-Chinese",
4182}
4185def extract_zh_forms_template(
4186 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4187):
4188 # https://en.wiktionary.org/wiki/Template:zh-forms
4189 lit_meaning = clean_node(
4190 wxr, None, t_node.template_parameters.get("lit", "")
4191 )
4192 if lit_meaning != "":
4193 base_data["literal_meaning"] = lit_meaning
4194 expanded_node = wxr.wtp.parse(
4195 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4196 )
4197 for table in expanded_node.find_child(NodeKind.TABLE):
4198 for row in table.find_child(NodeKind.TABLE_ROW):
4199 row_header = ""
4200 row_header_tags: list[str] = []
4201 header_has_span = False
4202 for cell in row.find_child(
4203 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4204 ):
4205 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4206 row_header, row_header_tags, header_has_span = (
4207 extract_zh_forms_header_cell(wxr, base_data, cell)
4208 )
4209 elif not header_has_span:
4210 extract_zh_forms_data_cell(
4211 wxr, base_data, cell, row_header, row_header_tags
4212 )
4214 if "forms" in base_data and len(base_data["forms"]) == 0: 4214 ↛ 4215line 4214 didn't jump to line 4215 because the condition on line 4214 was never true
4215 del base_data["forms"]
4218def extract_zh_forms_header_cell(
4219 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4220) -> tuple[str, list[str], bool]:
4221 row_header = ""
4222 row_header_tags = []
4223 header_has_span = False
4224 first_span_index = len(header_cell.children)
4225 for index, span_tag in header_cell.find_html("span", with_index=True):
4226 if index < first_span_index:
4227 first_span_index = index
4228 header_has_span = True
4229 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4230 for raw_tag in row_header.split(" and "):
4231 raw_tag = raw_tag.strip()
4232 if raw_tag != "":
4233 row_header_tags.append(raw_tag)
4234 for span_tag in header_cell.find_html_recursively("span"):
4235 span_lang = span_tag.attrs.get("lang", "")
4236 form_nodes = []
4237 sup_title = ""
4238 for node in span_tag.children:
4239 if isinstance(node, HTMLNode) and node.tag == "sup":
4240 for sup_span in node.find_html("span"):
4241 sup_title = sup_span.attrs.get("title", "")
4242 else:
4243 form_nodes.append(node)
4244 if span_lang in ["zh-Hant", "zh-Hans"]:
4245 for word in clean_node(wxr, None, form_nodes).split("/"):
4246 if word not in [wxr.wtp.title, ""]:
4247 form = {"form": word}
4248 for raw_tag in row_header_tags:
4249 if raw_tag in ZH_FORMS_TAGS:
4250 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4251 else:
4252 data_append(form, "raw_tags", raw_tag)
4253 if sup_title != "":
4254 data_append(form, "raw_tags", sup_title)
4255 data_append(base_data, "forms", form)
4256 return row_header, row_header_tags, header_has_span
4259TagLiteral = Literal["tags", "raw_tags"]
4260TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
4263def extract_zh_forms_data_cell(
4264 wxr: WiktextractContext,
4265 base_data: WordData,
4266 cell: WikiNode,
4267 row_header: str,
4268 row_header_tags: list[str],
4269) -> None:
4270 from .zh_pron_tags import ZH_PRON_TAGS
4272 forms: list[FormData] = []
4273 for top_span_tag in cell.find_html("span"):
4274 span_style = top_span_tag.attrs.get("style", "")
4275 span_lang = top_span_tag.attrs.get("lang", "")
4276 if span_style == "white-space:nowrap;":
4277 extract_zh_forms_data_cell(
4278 wxr, base_data, top_span_tag, row_header, row_header_tags
4279 )
4280 elif "font-size:80%" in span_style:
4281 raw_tag = clean_node(wxr, None, top_span_tag)
4282 if raw_tag != "": 4282 ↛ 4273line 4282 didn't jump to line 4273 because the condition on line 4282 was always true
4283 for form in forms:
4284 if raw_tag in ZH_PRON_TAGS: 4284 ↛ 4290line 4284 didn't jump to line 4290 because the condition on line 4284 was always true
4285 tr_tag = ZH_PRON_TAGS[raw_tag]
4286 if isinstance(tr_tag, list): 4286 ↛ 4287line 4286 didn't jump to line 4287 because the condition on line 4286 was never true
4287 data_extend(form, "tags", tr_tag)
4288 elif isinstance(tr_tag, str): 4288 ↛ 4283line 4288 didn't jump to line 4283 because the condition on line 4288 was always true
4289 data_append(form, "tags", tr_tag)
4290 elif raw_tag in valid_tags:
4291 data_append(form, "tags", raw_tag)
4292 else:
4293 data_append(form, "raw_tags", raw_tag)
4294 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]: 4294 ↛ 4273line 4294 didn't jump to line 4273 because the condition on line 4294 was always true
4295 word = clean_node(wxr, None, top_span_tag)
4296 if word not in ["", "/", wxr.wtp.title]:
4297 form = {"form": word}
4298 if row_header != "anagram": 4298 ↛ 4304line 4298 didn't jump to line 4304 because the condition on line 4298 was always true
4299 for raw_tag in row_header_tags:
4300 if raw_tag in ZH_FORMS_TAGS: 4300 ↛ 4303line 4300 didn't jump to line 4303 because the condition on line 4300 was always true
4301 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4302 else:
4303 data_append(form, "raw_tags", raw_tag)
4304 if span_lang == "zh-Hant":
4305 data_append(form, "tags", "Traditional-Chinese")
4306 elif span_lang == "zh-Hans":
4307 data_append(form, "tags", "Simplified-Chinese")
4308 forms.append(form)
4310 if row_header == "anagram": 4310 ↛ 4311line 4310 didn't jump to line 4311 because the condition on line 4310 was never true
4311 for form in forms:
4312 l_data: LinkageData = {"word": form["form"]}
4313 for key in TAG_LITERALS_TUPLE:
4314 if key in form:
4315 l_data[key] = form[key]
4316 data_append(base_data, "anagrams", l_data)
4317 else:
4318 data_extend(base_data, "forms", forms)
4321def extract_ja_kanjitab_template(
4322 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4323):
4324 # https://en.wiktionary.org/wiki/Template:ja-kanjitab
4325 expanded_node = wxr.wtp.parse(
4326 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4327 )
4328 for table in expanded_node.find_child(NodeKind.TABLE):
4329 is_alt_form_table = False
4330 for row in table.find_child(NodeKind.TABLE_ROW):
4331 for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
4332 header_text = clean_node(wxr, None, header_node)
4333 if header_text.startswith("Alternative spelling"):
4334 is_alt_form_table = True
4335 if not is_alt_form_table:
4336 continue
4337 forms = []
4338 for row in table.find_child(NodeKind.TABLE_ROW):
4339 for cell_node in row.find_child(NodeKind.TABLE_CELL):
4340 for child_node in cell_node.children:
4341 if isinstance(child_node, HTMLNode):
4342 if child_node.tag == "span":
4343 word = clean_node(wxr, None, child_node)
4344 if word != "": 4344 ↛ 4340line 4344 didn't jump to line 4340 because the condition on line 4344 was always true
4345 forms.append(
4346 {
4347 "form": word,
4348 "tags": ["alternative", "kanji"],
4349 }
4350 )
4351 elif child_node.tag == "small":
4352 raw_tag = clean_node(wxr, None, child_node).strip(
4353 "()"
4354 )
4355 if raw_tag != "" and len(forms) > 0: 4355 ↛ 4340line 4355 didn't jump to line 4340 because the condition on line 4355 was always true
4356 data_append(
4357 forms[-1],
4358 "tags"
4359 if raw_tag in valid_tags
4360 else "raw_tags",
4361 raw_tag,
4362 )
4363 data_extend(base_data, "forms", forms)
4364 for link_node in expanded_node.find_child(NodeKind.LINK):
4365 clean_node(wxr, base_data, link_node)
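# Illustrative sketch (hypothetical output, not from the original source):
# for a ja-kanjitab table whose "Alternative spelling" column contains 頬
# with a parenthesized qualifier, extract_ja_kanjitab_template() would append
# roughly {"form": "頬", "tags": ["alternative", "kanji"]} to
# base_data["forms"], putting the qualifier under "tags" when it is in
# valid_tags and under "raw_tags" otherwise.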