Coverage for src/wiktextract/extractor/en/page.py: 79% (1821 statements)
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Literal,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .descendant import extract_descendant_section
51from .example import extract_example_list_item, extract_template_zh_x
52from .form_descriptions import (
53 classify_desc,
54 decode_tags,
55 distw,
56 parse_alt_or_inflection_of,
57 parse_sense_qualifier,
58 parse_word_head,
59)
60from .inflection import TableContext, parse_inflection_section
61from .info_templates import (
62 INFO_TEMPLATE_FUNCS,
63 parse_info_template_arguments,
64 parse_info_template_node,
65)
66from .linkages import (
67 extract_alt_form_section,
68 parse_linkage,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 FormData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
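# Illustrative behaviour of HEAD_TAG_RE (assuming "fi" is among the codes
# produced by get_all_names("en")):
#   HEAD_TAG_RE.search("head")     -> matches (first alternative)
#   HEAD_TAG_RE.search("fi-noun")  -> matches (language code + part of speech)
#   HEAD_TAG_RE.search("tlb")      -> None (handled separately through
#                                    WORD_LEVEL_HEAD_TEMPLATES below)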
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280FLOATING_TABLE_TEMPLATES: set[str] = {
281 # az-suffix-forms creates a style=floatright div that is otherwise
282 # deleted; if it is not pre-expanded, we can intercept the template,
283 # so we add this set into do_not_pre_expand and intercept the
284 # templates in parse_part_of_speech.
285 "az-suffix-forms",
286 "az-inf-p",
287 "kk-suffix-forms",
288 "ky-suffix-forms",
289 "tr-inf-p",
290 "tr-suffix-forms",
291 "tt-suffix-forms",
292 "uz-suffix-forms",
293}
294# These two sets should contain template names that should either always be
295# pre-expanded when *first* processing the tree, or not pre-expanded
296# so that the templates are left in place with their identifying
297# names intact for later filtering.
299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
302# Additional templates to be expanded in the pre-expand phase
303ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
304 "multitrans",
305 "multitrans-nowiki",
306 "trans-top",
307 "trans-top-also",
308 "trans-bottom",
309 "checktrans-top",
310 "checktrans-bottom",
311 "col",
312 "col1",
313 "col2",
314 "col3",
315 "col4",
316 "col5",
317 "col1-u",
318 "col2-u",
319 "col3-u",
320 "col4-u",
321 "col5-u",
322 "check deprecated lang param usage",
323 "deprecated code",
324 "ru-verb-alt-ё",
325 "ru-noun-alt-ё",
326 "ru-adj-alt-ё",
327 "ru-proper noun-alt-ё",
328 "ru-pos-alt-ё",
329 "ru-alt-ё",
330 "inflection of",
331 "no deprecated lang param usage",
332 "transclude", # these produce sense entries (or other lists)
333 "tcl",
334}
336# Inverse linkage for those that have them
337linkage_inverses: dict[str, str] = {
338 # XXX this is not currently used, move to post-processing
339 "synonyms": "synonyms",
340 "hypernyms": "hyponyms",
341 "hyponyms": "hypernyms",
342 "holonyms": "meronyms",
343 "meronyms": "holonyms",
344 "derived": "derived_from",
345 "coordinate_terms": "coordinate_terms",
346 "troponyms": "hypernyms",
347 "antonyms": "antonyms",
348 "instances": "instance_of",
349 "related": "related",
350}
352# Templates that are used to form panels on pages and that
353# should be ignored in various positions
354PANEL_TEMPLATES: set[str] = {
355 "Character info",
356 "CJKV",
357 "French personal pronouns",
358 "French possessive adjectives",
359 "French possessive pronouns",
360 "Han etym",
361 "Japanese demonstratives",
362 "Latn-script",
363 "LDL",
364 "MW1913Abbr",
365 "Number-encoding",
366 "Nuttall",
367 "Spanish possessive adjectives",
368 "Spanish possessive pronouns",
369 "USRegionDisputed",
370 "Webster 1913",
371 "ase-rfr",
372 "attention",
373 "attn",
374 "beer",
375 "broken ref",
376 "ca-compass",
377 "character info",
378 "character info/var",
379 "checksense",
380 "compass-fi",
381 "copyvio suspected",
382 "delete",
383 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
384 "etystub",
385 "examples",
386 "hu-corr",
387 "hu-suff-pron",
388 "interwiktionary",
389 "ja-kanjitab",
390 "ja-kt",
391 "ko-hanja-search",
392 "look",
393 "maintenance box",
394 "maintenance line",
395 "mediagenic terms",
396 "merge",
397 "missing template",
398 "morse links",
399 "move",
400 "multiple images",
401 "no inline",
402 "picdic",
403 "picdicimg",
404 "picdiclabel",
405 "polyominoes",
406 "predidential nomics",
407 "punctuation", # This actually gets pre-expanded
408 "reconstructed",
409 "request box",
410 "rf-sound example",
411 "rfaccents",
412 "rfap",
413 "rfaspect",
414 "rfc",
415 "rfc-auto",
416 "rfc-header",
417 "rfc-level",
418 "rfc-pron-n",
419 "rfc-sense",
420 "rfclarify",
421 "rfd",
422 "rfd-redundant",
423 "rfd-sense",
424 "rfdate",
425 "rfdatek",
426 "rfdef",
427 "rfe",
428 "rfe/dowork",
429 "rfex",
430 "rfexp",
431 "rfform",
432 "rfgender",
433 "rfi",
434 "rfinfl",
435 "rfm",
436 "rfm-sense",
437 "rfp",
438 "rfp-old",
439 "rfquote",
440 "rfquote-sense",
441 "rfquotek",
442 "rfref",
443 "rfscript",
444 "rft2",
445 "rftaxon",
446 "rftone",
447 "rftranslit",
448 "rfv",
449 "rfv-etym",
450 "rfv-pron",
451 "rfv-quote",
452 "rfv-sense",
453 "selfref",
454 "split",
455 "stroke order", # XXX consider capturing this?
456 "stub entry",
457 "t-needed",
458 "tbot entry",
459 "tea room",
460 "tea room sense",
461 # "ttbc", - XXX needed in at least on/Preposition/Translation page
462 "unblock",
463 "unsupportedpage",
464 "video frames",
465 "was wotd",
466 "wrongtitle",
467 "zh-forms",
468 "zh-hanzi-box",
469 "no entry",
470}
472# Template name prefixes used for language-specific panel templates (i.e.,
473# templates that create side boxes or notice boxes or that should generally
474# be ignored).
475PANEL_PREFIXES: set[str] = {
476 "list:compass points/",
477 "list:Gregorian calendar months/",
478 "RQ:",
479}
481# Templates used for wikipedia links.
482wikipedia_templates: set[str] = {
483 "wikipedia",
484 "slim-wikipedia",
485 "w",
486 "W",
487 "swp",
488 "wiki",
489 "Wikipedia",
490 "wtorw",
491}
492for x in PANEL_PREFIXES & wikipedia_templates:
493 print(
494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
495 x
496 )
497 )
499# Mapping from a template name (without language prefix) for the main word
500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
501# it could validly occur. This is used as just a sanity check to give
502# warnings about probably incorrect coding in Wiktionary.
503template_allowed_pos_map: dict[str, list[str]] = {
504 "abbr": ["abbrev"],
505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
506 "plural noun": ["noun", "name"],
507 "plural-noun": ["noun", "name"],
508 "proper noun": ["noun", "name"],
509 "proper-noun": ["name", "noun"],
510 "prop": ["name", "noun"],
511 "verb": ["verb", "phrase"],
512 "gerund": ["verb"],
513 "particle": ["adv", "particle"],
514 "adj": ["adj", "adj_noun"],
515 "pron": ["pron", "noun"],
516 "name": ["name", "noun"],
517 "adv": ["adv", "intj", "conj", "particle"],
518 "phrase": ["phrase", "prep_phrase"],
519 "noun phrase": ["phrase"],
520 "ordinal": ["num"],
521 "number": ["num"],
522 "pos": ["affix", "name", "num"],
523 "suffix": ["suffix", "affix"],
524 "character": ["character"],
525 "letter": ["character"],
526 "kanji": ["character"],
527 "cont": ["abbrev"],
528 "interj": ["intj"],
529 "con": ["conj"],
530 "part": ["particle"],
531 "prep": ["prep", "postp"],
532 "postp": ["postp"],
533 "misspelling": ["noun", "adj", "verb", "adv"],
534 "part-form": ["verb"],
535}
536for k, v in template_allowed_pos_map.items():
537 for x in v:
538 if x not in PARTS_OF_SPEECH:
539 print(
540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
541 "".format(x, k, v)
542 )
543 assert False
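# For example, the "noun" entry above says that a head template such as
# {{fi-noun}} is only expected under sections whose part of speech resolves
# to "noun", "abbrev", "pron", "name", "num" or "adj_noun"; anything else
# would merit a warning.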
546# Templates ignored during etymology extraction, i.e., these will not be listed
547# in the extracted etymology templates.
548ignored_etymology_templates: list[str] = [
549 "...",
550 "IPAchar",
551 "ipachar",
552 "ISBN",
553 "isValidPageName",
554 "redlink category",
555 "deprecated code",
556 "check deprecated lang param usage",
557 "para",
558 "p",
559 "cite",
560 "Cite news",
561 "Cite newsgroup",
562 "cite paper",
563 "cite MLLM 1976",
564 "cite journal",
565 "cite news/documentation",
566 "cite paper/documentation",
567 "cite video game",
568 "cite video game/documentation",
569 "cite newsgroup",
570 "cite newsgroup/documentation",
571 "cite web/documentation",
572 "cite news",
573 "Cite book",
574 "Cite-book",
575 "cite book",
576 "cite web",
577 "cite-usenet",
578 "cite-video/documentation",
579 "Cite-journal",
580 "rfe",
581 "catlangname",
582 "cln",
583 "langname-lite",
584 "no deprecated lang param usage",
585 "mention",
586 "m",
587 "m-self",
588 "link",
589 "l",
590 "ll",
591 "l-self",
592]
593# Regexp for matching ignored etymology template names. This adds certain
594# prefixes to the names listed above.
595ignored_etymology_templates_re = re.compile(
596 r"^((cite-|R:|RQ:).*|"
597 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
598 + r")$"
599)
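# For example (illustrative): this regexp matches "cite-book",
# "R:Webster 1913" and "ISBN", but not "inh" or "der".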
601# Regexp for matching ignored descendants template names. Right now we just
602# copy the ignored etymology templates
603ignored_descendants_templates_re = ignored_etymology_templates_re
605# Set of template names that are used to define usage examples. If the usage
606# example contains one of these templates, then its type is set to
607# "example"
608usex_templates: set[str] = {
609 "afex",
610 "affixusex",
611 "co", # {{collocation}} acts like an example template, specifically for
612 # pairs of combinations of words that are more common than you'd
613 # expect to occur randomly; hlavní#Czech
614 "coi",
615 "collocation",
616 "el-example",
617 "el-x",
618 "example",
619 "examples",
620 "he-usex",
621 "he-x",
622 "hi-usex",
623 "hi-x",
624 "ja-usex-inline",
625 "ja-usex",
626 "ja-x",
627 "jbo-example",
628 "jbo-x",
629 "km-usex",
630 "km-x",
631 "ko-usex",
632 "ko-x",
633 "lo-usex",
634 "lo-x",
635 "ne-x",
636 "ne-usex",
637 "prefixusex",
638 "ryu-usex",
639 "ryu-x",
640 "shn-usex",
641 "shn-x",
642 "suffixusex",
643 "th-usex",
644 "th-x",
645 "ur-usex",
646 "ur-x",
647 "usex",
648 "usex-suffix",
649 "ux",
650 "uxi",
651}
653stop_head_at_these_templates: set[str] = {
654 "category",
655 "cat",
656 "topics",
657 "catlangname",
658 "c",
659 "C",
660 "top",
661 "cln",
662}
664# Set of template names that are used to define quotation examples. If the
665# usage example contains one of these templates, then its type is set to
666# "quotation".
667quotation_templates: set[str] = {
668 "collapse-quote",
669 "quote-av",
670 "quote-book",
671 "quote-GYLD",
672 "quote-hansard",
673 "quotei",
674 "quote-journal",
675 "quotelite",
676 "quote-mailing list",
677 "quote-meta",
678 "quote-newsgroup",
679 "quote-song",
680 "quote-text",
681 "quote",
682 "quote-us-patent",
683 "quote-video game",
684 "quote-web",
685 "quote-wikipedia",
686 "wikiquote",
687 "Wikiquote",
688 "Q",
689}
691taxonomy_templates = {
692 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
693 "taxfmt",
694 "taxlink",
695 "taxlink2",
696 "taxlinknew",
697 "taxlook",
698}
700# Template names; this was extracted from template_linkage_mappings,
701# because the code using template_linkage_mappings was actually not used
702# (but not removed).
703template_linkages_to_ignore_in_examples: set[str] = {
704 "syn",
705 "synonyms",
706 "ant",
707 "antonyms",
708 "hyp",
709 "hyponyms",
710 "der",
711 "derived terms",
712 "coordinate terms",
713 "cot",
714 "rel",
715 "col",
716 "inline alt forms",
717 "alti",
718 "comeronyms",
719 "holonyms",
720 "holo",
721 "hypernyms",
722 "hyper",
723 "meronyms",
724 "mero",
725 "troponyms",
726 "perfectives",
727 "pf",
728 "imperfectives",
729 "impf",
730 "syndiff",
731 "synsee",
732 # not linkage nor example templates
733 "sense",
734 "s",
735 "color panel",
736 "colour panel",
737}
739# Maps template name used in a word sense to a linkage field that it adds.
740sense_linkage_templates: dict[str, str] = {
741 "syn": "synonyms",
742 "synonyms": "synonyms",
743 "synsee": "synonyms",
744 "syndiff": "synonyms",
745 "hyp": "hyponyms",
746 "hyponyms": "hyponyms",
747 "ant": "antonyms",
748 "antonyms": "antonyms",
749 "alti": "related",
750 "inline alt forms": "related",
751 "coordinate terms": "coordinate_terms",
752 "cot": "coordinate_terms",
753 "comeronyms": "related",
754 "holonyms": "holonyms",
755 "holo": "holonyms",
756 "hypernyms": "hypernyms",
757 "hyper": "hypernyms",
758 "meronyms": "meronyms",
759 "mero": "meronyms",
760 "troponyms": "troponyms",
761 "perfectives": "related",
762 "pf": "related",
763 "imperfectives": "related",
764 "impf": "related",
765 "parasynonyms": "synonyms",
766 "par": "synonyms",
767 "parasyn": "synonyms",
768 "nearsyn": "synonyms",
769 "near-syn": "synonyms",
770}
772sense_linkage_templates_tags: dict[str, list[str]] = {
773 "alti": ["alternative"],
774 "inline alt forms": ["alternative"],
775 "comeronyms": ["comeronym"],
776 "perfectives": ["perfective"],
777 "pf": ["perfective"],
778 "imperfectives": ["imperfective"],
779 "impf": ["imperfective"],
780}
783def decode_html_entities(v: Union[str, int]) -> str:
784 """Decodes HTML entities from a value, converting them to the respective
785 Unicode characters/strings."""
786 if isinstance(v, int):
787 # I changed this to return str(v) instead of v = str(v),
788 # but there might have been the intention to have more logic
789 # here. html.unescape would not do anything special with an integer,
790 # it needs html escape symbols (&xx;).
791 return str(v)
792 return html.unescape(v)
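# For example: decode_html_entities("R&amp;D") returns "R&D", and
# decode_html_entities(42) returns "42".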
795def parse_sense_linkage(
796 wxr: WiktextractContext,
797 data: SenseData,
798 name: str,
799 ht: TemplateArgs,
800 pos: str,
801) -> None:
802 """Parses a linkage (synonym, etc) specified in a word sense."""
803 assert isinstance(wxr, WiktextractContext)
804 assert isinstance(data, dict)
805 assert isinstance(name, str)
806 assert isinstance(ht, dict)
807 field = sense_linkage_templates[name]
808 field_tags = sense_linkage_templates_tags.get(name, [])
809 for i in range(2, 20):
810 if i not in ht:
811 break
812 w = clean_node(wxr, data, ht[i])
813 if "#" in w:
814 w = w[: w.index("#")]
815 if w in ["", "<"]: # used in "hypernyms" template
816 continue
817 is_thesaurus = False
818 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
819 if w.startswith(alias):
820 is_thesaurus = True
821 w = w[len(alias) :]
822 if w != wxr.wtp.title:
823 from ...thesaurus import search_thesaurus
825 lang_code = clean_node(wxr, None, ht.get(1, ""))
826 for t_data in search_thesaurus(
827 wxr.thesaurus_db_conn, # type: ignore
828 w,
829 lang_code,
830 pos,
831 "synonyms", # GH issue #1570
832 ):
833 l_data: LinkageData = {
834 "word": t_data.term,
835 "source": "Thesaurus:" + w,
836 }
837 if len(t_data.tags) > 0:
838 l_data["tags"] = t_data.tags
839 if len(t_data.raw_tags) > 0:
840 l_data["raw_tags"] = t_data.raw_tags
841 data_append(data, field, l_data)
842 break
843 if is_thesaurus:
844 continue
845 tags: list[str] = []
846 topics: list[str] = []
847 english: Optional[str] = None
848 # Try to find qualifiers for this synonym
849 q = ht.get("q{}".format(i - 1))
850 if q:
851 cls = classify_desc(q)
852 if cls == "tags":
853 tagsets1, topics1 = decode_tags(q)
854 for ts in tagsets1:
855 tags.extend(ts)
856 topics.extend(topics1)
857 elif cls == "english":
858 if english:
859 english += "; " + q
860 else:
861 english = q
862 # Try to find English translation for this synonym
863 t = ht.get("t{}".format(i - 1))
864 if t:
865 if english:
866 english += "; " + t
867 else:
868 english = t
870 # See if the linkage contains a parenthesized alt
871 alt = None
872 m = re.search(r"\(([^)]+)\)$", w)
873 if m:
874 w = w[: m.start()].strip()
875 alt = m.group(1)
877 dt = {"word": w}
878 if field_tags:
879 data_extend(dt, "tags", field_tags)
880 if tags:
881 data_extend(dt, "tags", tags)
882 if topics:
883 data_extend(dt, "topics", topics)
884 if english:
885 dt["english"] = english # DEPRECATED for "translation"
886 dt["translation"] = english
887 if alt:
888 dt["alt"] = alt
889 data_append(data, field, dt)
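# Illustrative call (hypothetical arguments): for a sense line containing
# {{syn|en|automobile|motorcar|q1=formal}}, the template handler would call
# parse_sense_linkage(wxr, data, "syn",
#     {1: "en", 2: "automobile", 3: "motorcar", "q1": "formal"}, "noun"),
# appending entries like {"word": "automobile", "tags": ["formal"]} and
# {"word": "motorcar"} to data["synonyms"].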
892EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
893example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
894captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
897def synch_splits_with_args(
898 line: str, targs: TemplateArgs
899) -> Optional[list[str]]:
900 """If it looks like there's something weird with how a line of example
901 text has been split, this function will do the splitting after counting
902 occurrences of the splitting regex inside the two main template arguments
903 containing the string data for the original language example and the
904 English translations.
905 """
906 # Previously, we split without capturing groups, but here we want to
907 # keep the original splitting hyphen regex intact.
908 fparts = captured_splitters_re.split(line)
909 new_parts = []
910 # ["First", " – ", "second", " – ", "third..."] from OL argument
911 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
912 new_parts.append("".join(fparts[:first]))
913 # Translation argument
914 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
915 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
916 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
917 new_parts.append("".join(fparts[first + 1 : second]))
919 if all(new_parts): # no empty strings from the above spaghetti
920 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
921 return new_parts
922 else:
923 return None
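# Illustrative example (hypothetical values): if the cleaned line is
# "ichi ― ni ― one ― two" while the template's original-language argument
# (targs[2]) was "ichi ― ni" and its translation argument was "one ― two",
# a naive split would give four pieces; this function instead returns
# ["ichi ― ni", "one ― two"].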
926QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
927QUALIFIERS_RE = re.compile(QUALIFIERS)
928# (...): ... or (...(...)...): ...
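# For example: QUALIFIERS_RE.match("(transitive, informal): to snatch")
# captures "transitive, informal"; one level of nested parentheses is
# allowed, so "(lit. (word for word)): ..." captures "lit. (word for word)".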
931def parse_language(
932 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
933) -> list[WordData]:
934 """Iterates over the text of the page, returning words (parts-of-speech)
935 defined on the page one at a time. (Individual word senses for the
936 same part-of-speech are typically encoded in the same entry.)"""
937 # imported here to avoid circular import
938 from .pronunciation import parse_pronunciation
940 assert isinstance(wxr, WiktextractContext)
941 assert isinstance(langnode, WikiNode)
942 assert isinstance(language, str)
943 assert isinstance(lang_code, str)
944 # print("parse_language", language)
946 is_reconstruction = False
947 word: str = wxr.wtp.title # type: ignore[assignment]
948 unsupported_prefix = "Unsupported titles/"
949 if word.startswith(unsupported_prefix):
950 w = word[len(unsupported_prefix) :]
951 if w in unsupported_title_map:
952 word = unsupported_title_map[w]
953 else:
954 wxr.wtp.error(
955 "Unimplemented unsupported title: {}".format(word),
956 sortid="page/870",
957 )
958 word = w
959 elif word.startswith("Reconstruction:"):
960 word = word[word.find("/") + 1 :]
961 is_reconstruction = True
963 base_data: WordData = {
964 "word": word,
965 "lang": language,
966 "lang_code": lang_code,
967 }
968 if is_reconstruction:
969 data_append(base_data, "tags", "reconstruction")
970 sense_data: SenseData = {}
971 pos_data: WordData = {} # For a current part-of-speech
972 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
973 etym_data: WordData = {} # For one etymology
974 sense_datas: list[SenseData] = []
975 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
976 # Never reset, do not use as data
977 level_four_datas: list[WordData] = []
978 etym_datas: list[WordData] = []
979 page_datas: list[WordData] = []
980 have_etym = False
981 inside_level_four = False # This is for checking if the etymology section
982 # or article has a Pronunciation section, for Chinese mostly; because
983 # Chinese articles can have three level three sections (two etymology
984 # sections and pronunciation sections) one after another, we need a kludge
985 # to better keep track of whether we're in a normal "etym" or inside a
986 # "level four" (which is what we've turned the level three Pron sections
987 # into in fix_subtitle_hierarchy()); all other sections are demoted by
988 # a step.
989 stack: list[str] = [] # names of items on the "stack"
991 def merge_base(data: WordData, base: WordData) -> None:
992 for k, v in base.items():
993 # Copy the value to ensure that we don't share lists or
994 # dicts between structures (even nested ones).
995 v = copy.deepcopy(v)
996 if k not in data:
997 # The list was copied above, so this will not create shared ref
998 data[k] = v # type: ignore[literal-required]
999 continue
1000 if data[k] == v: # type: ignore[literal-required]
1001 continue
1002 if (
1003 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1004 or isinstance(
1005 v,
1006 (list, tuple), # Should this be "and"?
1007 )
1008 ):
1009 data[k] = list(data[k]) + list(v) # type: ignore
1010 elif data[k] != v: # type: ignore[literal-required]
1011 wxr.wtp.warning(
1012 "conflicting values for {} in merge_base: "
1013 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1014 sortid="page/904",
1015 )
1017 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1018 """Remove unnecessary keys from dict values
1019 in a list comprehension..."""
1020 if key in pron:
1021 pron.pop(key) # type: ignore
1022 return pron
1024 # If the result has sounds, eliminate sounds that have a prefix that
1025 # does not match "word" or one of "forms"
1026 if "sounds" in data and "word" in data:
1027 accepted = [data["word"]]
1028 accepted.extend(f["form"] for f in data.get("forms", dict()))
1029 data["sounds"] = list(
1030 s
1031 for s in data["sounds"]
1032 if "form" not in s or s["form"] in accepted
1033 )
1034 # If the result has sounds, eliminate sounds that have a pos that
1035 # does not match "pos"
1036 if "sounds" in data and "pos" in data:
1037 data["sounds"] = list(
1038 complementary_pop(s, "pos")
1039 for s in data["sounds"]
1040 # "pos" is not a field of SoundData, correctly, so we're
1041 # removing it here. It's a kludge on a kludge on a kludge.
1042 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1043 )
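    # Illustrative merge (hypothetical values): merging base
    # {"lang": "Finnish", "tags": ["reconstruction"]} into
    # {"lang": "Finnish", "tags": ["no-gloss"]} leaves "lang" untouched and
    # concatenates the lists into ["no-gloss", "reconstruction"]; conflicting
    # scalar values are left as they are in `data` and only logged as a
    # warning.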
1045 def push_sense(sorting_ordinal: int | None = None) -> bool:
1046 """Starts collecting data for a new word sense. This returns True
1047 if a sense was added."""
1048 nonlocal sense_data
1049 if sorting_ordinal is None:
1050 sorting_ordinal = sense_ordinal
1051 tags = sense_data.get("tags", ())
1052 if (
1053 not sense_data.get("glosses")
1054 and "translation-hub" not in tags
1055 and "no-gloss" not in tags
1056 ):
1057 return False
1059 if (
1060 (
1061 "participle" in sense_data.get("tags", ())
1062 or "infinitive" in sense_data.get("tags", ())
1063 )
1064 and "alt_of" not in sense_data
1065 and "form_of" not in sense_data
1066 and "etymology_text" in etym_data
1067 and etym_data["etymology_text"] != ""
1068 ):
1069 etym = etym_data["etymology_text"]
1070 etym = etym.split(". ")[0]
1071 ret = parse_alt_or_inflection_of(wxr, etym, set())
1072 if ret is not None:
1073 tags, lst = ret
1074 assert isinstance(lst, (list, tuple))
1075 if "form-of" in tags:
1076 data_extend(sense_data, "form_of", lst)
1077 data_extend(sense_data, "tags", tags)
1078 elif "alt-of" in tags:
1079 data_extend(sense_data, "alt_of", lst)
1080 data_extend(sense_data, "tags", tags)
1082 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
1083 "tags", ()
1084 ):
1085 data_append(sense_data, "tags", "no-gloss")
1087 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore
1088 sense_datas.append(sense_data)
1089 sense_data = {}
1090 return True
1092 def push_pos(sorting_ordinal: int | None = None) -> None:
1093 """Starts collecting data for a new part-of-speech."""
1094 nonlocal pos_data
1095 nonlocal sense_datas
1096 push_sense(sorting_ordinal)
1097 if wxr.wtp.subsection:
1098 data: WordData = {"senses": sense_datas}
1099 merge_base(data, pos_data)
1100 level_four_datas.append(data)
1101 pos_data = {}
1102 sense_datas = []
1103 wxr.wtp.start_subsection(None)
1105 def push_level_four_section(clear_sound_data: bool) -> None:
1106 """Starts collecting data for a new level four section, which
1107 is usually virtual and empty, unless the article has Chinese
1108 'Pronunciation' sections that are etymology-section-like but
1109 under etymology, and at the same level in the source. We modify
1110 the source to demote Pronunciation sections like that to level
1111 4, and other sections one step lower."""
1112 nonlocal level_four_data
1113 nonlocal level_four_datas
1114 nonlocal etym_datas
1115 push_pos()
1116 # print(f"======\n{etym_data=}")
1117 # print(f"======\n{etym_datas=}")
1118 # print(f"======\n{level_four_data=}")
1119 # print(f"======\n{level_four_datas=}")
1120 for data in level_four_datas:
1121 merge_base(data, level_four_data)
1122 etym_datas.append(data)
1123 for data in etym_datas:
1124 merge_base(data, etym_data)
1125 page_datas.append(data)
1126 if clear_sound_data:
1127 level_four_data = {}
1128 level_four_datas = []
1129 etym_datas = []
1131 def push_etym() -> None:
1132 """Starts collecting data for a new etymology."""
1133 nonlocal etym_data
1134 nonlocal etym_datas
1135 nonlocal have_etym
1136 nonlocal inside_level_four
1137 have_etym = True
1138 push_level_four_section(False)
1139 inside_level_four = False
1140 # the etymology section could be under a pronunciation section
1141 etym_data = (
1142 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1143 )
1145 def select_data() -> WordData:
1146 """Selects where to store data (pos or etym) based on whether we
1147 are inside a pos (part-of-speech)."""
1148 # print(f"{wxr.wtp.subsection=}")
1149 # print(f"{stack=}")
1150 if wxr.wtp.subsection is not None:
1151 return pos_data
1152 if inside_level_four:
1153 return level_four_data
1154 if stack[-1] == language:
1155 return base_data
1156 return etym_data
1158 term_label_templates: list[TemplateData] = []
1160 def head_post_template_fn(
1161 name: str, ht: TemplateArgs, expansion: str
1162 ) -> Optional[str]:
1163 """Handles special templates in the head section of a word. Head
1164 section is the text after part-of-speech subtitle and before word
1165 sense list. Typically it generates the bold line for the word, but
1166 may also contain other useful information that often ends up in
1167 side boxes. We want to capture some of that additional information."""
1168 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1169 if is_panel_template(wxr, name):
1170 # Completely ignore these templates (not even recorded in
1171 # head_templates)
1172 return ""
1173 if name == "head":
1174 # XXX are these also captured in forms? Should this special case
1175 # be removed?
1176 t = ht.get(2, "")
1177 if t == "pinyin":
1178 data_append(pos_data, "tags", "Pinyin")
1179 elif t == "romanization":
1180 data_append(pos_data, "tags", "romanization")
1181 if (
1182 HEAD_TAG_RE.search(name) is not None
1183 or name in WORD_LEVEL_HEAD_TEMPLATES
1184 ):
1185 args_ht = clean_template_args(wxr, ht)
1186 cleaned_expansion = clean_node(wxr, None, expansion)
1187 dt: TemplateData = {
1188 "name": name,
1189 "args": args_ht,
1190 "expansion": cleaned_expansion,
1191 }
1192 data_append(pos_data, "head_templates", dt)
1193 if name in WORD_LEVEL_HEAD_TEMPLATES:
1194 term_label_templates.append(dt)
1195 # Squash these, their tags are applied to the whole word,
1196 # and some cause problems like "term-label"
1197 return ""
1199 # The following are both captured in head_templates and parsed
1200 # separately
1202 if name in wikipedia_templates:
1203 # Note: various places expect to have content from wikipedia
1204 # templates, so cannot convert this to empty
1205 parse_wikipedia_template(wxr, pos_data, ht)
1206 return None
1208 if name == "number box":
1209 # XXX extract numeric value?
1210 return ""
1211 if name == "enum":
1212 # XXX extract?
1213 return ""
1214 if name == "cardinalbox":
1215 # XXX extract similar to enum?
1216 # XXX this can also occur in top-level under language
1217 return ""
1218 if name == "Han simplified forms":
1219 # XXX extract?
1220 return ""
1221 # if name == "ja-kanji forms":
1222 # # XXX extract?
1223 # return ""
1224 # if name == "vi-readings":
1225 # # XXX extract?
1226 # return ""
1227 # if name == "ja-kanji":
1228 # # XXX extract?
1229 # return ""
1230 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
1231 # XXX extract?
1232 return ""
1234 return None
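    # Illustrative behaviour (hypothetical input): for {{tlb|en|slang}} this
    # returns "" so the label does not leak into the head text, while the
    # template is recorded in pos_data["head_templates"] and in
    # term_label_templates for later decoding into sense tags.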
1236 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1237 """Parses the subsection for a part-of-speech under a language on
1238 a page."""
1239 assert isinstance(posnode, WikiNode)
1240 assert isinstance(pos, str)
1241 # print("parse_part_of_speech", pos)
1242 pos_data["pos"] = pos
1243 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1244 lists: list[list[WikiNode]] = [[]] # list of lists
1245 first_para = True
1246 first_head_tmplt = True
1247 collecting_head = True
1248 start_of_paragraph = True
1250 # XXX extract templates from posnode with recursively_extract
1251 # that break stuff, like ja-kanji or az-suffix-form.
1252 # Do the extraction with a list of template names, combined from
1253 # different lists, then separate out them into different lists
1254 # that are handled at different points of the POS section.
1255 # First, extract az-suffix-form, put it in `inflection`,
1256 # and parse `inflection`'s content when appropriate later.
1257 # The contents of az-suffix-form (and ja-kanji) that generate
1258 # divs with "floatright" in their style gets deleted by
1259 # clean_value, so templates that slip through from here won't
1260 # break anything.
1261 # XXX bookmark
1262 # print("===================")
1263 # print(posnode.children)
1265 floaters, poschildren = recursively_extract(
1266 posnode.children,
1267 lambda x: (
1268 isinstance(x, WikiNode)
1269 and (
1270 (
1271 isinstance(x, TemplateNode)
1272 and x.template_name in FLOATING_TABLE_TEMPLATES
1273 )
1274 or (
1275 x.kind == NodeKind.LINK
1276 # Need to check for stringiness because some links are
1277 # broken; for example, if a template is missing an
1278 # argument, a link might look like `[[{{{1}}}...]]`
1279 and len(x.largs) > 0
1280 and len(x.largs[0]) > 0
1281 and isinstance(x.largs[0][0], str)
1282 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1283 )
1284 )
1285 ),
1286 )
1287 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1288 tempnode.largs = [["Inflection"]]
1289 tempnode.children = floaters
1290 parse_inflection(tempnode, "Floating Div", pos)
1291 # print(poschildren)
1292 # XXX new above
1294 if not poschildren:
1295 if not floaters:
1296 wxr.wtp.debug(
1297 "PoS section without contents",
1298 sortid="en/page/1051/20230612",
1299 )
1300 else:
1301 wxr.wtp.debug(
1302 "PoS section without contents except for a floating table",
1303 sortid="en/page/1056/20230612",
1304 )
1305 return
1307 for node in poschildren:
1308 if isinstance(node, str):
1309 for m in re.finditer(r"\n+|[^\n]+", node):
1310 p = m.group(0)
1311 if p.startswith("\n\n") and pre:
1312 first_para = False
1313 start_of_paragraph = True
1314 break
1315 if p and collecting_head:
1316 pre[-1].append(p)
1317 continue
1318 assert isinstance(node, WikiNode)
1319 kind = node.kind
1320 if kind == NodeKind.LIST:
1321 lists[-1].append(node)
1322 collecting_head = False
1323 start_of_paragraph = True
1324 continue
1325 elif kind in LEVEL_KINDS:
1326 # Stop parsing section if encountering any kind of
1327 # level header (like ===Noun=== or ====Further Reading====).
1328 # At a quick glance, this should be the default behavior,
1329 # but if some kinds of source articles have sub-sub-sections
1330 # that should be parsed XXX it should be handled by changing
1331 # this break.
1332 break
1333 elif collecting_head and kind == NodeKind.LINK:
1334 # We might collect relevant links as they are often pictures
1335 # relating to the word
1336 if len(node.largs[0]) >= 1 and isinstance(
1337 node.largs[0][0], str
1338 ):
1339 if node.largs[0][0].startswith(
1340 ns_title_prefix_tuple(wxr, "Category")
1341 ):
1342 # [[Category:...]]
1343 # We're at the end of the file, probably, so stop
1344 # here. Otherwise the head will get garbage.
1345 break
1346 if node.largs[0][0].startswith(
1347 ns_title_prefix_tuple(wxr, "File")
1348 ):
1349 # Skips file links
1350 continue
1351 start_of_paragraph = False
1352 pre[-1].extend(node.largs[-1])
1353 elif kind == NodeKind.HTML:
1354 if node.sarg == "br":
1355 if pre[-1]:
1356 pre.append([]) # Switch to next head
1357 lists.append([]) # Lists parallels pre
1358 collecting_head = True
1359 start_of_paragraph = True
1360 elif collecting_head and node.sarg not in (
1361 "gallery",
1362 "ref",
1363 "cite",
1364 "caption",
1365 ):
1366 start_of_paragraph = False
1367 pre[-1].append(node)
1368 else:
1369 start_of_paragraph = False
1370 elif isinstance(node, TemplateNode):
1371 # XXX Insert code here that disambiguates between
1372 # templates that generate word heads and templates
1373 # that don't.
1374 # There's head_tag_re that seems like a regex meant
1375 # to identify head templates. Too bad it's None.
1377 # ignore {{category}}, {{cat}}... etc.
1378 if node.template_name in stop_head_at_these_templates:
1379 # we've reached a template that should be at the end,
1380 continue
1382 # skip these templates; panel_templates is already used
1383 # to skip certain templates else, but it also applies to
1384 # head parsing quite well.
1385 # node.largs[0][0] should always be str, but can't type-check
1386 # that.
1387 if is_panel_template(wxr, node.template_name):
1388 continue
1389 # skip these templates
1390 # if node.largs[0][0] in skip_these_templates_in_head:
1391 # first_head_tmplt = False # no first_head_tmplt at all
1392 # start_of_paragraph = False
1393 # continue
1395 if first_head_tmplt and pre[-1]:
1396 first_head_tmplt = False
1397 start_of_paragraph = False
1398 pre[-1].append(node)
1399 elif pre[-1] and start_of_paragraph:
1400 pre.append([]) # Switch to the next head
1401 lists.append([]) # lists parallel pre
1402 collecting_head = True
1403 start_of_paragraph = False
1404 pre[-1].append(node)
1405 else:
1406 pre[-1].append(node)
1407 elif first_para:
1408 start_of_paragraph = False
1409 if collecting_head:
1410 pre[-1].append(node)
1411 # XXX use template_fn in clean_node to check that the head macro
1412 # is compatible with the current part-of-speech and generate warning
1413 # if not. Use template_allowed_pos_map.
1415 # Clean up empty pairs, and fix messes with extra newlines that
1416 # separate templates that are followed by lists wiktextract issue #314
1418 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1419 cleaned_lists: list[list[WikiNode]] = []
1420 pairless_pre_index = None
1422 for pre1, ls in zip(pre, lists):
1423 if pre1 and not ls:
1424 pairless_pre_index = len(cleaned_pre)
1425 if not pre1 and not ls:
1426 # skip [] + []
1427 continue
1428 if not ls and all(
1429 (isinstance(x, str) and not x.strip()) for x in pre1
1430 ):
1431 # skip ["\n", " "] + []
1432 continue
1433 if ls and not pre1:
1434 if pairless_pre_index is not None:
1435 cleaned_lists[pairless_pre_index] = ls
1436 pairless_pre_index = None
1437 continue
1438 cleaned_pre.append(pre1)
1439 cleaned_lists.append(ls)
1441 pre = cleaned_pre
1442 lists = cleaned_lists
1444 there_are_many_heads = len(pre) > 1
1445 header_tags: list[str] = []
1446 header_topics: list[str] = []
1447 previous_head_had_list = False
1449 if not any(g for g in lists):
1450 process_gloss_without_list(
1451 poschildren, pos, pos_data, header_tags, header_topics
1452 )
1453 else:
1454 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1455 # if len(ls) == 0:
1456 # # don't have gloss list
1457 # # XXX add code here to filter out 'garbage', like text
1458 # # that isn't a head template or head.
1459 # continue
1461 if all(not sl for sl in lists[i:]):
1462 if i == 0:
1463 if isinstance(node, str):
1464 wxr.wtp.debug(
1465 "first head without list of senses,"
1466 "string: '{}[...]', {}/{}".format(
1467 node[:20], word, language
1468 ),
1469 sortid="page/1689/20221215",
1470 )
1471 if isinstance(node, WikiNode):
1472 if node.largs and node.largs[0][0] in [
1473 "Han char",
1474 ]:
1475 # just ignore these templates
1476 pass
1477 else:
1478 wxr.wtp.debug(
1479 "first head without "
1480 "list of senses, "
1481 "template node "
1482 "{}, {}/{}".format(
1483 node.largs, word, language
1484 ),
1485 sortid="page/1694/20221215",
1486 )
1487 else:
1488 wxr.wtp.debug(
1489 "first head without list of senses, "
1490 "{}/{}".format(word, language),
1491 sortid="page/1700/20221215",
1492 )
1493 # no break here so that the first head always
1494 # gets processed.
1495 else:
1496 if isinstance(node, str):
1497 wxr.wtp.debug(
1498 "later head without list of senses,"
1499 "string: '{}[...]', {}/{}".format(
1500 node[:20], word, language
1501 ),
1502 sortid="page/1708/20221215",
1503 )
1504 if isinstance(node, WikiNode):
1505 wxr.wtp.debug(
1506 "later head without list of senses,"
1507 "template node "
1508 "{}, {}/{}".format(
1509 node.sarg if node.sarg else node.largs,
1510 word,
1511 language,
1512 ),
1513 sortid="page/1713/20221215",
1514 )
1515 else:
1516 wxr.wtp.debug(
1517 "later head without list of senses, "
1518 "{}/{}".format(word, language),
1519 sortid="page/1719/20221215",
1520 )
1521 break
1522 head_group = i + 1 if there_are_many_heads else None
1523 # print("parse_part_of_speech: {}: {}: pre={}"
1524 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1526 if previous_head_had_list:
1527 # We use a boolean flag here because we want to be able to
1528 # let the header_tags data pass through after the loop
1529 # is over without accidentally emptying it, if there are
1530 # no pos_datas and we need a dummy data.
1531 header_tags.clear()
1532 header_topics.clear()
1534 process_gloss_header(
1535 pre1, pos, head_group, pos_data, header_tags, header_topics
1536 )
1537 for ln in ls:
1538 # Parse each list associated with this head.
1539 for node in ln.children:
1540 # Parse nodes in l.children recursively.
1541 # The recursion function uses push_sense() to
1542 # add stuff into sense_datas, and returns True or
1543 # False if something is added, which bubbles upward.
1544 # If the bubble is "True", then higher levels of
1545 # the recursion will not push_sense(), because
1546 # the data is already pushed into a sub-gloss
1547 # downstream, unless the higher level has examples
1548 # that need to be put somewhere.
1549 common_data: SenseData = {
1550 "tags": list(header_tags),
1551 "topics": list(header_topics),
1552 }
1553 if head_group:
1554 common_data["head_nr"] = head_group
1555 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1557 if len(ls) > 0:
1558 previous_head_had_list = True
1559 else:
1560 previous_head_had_list = False
1562 # If there are no senses extracted, add a dummy sense. We want to
1563 # keep tags extracted from the head for the dummy sense.
1564 push_sense() # Make sure unfinished data pushed, and start clean sense
1565 if len(sense_datas) == 0:
1566 data_extend(sense_data, "tags", header_tags)
1567 data_extend(sense_data, "topics", header_topics)
1568 data_append(sense_data, "tags", "no-gloss")
1569 push_sense()
1571 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) # type: ignore
1573 for sd in sense_datas:
1574 if "__temp_sense_sorting_ordinal" in sd:
1575 del sd["__temp_sense_sorting_ordinal"] # type: ignore
1577 def process_gloss_header(
1578 header_nodes: list[Union[WikiNode, str]],
1579 pos_type: str,
1580 header_group: Optional[int],
1581 pos_data: WordData,
1582 header_tags: list[str],
1583 header_topics: list[str],
1584 ) -> None:
1585 ruby = []
1586 links: list[str] = []
1588 # process template parse nodes here
1589 new_nodes = []
1590 info_template_data = []
1591 for node in header_nodes:
1592 # print(f"{node=}")
1593 info_data, info_out = parse_info_template_node(wxr, node, "head")
1594 if info_data or info_out:
1595 if info_data:
1596 info_template_data.append(info_data)
1597 if info_out: # including just the original node
1598 new_nodes.append(info_out)
1599 else:
1600 new_nodes.append(node)
1601 header_nodes = new_nodes
1603 if info_template_data:
1604 if "info_templates" not in pos_data:
1605 pos_data["info_templates"] = info_template_data
1606 else:
1607 pos_data["info_templates"].extend(info_template_data)
1609 if not word.isalnum():
1610 # `-` is kosher, add more of these if needed.
1611 if word.replace("-", "").isalnum():
1612 pass
1613 else:
1614 # if the word contains non-letter or -number characters, it
1615 # might have something that messes with split-at-semi-comma; we
1616 # collect links so that we can skip splitting them.
1617 exp = wxr.wtp.parse(
1618 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1619 )
1620 link_nodes, _ = recursively_extract(
1621 exp.children,
1622 lambda x: isinstance(x, WikiNode)
1623 and x.kind == NodeKind.LINK,
1624 )
1625 for ln in link_nodes:
1626 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1627 if not ltext.isalnum():
1628 links.append(ltext)
1629 if word not in links:
1630 links.append(word)
1632 if lang_code == "ja":
1633 exp = wxr.wtp.parse(
1634 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1635 )
1636 rub, _ = recursively_extract(
1637 exp.children,
1638 lambda x: isinstance(x, WikiNode)
1639 and x.kind == NodeKind.HTML
1640 and x.sarg == "ruby",
1641 )
1642 if rub is not None:
1643 for r in rub:
1644 if TYPE_CHECKING:
1645 # we know the lambda above in recursively_extract
1646 # returns only WikiNodes in rub
1647 assert isinstance(r, WikiNode)
1648 rt = parse_ruby(wxr, r)
1649 if rt is not None:
1650 ruby.append(rt)
1651 elif lang_code == "vi":
1652 # Handle vi-readings templates that have a weird structure for
1653 # Chu Nom Vietnamese character heads
1654 # https://en.wiktionary.org/wiki/Template:vi-readings
1655 new_header_nodes = []
1656 related_readings: list[LinkageData] = []
1657 for node in header_nodes:
1658 if (
1659 isinstance(node, TemplateNode)
1660 and node.template_name == "vi-readings"
1661 ):
1662 # print(node.template_parameters)
1663 for parameter, tag in (
1664 ("hanviet", "han-viet-reading"),
1665 ("nom", "nom-reading"),
1666 # we ignore the fanqie parameter "phienthiet"
1667 ):
1668 arg = node.template_parameters.get(parameter)
1669 if arg is not None:
1670 text = clean_node(wxr, None, arg)
1671 for w in text.split(","):
1672 # ignore - separated references
1673 if "-" in w:
1674 w = w[: w.index("-")]
1675 w = w.strip()
1676 related_readings.append(
1677 LinkageData(word=w, tags=[tag])
1678 )
1679 continue
1681 # Skip the vi-reading template for the rest of the head parsing
1682 new_header_nodes.append(node)
1683 if len(related_readings) > 0:
1684 data_extend(pos_data, "related", related_readings)
1685 header_nodes = new_header_nodes
1687 header_text = clean_node(
1688 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1689 )
1691 if not header_text.strip():
1692 return
1694 term_label_tags: list[str] = []
1695 term_label_topics: list[str] = []
1696 if len(term_label_templates) > 0:
1697 # parse term label templates; if there are other similar kinds
1698 # of templates in headers that you want to squash and apply as
1699 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1700 for templ_data in term_label_templates:
1701 # print(templ_data)
1702 expan = templ_data.get("expansion", "").strip("().,; ")
1703 if not expan:
1704 continue
1705 tlb_tagsets, tlb_topics = decode_tags(expan)
1706 for tlb_tags in tlb_tagsets:
1707 if len(tlb_tags) > 0 and not any(
1708 t.startswith("error-") for t in tlb_tags
1709 ):
1710 term_label_tags.extend(tlb_tags)
1711 term_label_topics.extend(tlb_topics)
1712 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1714 header_text = re.sub(r"\s+", " ", header_text)
1715 # print(f"{header_text=}")
1716 parse_word_head(
1717 wxr,
1718 pos_type,
1719 header_text,
1720 pos_data,
1721 is_reconstruction,
1722 header_group,
1723 ruby=ruby,
1724 links=links,
1725 )
1726 if "tags" in pos_data:
1727 # pos_data can get "tags" data from some source; type-checkers
1728 # doesn't like it, so let's ignore it.
1729 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1730 del pos_data["tags"] # type: ignore[typeddict-item]
1731 if len(term_label_tags) > 0:
1732 header_tags.extend(term_label_tags)
1733 if len(term_label_topics) > 0:
1734 header_topics.extend(term_label_topics)
1736 def process_gloss_without_list(
1737 nodes: list[Union[WikiNode, str]],
1738 pos_type: str,
1739 pos_data: WordData,
1740 header_tags: list[str],
1741 header_topics: list[str],
1742 ) -> None:
1743 # gloss text might not be inside a list
1744 header_nodes: list[Union[str, WikiNode]] = []
1745 gloss_nodes: list[Union[str, WikiNode]] = []
1746 for node in strip_nodes(nodes):
1747 if isinstance(node, WikiNode):
1748 if isinstance(node, TemplateNode):
1749 if node.template_name in (
1750 "zh-see",
1751 "ja-see",
1752 "ja-see-kango",
1753 ):
1754 continue # soft redirect
1755 elif (
1756 node.template_name == "head"
1757 or node.template_name.startswith(f"{lang_code}-")
1758 ):
1759 header_nodes.append(node)
1760 continue
1761 elif node.kind in LEVEL_KINDS: # following nodes are not gloss
1762 break
1763 gloss_nodes.append(node)
1765 if len(header_nodes) > 0:
1766 process_gloss_header(
1767 header_nodes,
1768 pos_type,
1769 None,
1770 pos_data,
1771 header_tags,
1772 header_topics,
1773 )
1774 if len(gloss_nodes) > 0:
1775 process_gloss_contents(
1776 gloss_nodes,
1777 pos_type,
1778 {"tags": list(header_tags), "topics": list(header_topics)},
1779 )
1781 def parse_sense_node(
1782 node: Union[str, WikiNode], # never receives str
1783 sense_base: SenseData,
1784 pos: str,
1785 ) -> bool:
1786 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1787 Uses push_sense() to attempt adding data to pos_data in the scope
1788 of parse_language() when it reaches deep in the recursion. push_sense()
1789 returns True if it succeeds, and that is bubbled up the stack; if
1790 a sense was added downstream, the higher levels (whose shared data
1791 was already added by a subsense) do not push_sense(), unless it
1792 has examples that need to be put somewhere.
1793 """
1794 assert isinstance(sense_base, dict) # Added to every sense deeper in
1796 nonlocal sense_ordinal
1797 my_ordinal = sense_ordinal # copies, not a reference
1798 sense_ordinal += 1 # only use for sorting
1800 if not isinstance(node, WikiNode):
1801 # This doesn't seem to ever happen in practice.
1802 wxr.wtp.debug(
1803 "{}: parse_sense_node called with"
1804 "something that isn't a WikiNode".format(pos),
1805 sortid="page/1287/20230119",
1806 )
1807 return False
1809 if node.kind != NodeKind.LIST_ITEM:
1810 wxr.wtp.debug(
1811 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1812 )
1813 return False
1815 if node.sarg == ":":
1816 # Skip example entries at the highest level, ones without
1817 # a sense ("...#") above them.
1818 # If node.sarg is exactly and only ":", then it's at
1819 # the highest level; lower levels would have more
1820 # "indentation", like "#:" or "##:"
1821 return False
1823 # If a recursion call succeeds in push_sense(), bubble it up with
1824 # `added`.
1825 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1826 added = False
1828 gloss_template_args: set[str] = set()
1830 # For LISTs and LIST_ITEMS, their argument is something like
1831 # "##" or "##:", and using that we can rudimentarily determine
1832 # list 'depth' if need be, and also what kind of list or
1833 # entry it is; # is for normal glosses, : for examples (indent)
1834 # and * is used for quotations on wiktionary.
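# Illustrative sketch (not in the original source): for wikitext like
#   # first gloss
#   ## nested subgloss
#   #: usage example
#   #* quotation
# the LIST/LIST_ITEM .sarg values are "#", "##", "#:" and "#*"
# respectively, so both the depth and the kind of entry can be read
# straight off the string.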
1835 current_depth = node.sarg
1837 children = node.children
1839 # subentries, (presumably) a list
1840 # of subglosses below this. The list's
1841 # argument ends with #, and its depth should
1842 # be bigger than the parent node's.
1843 subentries = [
1844 x
1845 for x in children
1846 if isinstance(x, WikiNode)
1847 and x.kind == NodeKind.LIST
1848 and x.sarg == current_depth + "#"
1849 ]
1851 # sublists of examples and quotations. .sarg
1852 # does not end with "#".
1853 others = [
1854 x
1855 for x in children
1856 if isinstance(x, WikiNode)
1857 and x.kind == NodeKind.LIST
1858 and x.sarg != current_depth + "#"
1859 ]
1861 # the actual contents of this particular node.
1862 # can be a gloss (or a template that expands into
1863 # many glosses which we can't easily pre-expand)
1864 # or could be an "outer gloss" with more specific
1865 # subglosses, or could be a qualifier for the subglosses.
1866 contents = [
1867 x
1868 for x in children
1869 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1870 ]
1871 # If this entry has sublists of entries, we should combine
1872 # gloss information from both the "outer" and sublist content.
1873 # Sometimes the outer gloss
1874 # is more of a non-gloss or a set of tags, sometimes it is a coarse sense
1875 # and the inner glosses are more specific. The outer one
1876 # does not seem to have qualifiers.
1878 # If we have one sublist with one element, treat it
1879 # specially as it may be a Wiktionary error; raise
1880 # that nested element to the same level.
1881 # XXX If need be, this block can be easily removed in
1882 # the current recursive logic, and the result is one sense entry
1883 # with both glosses in the glosses list, as you would
1884 # expect. If the higher entry has examples, there will
1885 # be a higher entry with some duplicated data.
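# Hypothetical illustration of the case handled below (not from the
# original source): wikitext such as
#   # (obsolete)
#   ## The single real definition.
# yields one subentry with a single child; that child is raised to the
# same level as the outer item and parsed there with the shared
# sense_base.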
1886 if len(subentries) == 1:
1887 slc = subentries[0].children
1888 if len(slc) == 1:
1889 # copy current node and modify it so it doesn't
1890 # loop infinitely.
1891 cropped_node = copy.copy(node)
1892 cropped_node.children = [
1893 x
1894 for x in children
1895 if not (
1896 isinstance(x, WikiNode)
1897 and x.kind == NodeKind.LIST
1898 and x.sarg == current_depth + "#"
1899 )
1900 ]
1901 added |= parse_sense_node(cropped_node, sense_base, pos)
1902 nonlocal sense_data # kludge: without resetting sense_data
1903 # here, raw_glosses data gets duplicated;
1904 # if the top-level (cropped_node)
1905 # does not push_sense() properly or
1906 # parse_sense_node() returns early,
1907 # sense_data is not reset. This happens
1908 # for example when you have a no-gloss
1909 # string like "(intransitive)":
1910 # no gloss, push_sense() returns early
1911 # and sense_data has duplicate data with
1912 # sense_base
1913 sense_data = {}
1914 added |= parse_sense_node(slc[0], sense_base, pos)
1915 return added
1917 return process_gloss_contents(
1918 contents,
1919 pos,
1920 sense_base,
1921 subentries,
1922 others,
1923 gloss_template_args,
1924 added,
1925 my_ordinal,
1926 )
1928 def process_gloss_contents(
1929 contents: list[Union[str, WikiNode]],
1930 pos: str,
1931 sense_base: SenseData,
1932 subentries: list[WikiNode] = [],
1933 others: list[WikiNode] = [],
1934 gloss_template_args: Set[str] = set(),
1935 added: bool = False,
1936 sorting_ordinal: int | None = None,
1937 ) -> bool:
1938 def sense_template_fn(
1939 name: str, ht: TemplateArgs, is_gloss: bool = False
1940 ) -> Optional[str]:
1941 # print(f"sense_template_fn: {name}, {ht}")
1942 if name in wikipedia_templates:
1943 # parse_wikipedia_template(wxr, pos_data, ht)
1944 return None
1945 if is_panel_template(wxr, name):
1946 return ""
1947 if name in INFO_TEMPLATE_FUNCS:
1948 info_data, info_exp = parse_info_template_arguments(
1949 wxr, name, ht, "sense"
1950 )
1951 if info_data or info_exp: 1951 ↛ 1957line 1951 didn't jump to line 1957 because the condition on line 1951 was always true
1952 if info_data: 1952 ↛ 1954line 1952 didn't jump to line 1954 because the condition on line 1952 was always true
1953 data_append(sense_base, "info_templates", info_data)
1954 if info_exp and isinstance(info_exp, str): 1954 ↛ 1956line 1954 didn't jump to line 1956 because the condition on line 1954 was always true
1955 return info_exp
1956 return ""
1957 if name in ("defdate",):
1958 date = clean_node(wxr, None, ht.get(1, ()))
1959 if part_two := ht.get(2): 1959 ↛ 1961line 1959 didn't jump to line 1961 because the condition on line 1959 was never true
1960 # Unicode en dash (U+2013), not '-'
1961 date += "–" + clean_node(wxr, None, part_two)
1962 refs: dict[str, ReferenceData] = {}
1963 # ref, refn, ref2, ref2n, ref3, ref3n
1964 # ref1 not valid
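# Assumed argument shape (illustration only, not from the original
# source): a call like {{defdate|from 15th c.|ref=Smith 2010|refn=fn1}}
# would give date == "from 15th c." and
# refs == {"": {"text": "Smith 2010", "refn": "fn1"}}, i.e. one
# AttestationData entry carrying a single reference.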
1965 for k, v in sorted(
1966 (k, v) for k, v in ht.items() if isinstance(k, str)
1967 ):
1968 if m := re.match(r"ref(\d?)(n?)", k): 1968 ↛ 1965line 1968 didn't jump to line 1965 because the condition on line 1968 was always true
1969 ref_v = clean_node(wxr, None, v)
1970 if m.group(1) not in refs: # empty string or digit
1971 refs[m.group(1)] = ReferenceData()
1972 if m.group(2):
1973 refs[m.group(1)]["refn"] = ref_v
1974 else:
1975 refs[m.group(1)]["text"] = ref_v
1976 data_append(
1977 sense_base,
1978 "attestations",
1979 AttestationData(date=date, references=list(refs.values())),
1980 )
1981 return ""
1982 if name == "senseid":
1983 langid = clean_node(wxr, None, ht.get(1, ()))
1984 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1985 if re.match(r"Q\d+$", arg):
1986 data_append(sense_base, "wikidata", arg)
1987 data_append(sense_base, "senseid", langid + ":" + arg)
1988 if name in sense_linkage_templates:
1989 # print(f"SENSE_TEMPLATE_FN: {name}")
1990 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1991 return ""
1992 if name == "†" or name == "zh-obsolete":
1993 data_append(sense_base, "tags", "obsolete")
1994 return ""
1995 if name in {
1996 "ux",
1997 "uxi",
1998 "usex",
1999 "afex",
2000 "prefixusex",
2001 "ko-usex",
2002 "ko-x",
2003 "hi-x",
2004 "ja-usex-inline",
2005 "ja-x",
2006 "quotei",
2007 "he-x",
2008 "hi-x",
2009 "km-x",
2010 "ne-x",
2011 "shn-x",
2012 "th-x",
2013 "ur-x",
2014 }:
2015 # Usage examples are captured separately below. We don't
2016 # want to expand them into glosses even when unusual coding
2017 # is used in the entry.
2018 # These templates may slip through inside another item, but
2019 # currently we're separating out example entries (..#:)
2020 # well enough that there seems to be very little contamination.
2021 if is_gloss:
2022 wxr.wtp.wiki_notice(
2023 "Example template is used for gloss text",
2024 sortid="extractor.en.page.sense_template_fn/1415",
2025 )
2026 else:
2027 return ""
2028 if name == "w": 2028 ↛ 2029line 2028 didn't jump to line 2029 because the condition on line 2028 was never true
2029 if ht.get(2) == "Wp":
2030 return ""
2031 for v in ht.values():
2032 v = v.strip()
2033 if v and "<" not in v:
2034 gloss_template_args.add(v)
2035 return None
2037 def extract_link_texts(item: GeneralNode) -> None:
2038 """Recursively extracts link texts from the gloss source. This
2039 information is used to select whether to remove final "." from
2040 form_of/alt_of (e.g., ihm/Hunsrik)."""
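# Rough behaviour sketch (assumed, not from the original source):
#   string "see [[man/translations|man]]"  -> adds "man"
#   a LINK node for [[ihm]]                -> adds "ihm"
# These collected strings later let parse_alt_or_inflection_of decide
# whether a trailing "." belongs to the link text.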
2041 if isinstance(item, (list, tuple)):
2042 for x in item:
2043 extract_link_texts(x)
2044 return
2045 if isinstance(item, str):
2046 # There seem to be HTML sections that may further contain
2047 # unparsed links.
2048 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the loop on line 2048 never started
2049 print("ITER:", m.group(0))
2050 v = m.group(1).split("|")[-1].strip()
2051 if v:
2052 gloss_template_args.add(v)
2053 return
2054 if not isinstance(item, WikiNode): 2054 ↛ 2055line 2054 didn't jump to line 2055 because the condition on line 2054 was never true
2055 return
2056 if item.kind == NodeKind.LINK:
2057 v = item.largs[-1]
2058 if ( 2058 ↛ 2064line 2058 didn't jump to line 2064 because the condition on line 2058 was always true
2059 isinstance(v, list)
2060 and len(v) == 1
2061 and isinstance(v[0], str)
2062 ):
2063 gloss_template_args.add(v[0].strip())
2064 for x in item.children:
2065 extract_link_texts(x)
2067 extract_link_texts(contents)
2069 # get the raw text of non-list contents of this node, and other stuff
2070 # like tag and category data added to sense_base
2071 # cast = no-op type-setter for the type-checker
2072 partial_template_fn = cast(
2073 TemplateFnCallable,
2074 partial(sense_template_fn, is_gloss=True),
2075 )
2076 rawgloss = clean_node(
2077 wxr,
2078 sense_base,
2079 contents,
2080 template_fn=partial_template_fn,
2081 collect_links=True,
2082 )
2084 if not rawgloss: 2084 ↛ 2085line 2084 didn't jump to line 2085 because the condition on line 2084 was never true
2085 return False
2087 # remove manually typed ordered list text at the start ("1. ")
2088 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2090 # get stuff like synonyms and categories from "others",
2091 # maybe examples and quotations
2092 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2094 # The gloss could contain templates that produce more list items.
2095 # This happens commonly with, e.g., {{inflection of|...}}. Split
2096 # to parts. However, e.g. Interlingua generates multiple glosses
2097 # in HTML directly without Wikitext markup, so we must also split
2098 # by just newlines.
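# Illustrative case (hypothetical arguments, not from the original
# source): a gloss consisting only of
# {{inflection of|la|amare||2|s|pres|actv|indc}} may expand to several
# lines, some starting with "#"; splitting on newlines (and re-parsing
# below when "#" lines are present) turns the expansion into separate
# subglosses/subentries.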
2099 subglosses = rawgloss.splitlines()
2101 if len(subglosses) == 0: 2101 ↛ 2102line 2101 didn't jump to line 2102 because the condition on line 2101 was never true
2102 return False
2104 if any(s.startswith("#") for s in subglosses):
2105 subtree = wxr.wtp.parse(rawgloss)
2106 # from wikitextprocessor.parser import print_tree
2107 # print("SUBTREE GENERATED BY TEMPLATE:")
2108 # print_tree(subtree)
2109 new_subentries = [
2110 x
2111 for x in subtree.children
2112 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2113 ]
2115 new_others = [
2116 x
2117 for x in subtree.children
2118 if isinstance(x, WikiNode)
2119 and x.kind == NodeKind.LIST
2120 and not x.sarg.endswith("#")
2121 ]
2123 new_contents = [
2124 clean_node(wxr, [], x)
2125 for x in subtree.children
2126 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2127 ]
2129 subentries = subentries or new_subentries
2130 others = others or new_others
2131 subglosses = new_contents
2132 rawgloss = "".join(subglosses)
2133 # Generate no gloss for translation hub pages, but add the
2134 # "translation-hub" tag for them
2135 if rawgloss == "(This entry is a translation hub.)": 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true
2136 data_append(sense_data, "tags", "translation-hub")
2137 return push_sense(sorting_ordinal)
2139 # Remove certain substrings specific to outer glosses
2140 strip_ends = [", particularly:"]
2141 for x in strip_ends:
2142 if rawgloss.endswith(x):
2143 rawgloss = rawgloss[: -len(x)].strip()
2144 break
2146 # A single gloss, or possibly an outer gloss.
2147 # Check if the possible outer gloss starts with
2148 # parenthesized tags/topics
2150 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2151 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2152 m = QUALIFIERS_RE.match(rawgloss)
2153 # (...): ... or (...(...)...): ...
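# Hypothetical example (QUALIFIERS_RE itself is defined elsewhere in
# this module): for a gloss like "(transitive, informal): to tinker",
# the parenthesized qualifier part is handed to parse_sense_qualifier()
# and stripped from the remaining gloss text.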
2154 if m:
2155 q = m.group(1)
2156 rawgloss = rawgloss[m.end() :].strip()
2157 parse_sense_qualifier(wxr, q, sense_base)
2158 if rawgloss == "A pejorative:": 2158 ↛ 2159line 2158 didn't jump to line 2159 because the condition on line 2158 was never true
2159 data_append(sense_base, "tags", "pejorative")
2160 rawgloss = ""
2161 elif rawgloss == "Short forms.": 2161 ↛ 2162line 2161 didn't jump to line 2162 because the condition on line 2161 was never true
2162 data_append(sense_base, "tags", "abbreviation")
2163 rawgloss = ""
2164 elif rawgloss == "Technical or specialized senses.": 2164 ↛ 2165line 2164 didn't jump to line 2165 because the condition on line 2164 was never true
2165 rawgloss = ""
2166 elif rawgloss.startswith("inflection of "):
2167 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2168 if parsed is not None: 2168 ↛ 2177line 2168 didn't jump to line 2177 because the condition on line 2168 was always true
2169 tags, origins = parsed
2170 if origins is not None: 2170 ↛ 2172line 2170 didn't jump to line 2172 because the condition on line 2170 was always true
2171 data_extend(sense_base, "form_of", origins)
2172 if tags is not None: 2172 ↛ 2175line 2172 didn't jump to line 2175 because the condition on line 2172 was always true
2173 data_extend(sense_base, "tags", tags)
2174 else:
2175 data_append(sense_base, "tags", "form-of")
2176 else:
2177 data_append(sense_base, "tags", "form-of")
2178 if rawgloss: 2178 ↛ 2209line 2178 didn't jump to line 2209 because the condition on line 2178 was always true
2179 # Code duplicating a lot of clean-up operations from later in
2180 # this block. We want to clean up the "supergloss" as much as
2181 # possible, in almost the same way as a normal gloss.
2182 supergloss = rawgloss
2184 if supergloss.startswith("; "): 2184 ↛ 2185line 2184 didn't jump to line 2185 because the condition on line 2184 was never true
2185 supergloss = supergloss[1:].strip()
2187 if supergloss.startswith(("^†", "†")):
2188 data_append(sense_base, "tags", "obsolete")
2189 supergloss = supergloss[2:].strip()
2190 elif supergloss.startswith("^‡"): 2190 ↛ 2191line 2190 didn't jump to line 2191 because the condition on line 2190 was never true
2191 data_extend(sense_base, "tags", ["obsolete", "historical"])
2192 supergloss = supergloss[2:].strip()
2194 # remove [14th century...] style brackets at the end
2195 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2197 if supergloss.startswith((",", ":")):
2198 supergloss = supergloss[1:]
2199 supergloss = supergloss.strip()
2200 if supergloss.startswith("N. of "): 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true
2201 supergloss = "Name of " + supergloss[6:]
2202 supergloss = supergloss[2:]
2203 data_append(sense_base, "glosses", supergloss)
2204 if supergloss in ("A person:",):
2205 data_append(sense_base, "tags", "g-person")
2207 # The main recursive call (except for the exceptions at the
2208 # start of this function).
2209 for sublist in subentries:
2210 if not ( 2210 ↛ 2213line 2210 didn't jump to line 2213 because the condition on line 2210 was never true
2211 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2212 ):
2213 wxr.wtp.debug(
2214 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2215 f"with items that are not LISTs",
2216 sortid="page/1511/20230119",
2217 )
2218 continue
2219 for item in sublist.children:
2220 if not ( 2220 ↛ 2224line 2220 didn't jump to line 2224 because the condition on line 2220 was never true
2221 isinstance(item, WikiNode)
2222 and item.kind == NodeKind.LIST_ITEM
2223 ):
2224 continue
2225 # copy sense_base to prevent cross-contamination between
2226 # subglosses, and between subglosses and superglosses
2227 sense_base2 = copy.deepcopy(sense_base)
2228 if parse_sense_node(item, sense_base2, pos): 2228 ↛ 2219line 2228 didn't jump to line 2219 because the condition on line 2228 was always true
2229 added = True
2231 # Capture examples.
2232 # This is called after the recursive calls above so that
2233 # sense_base is not contaminated with meta-data from
2234 # example entries for *this* gloss.
2235 examples = []
2236 if wxr.config.capture_examples: 2236 ↛ 2240line 2236 didn't jump to line 2240 because the condition on line 2236 was always true
2237 examples = extract_examples(others, sense_base)
2239 # push_sense() succeeded somewhere down-river, so skip this level
2240 if added:
2241 if examples:
2242 # this higher-up gloss has examples that we do not want to skip
2243 wxr.wtp.debug(
2244 "'{}[...]' gloss has examples we want to keep, "
2245 "but there are subglosses.".format(repr(rawgloss[:30])),
2246 sortid="page/1498/20230118",
2247 )
2248 else:
2249 return True
2251 # Some entries, e.g., "iacebam", have weird sentences in quotes
2252 # after the gloss, but these sentences don't seem to be intended
2253 # as glosses. Skip them.
2254 indexed_subglosses = list(
2255 (i, gl)
2256 for i, gl in enumerate(subglosses)
2257 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2258 )
2260 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2260 ↛ 2261line 2260 didn't jump to line 2261 because the condition on line 2260 was never true
2261 gl = indexed_subglosses[0][1].strip()
2262 if gl.endswith(":"):
2263 gl = gl[:-1].strip()
2264 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2265 if parsed is not None:
2266 infl_tags, infl_dts = parsed
2267 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2268 # Interpret others as a particular form under
2269 # "inflection of"
2270 data_extend(sense_base, "tags", infl_tags)
2271 data_extend(sense_base, "form_of", infl_dts)
2272 indexed_subglosses = indexed_subglosses[1:]
2273 elif not infl_dts:
2274 data_extend(sense_base, "tags", infl_tags)
2275 indexed_subglosses = indexed_subglosses[1:]
2277 # Create senses for remaining subglosses
2278 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2279 gloss = gloss.strip()
2280 if not gloss and len(indexed_subglosses) > 1: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true
2281 continue
2282 # Push a new sense (if the last one is not empty)
2283 if push_sense(sorting_ordinal): 2283 ↛ 2284line 2283 didn't jump to line 2284 because the condition on line 2283 was never true
2284 added = True
2285 # if gloss not in sense_data.get("raw_glosses", ()):
2286 # data_append(sense_data, "raw_glosses", gloss)
2287 if i == 0 and examples:
2288 # In a multi-line gloss, associate examples
2289 # with only one of them.
2290 # XXX or you could use gloss_i == len(indexed_subglosses)
2291 # to associate examples with the *last* one.
2292 data_extend(sense_data, "examples", examples)
2293 if gloss.startswith("; ") and gloss_i > 0: 2293 ↛ 2294line 2293 didn't jump to line 2294 because the condition on line 2293 was never true
2294 gloss = gloss[1:].strip()
2295 # If the gloss starts with †, mark as obsolete
2296 if gloss.startswith("^†"): 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true
2297 data_append(sense_data, "tags", "obsolete")
2298 gloss = gloss[2:].strip()
2299 elif gloss.startswith("^‡"): 2299 ↛ 2300line 2299 didn't jump to line 2300 because the condition on line 2299 was never true
2300 data_extend(sense_data, "tags", ["obsolete", "historical"])
2301 gloss = gloss[2:].strip()
2302 # Copy data for all senses to this sense
2303 for k, v in sense_base.items():
2304 if isinstance(v, (list, tuple)):
2305 if k != "tags":
2306 # Tags handled below (countable/uncountable special)
2307 data_extend(sense_data, k, v)
2308 else:
2309 assert k not in ("tags", "categories", "topics")
2310 sense_data[k] = v # type:ignore[literal-required]
2311 # Parse the gloss for this particular sense
2312 m = QUALIFIERS_RE.match(gloss)
2313 # (...): ... or (...(...)...): ...
2314 if m:
2315 parse_sense_qualifier(wxr, m.group(1), sense_data)
2316 gloss = gloss[m.end() :].strip()
2318 # Remove common suffix "[from 14th c.]" and similar
2319 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2321 # Check to make sure we don't have unhandled list items in gloss
2322 ofs = max(gloss.find("#"), gloss.find("* "))
2323 if ofs > 10 and "(#)" not in gloss:
2324 wxr.wtp.debug(
2325 "gloss may contain unhandled list items: {}".format(gloss),
2326 sortid="page/1412",
2327 )
2328 elif "\n" in gloss: 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true
2329 wxr.wtp.debug(
2330 "gloss contains newline: {}".format(gloss),
2331 sortid="page/1416",
2332 )
2334 # Kludge, some glosses have a comma after initial qualifiers in
2335 # parentheses
2336 if gloss.startswith((",", ":")):
2337 gloss = gloss[1:]
2338 gloss = gloss.strip()
2339 if gloss.endswith(":"):
2340 gloss = gloss[:-1].strip()
2341 if gloss.startswith("N. of "): 2341 ↛ 2342line 2341 didn't jump to line 2342 because the condition on line 2341 was never true
2342 gloss = "Name of " + gloss[6:]
2343 if gloss.startswith("†"): 2343 ↛ 2344line 2343 didn't jump to line 2344 because the condition on line 2343 was never true
2344 data_append(sense_data, "tags", "obsolete")
2345 gloss = gloss[1:]
2346 elif gloss.startswith("^†"): 2346 ↛ 2347line 2346 didn't jump to line 2347 because the condition on line 2346 was never true
2347 data_append(sense_data, "tags", "obsolete")
2348 gloss = gloss[2:]
2350 # Copy tags from sense_base if any. This will not copy
2351 # countable/uncountable if either was specified in the sense,
2352 # as sometimes both are specified in word head but only one
2353 # in individual senses.
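# Example of the intended outcome (not from the original source): if
# the word head contributed tags ["countable", "uncountable", "slang"]
# and this sense already carries "countable", then only "slang" is
# copied here and the head's countable/uncountable pair is left off
# this sense.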
2354 countability_tags = []
2355 base_tags = sense_base.get("tags", ())
2356 sense_tags = sense_data.get("tags", ())
2357 for tag in base_tags:
2358 if tag in ("countable", "uncountable"):
2359 if tag not in countability_tags: 2359 ↛ 2361line 2359 didn't jump to line 2361 because the condition on line 2359 was always true
2360 countability_tags.append(tag)
2361 continue
2362 if tag not in sense_tags:
2363 data_append(sense_data, "tags", tag)
2364 if countability_tags:
2365 if ( 2365 ↛ 2374line 2365 didn't jump to line 2374 because the condition on line 2365 was always true
2366 "countable" not in sense_tags
2367 and "uncountable" not in sense_tags
2368 ):
2369 data_extend(sense_data, "tags", countability_tags)
2371 # If outer gloss specifies a form-of ("inflection of", see
2372 # aquamarine/German), try to parse the inner glosses as
2373 # tags for an inflected form.
2374 if "form-of" in sense_base.get("tags", ()):
2375 parsed = parse_alt_or_inflection_of(
2376 wxr, gloss, gloss_template_args
2377 )
2378 if parsed is not None: 2378 ↛ 2384line 2378 didn't jump to line 2384 because the condition on line 2378 was always true
2379 infl_tags, infl_dts = parsed
2380 if not infl_dts and infl_tags: 2380 ↛ 2384line 2380 didn't jump to line 2384 because the condition on line 2380 was always true
2381 # Interpret as a particular form under "inflection of"
2382 data_extend(sense_data, "tags", infl_tags)
2384 if not gloss: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true
2385 data_append(sense_data, "tags", "empty-gloss")
2386 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2387 if ( 2387 ↛ 2398line 2387 didn't jump to line 2398 because the condition on line 2387 was always true
2388 gloss_i == 0
2389 and len(sense_data.get("glosses", tuple())) >= 1
2390 ):
2391 # If we added a "high-level gloss" from rawgloss, but this
2392 # is that same gloss_i, add this instead of the raw_gloss
2393 # from before if they're different: the rawgloss was not
2394 # cleaned exactly the same as this later gloss
2395 sense_data["glosses"][-1] = gloss
2396 else:
2397 # Add the gloss for the sense.
2398 data_append(sense_data, "glosses", gloss)
2400 # Kludge: there are cases (e.g., etc./Swedish) where there are
2401 # two abbreviations in the same sense, both generated by the
2402 # {{abbreviation of|...}} template. Handle these with some magic.
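# Illustrative (not from the original source): a combined gloss such as
# "Abbreviation of et cetera. Abbreviation of etcetera." is split at
# each "Abbreviation of " so that each piece goes separately through
# parse_alt_or_inflection_of() below.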
2403 position = 0
2404 split_glosses = []
2405 for m in re.finditer(r"Abbreviation of ", gloss):
2406 if m.start() != position: 2406 ↛ 2405line 2406 didn't jump to line 2405 because the condition on line 2406 was always true
2407 split_glosses.append(gloss[position : m.start()])
2408 position = m.start()
2409 split_glosses.append(gloss[position:])
2410 for gloss in split_glosses:
2411 # Check if this gloss describes an alt-of or inflection-of
2412 if (
2413 lang_code != "en"
2414 and " " not in gloss
2415 and distw([word], gloss) < 0.3
2416 ):
2417 # Don't try to parse gloss if it is one word
2418 # that is close to the word itself for non-English words
2419 # (probable translations of a tag/form name)
2420 continue
2421 parsed = parse_alt_or_inflection_of(
2422 wxr, gloss, gloss_template_args
2423 )
2424 if parsed is None:
2425 continue
2426 tags, dts = parsed
2427 if not dts and tags:
2428 data_extend(sense_data, "tags", tags)
2429 continue
2430 for dt in dts: # type:ignore[union-attr]
2431 ftags = list(tag for tag in tags if tag != "form-of")
2432 if "alt-of" in tags:
2433 data_extend(sense_data, "tags", ftags)
2434 data_append(sense_data, "alt_of", dt)
2435 elif "compound-of" in tags: 2435 ↛ 2436line 2435 didn't jump to line 2436 because the condition on line 2435 was never true
2436 data_extend(sense_data, "tags", ftags)
2437 data_append(sense_data, "compound_of", dt)
2438 elif "synonym-of" in tags: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true
2439 data_extend(dt, "tags", ftags)
2440 data_append(sense_data, "synonyms", dt)
2441 elif tags and dt.get("word", "").startswith("of "): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 dt["word"] = dt["word"][3:]
2443 data_append(sense_data, "tags", "form-of")
2444 data_extend(sense_data, "tags", ftags)
2445 data_append(sense_data, "form_of", dt)
2446 elif "form-of" in tags: 2446 ↛ 2430line 2446 didn't jump to line 2430 because the condition on line 2446 was always true
2447 data_extend(sense_data, "tags", tags)
2448 data_append(sense_data, "form_of", dt)
2450 if len(sense_data) == 0:
2451 if len(sense_base.get("tags", [])) == 0: 2451 ↛ 2453line 2451 didn't jump to line 2453 because the condition on line 2451 was always true
2452 del sense_base["tags"]
2453 sense_data.update(sense_base)
2454 if push_sense(sorting_ordinal): 2454 ↛ 2458line 2454 didn't jump to line 2458 because the condition on line 2454 was always true
2455 # push_sense succeeded in adding a sense to pos_data
2456 added = True
2457 # print("PARSE_SENSE DONE:", pos_datas[-1])
2458 return added
2460 def parse_inflection(
2461 node: WikiNode, section: str, pos: Optional[str]
2462 ) -> None:
2463 """Parses inflection data (declension, conjugation) from the given
2464 page. This retrieves the actual inflection template
2465 parameters, which are very useful for applications that need
2466 to learn the inflection classes and generate inflected
2467 forms."""
2468 assert isinstance(node, WikiNode)
2469 assert isinstance(section, str)
2470 assert pos is None or isinstance(pos, str)
2471 # print("parse_inflection:", node)
2473 if pos is None: 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true
2474 wxr.wtp.debug(
2475 "inflection table outside part-of-speech", sortid="page/1812"
2476 )
2477 return
2479 def inflection_template_fn(
2480 name: str, ht: TemplateArgs
2481 ) -> Optional[str]:
2482 # print("decl_conj_template_fn", name, ht)
2483 if is_panel_template(wxr, name): 2483 ↛ 2484line 2483 didn't jump to line 2484 because the condition on line 2483 was never true
2484 return ""
2485 if name in ("is-u-mutation",): 2485 ↛ 2488line 2485 didn't jump to line 2488 because the condition on line 2485 was never true
2486 # These are not to be captured as an exception to the
2487 # generic code below
2488 return None
2489 m = re.search(
2490 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2491 r"declension|inflection|mut|mutation)($|-)",
2492 name,
2493 )
2494 if m:
2495 args_ht = clean_template_args(wxr, ht)
2496 dt = {"name": name, "args": args_ht}
2497 data_append(pos_data, "inflection_templates", dt)
2499 return None
2501 # Convert the subtree back to Wikitext, then expand all and parse,
2502 # capturing templates in the process
2503 text = wxr.wtp.node_to_wikitext(node.children)
2505 # Split text into separate sections for each top-level template
2506 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2507 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2508 # The (?:...) creates a non-capturing regex group; if it was capturing,
2509 # like the group around it, it would create elements in brace_matches,
2510 # including None if it doesn't match.
2511 # 20250114: Added {| and |} into the regex because tables were being
2512 # cut into pieces by this code. Issue #973, introduction of two-part
2513 # book-end templates similar to trans-top and trans-bottom.
2514 template_sections = []
2515 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2516 # Because there is the possibility of triple curly braces
2517 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2518 # count nesting depth using pairs of two brackets, but
2519 # instead use singular braces ("{ }").
2520 # Because template delimiters should be balanced, regardless
2521 # of whether {{ or {{{ is used, and because we only care
2522 # about the outer-most delimiters (the highest level template)
2523 # we can just count the single braces when those single
2524 # braces are part of a group.
2525 table_nesting = 0
2526 # However, a stray table ({| ... |}) should always
2527 # be its own section, and should prevent templates from cutting it
2528 # into sections.
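# Illustrative split (hypothetical template names, not from the
# original source): for text like
#   "{{fi-decl-valo|kivi}}\n{{fi-conj-sanoa|sano}}"
# brace_matches would be roughly
#   ["", "{{", "fi-decl-valo|kivi", "}}", "\n", "{{", ...]
# and a new template_sections entry starts at each "{{" seen while
# template_nesting and table_nesting are both zero.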
2530 # print(f"Parse inflection: {text=}")
2531 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2532 if len(brace_matches) > 1:
2533 tsection: list[str] = []
2534 after_templates = False # kludge to keep any text
2535 # before first template
2536 # with the first template;
2537 # otherwise, text
2538 # goes with preceding template
2539 for m in brace_matches:
2540 if m.startswith("\n; ") and after_templates: 2540 ↛ 2541line 2540 didn't jump to line 2541 because the condition on line 2540 was never true
2541 after_templates = False
2542 template_sections.append(tsection)
2543 tsection = []
2544 tsection.append(m)
2545 elif m.startswith("{{") or m.endswith("{|"):
2546 if (
2547 template_nesting == 0
2548 and after_templates
2549 and table_nesting == 0
2550 ):
2551 template_sections.append(tsection)
2552 tsection = []
2553 # start new section
2554 after_templates = True
2555 if m.startswith("{{"):
2556 template_nesting += 1
2557 else:
2558 # m.endswith("{|")
2559 table_nesting += 1
2560 tsection.append(m)
2561 elif m.startswith("}}") or m.endswith("|}"):
2562 if m.startswith("}}"):
2563 template_nesting -= 1
2564 if template_nesting < 0: 2564 ↛ 2565line 2564 didn't jump to line 2565 because the condition on line 2564 was never true
2565 wxr.wtp.error(
2566 "Negatively nested braces, "
2567 "couldn't split inflection templates, "
2568 "{}/{} section {}".format(
2569 word, language, section
2570 ),
2571 sortid="page/1871",
2572 )
2573 template_sections = [] # use whole text
2574 break
2575 else:
2576 table_nesting -= 1
2577 if table_nesting < 0: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true
2578 wxr.wtp.error(
2579 "Negatively nested table braces, "
2580 "couldn't split inflection section, "
2581 "{}/{} section {}".format(
2582 word, language, section
2583 ),
2584 sortid="page/20250114",
2585 )
2586 template_sections = [] # use whole text
2587 break
2588 tsection.append(m)
2589 else:
2590 tsection.append(m)
2591 if tsection: # dangling tsection 2591 ↛ 2599line 2591 didn't jump to line 2599 because the condition on line 2591 was always true
2592 template_sections.append(tsection)
2593 # Why do it this way around? The parser has a preference
2594 # to associate bits outside of tables with the preceding
2595 # table (`after`-variable), so a new tsection begins
2596 # at {{ and everything before it belongs to the previous
2597 # template.
2599 texts = []
2600 if not template_sections:
2601 texts = [text]
2602 else:
2603 for tsection in template_sections:
2604 texts.append("".join(tsection))
2605 if template_nesting != 0: 2605 ↛ 2606line 2605 didn't jump to line 2606 because the condition on line 2605 was never true
2606 wxr.wtp.error(
2607 "Template nesting error: "
2608 "template_nesting = {} "
2609 "couldn't split inflection templates, "
2610 "{}/{} section {}".format(
2611 template_nesting, word, language, section
2612 ),
2613 sortid="page/1896",
2614 )
2615 texts = [text]
2616 for text in texts:
2617 tree = wxr.wtp.parse(
2618 text, expand_all=True, template_fn=inflection_template_fn
2619 )
2621 if not text.strip():
2622 continue
2624 # Parse inflection tables from the section. The data is stored
2625 # under "forms".
2626 if wxr.config.capture_inflections: 2626 ↛ 2616line 2626 didn't jump to line 2616 because the condition on line 2626 was always true
2627 tablecontext = None
2628 m = re.search(r"{{([^}{|]+)\|?", text)
2629 if m:
2630 template_name = m.group(1)
2631 tablecontext = TableContext(template_name)
2633 parse_inflection_section(
2634 wxr,
2635 pos_data,
2636 word,
2637 language,
2638 pos,
2639 section,
2640 tree,
2641 tablecontext=tablecontext,
2642 )
2644 def get_subpage_section(
2645 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2646 ) -> Optional[Union[WikiNode, str]]:
2647 """Loads a subpage of the given page, and finds the section
2648 for the given language, part-of-speech, and section title. This
2649 is used for finding translations and other sections on subpages."""
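# Hypothetical call shape (illustration only, not from the original
# source): for the page "perro" this could be invoked as
#   get_subpage_section("perro", "translations",
#       [["Spanish", "Etymology 1", "Noun", "Translations"]])
# which loads "perro/translations" and descends through its heading
# levels, matching each title in the seq in order (case-insensitively).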
2650 assert isinstance(language, str)
2651 assert isinstance(title, str)
2652 assert isinstance(subtitle, str)
2653 assert isinstance(seqs, (list, tuple))
2654 for seq in seqs:
2655 for x in seq:
2656 assert isinstance(x, str)
2657 subpage_title = word + "/" + subtitle
2658 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2659 if subpage_content is None:
2660 wxr.wtp.error(
2661 "/translations not found despite "
2662 "{{see translation subpage|...}}",
2663 sortid="page/1934",
2664 )
2665 return None
2667 def recurse(
2668 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2669 ) -> Optional[Union[str, WikiNode]]:
2670 # print(f"seq: {seq}")
2671 if not seq:
2672 return node
2673 if not isinstance(node, WikiNode):
2674 return None
2675 # print(f"node.kind: {node.kind}")
2676 if node.kind in LEVEL_KINDS:
2677 t = clean_node(wxr, None, node.largs[0])
2678 # print(f"t: {t} == seq[0]: {seq[0]}?")
2679 if t.lower() == seq[0].lower():
2680 seq = seq[1:]
2681 if not seq:
2682 return node
2683 for n in node.children:
2684 ret = recurse(n, seq)
2685 if ret is not None:
2686 return ret
2687 return None
2689 tree = wxr.wtp.parse(
2690 subpage_content,
2691 pre_expand=True,
2692 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2693 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2694 )
2695 assert tree.kind == NodeKind.ROOT
2696 for seq in seqs:
2697 ret = recurse(tree, seq)
2698 if ret is None:
2699 wxr.wtp.debug(
2700 "Failed to find subpage section {}/{} seq {}".format(
2701 title, subtitle, seq
2702 ),
2703 sortid="page/1963",
2704 )
2705 return ret
2707 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2708 """Parses translations for a word. This may also pull in translations
2709 from separate translation subpages."""
2710 assert isinstance(data, dict)
2711 assert isinstance(xlatnode, WikiNode)
2712 # print("===== PARSE_TRANSLATIONS {} {} {}"
2713 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2714 # print("parse_translations xlatnode={}".format(xlatnode))
2715 if not wxr.config.capture_translations: 2715 ↛ 2716line 2715 didn't jump to line 2716 because the condition on line 2715 was never true
2716 return
2717 sense_parts: list[Union[WikiNode, str]] = []
2718 sense: Optional[str] = None
2720 def parse_translation_item(
2721 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2722 ) -> None:
2723 nonlocal sense
2724 assert isinstance(contents, list)
2725 assert lang is None or isinstance(lang, str)
2726 # print("PARSE_TRANSLATION_ITEM:", contents)
2728 langcode: Optional[str] = None
2729 if sense is None:
2730 sense = clean_node(wxr, data, sense_parts).strip()
2731 # print("sense <- clean_node: ", sense)
2732 idx = sense.find("See also translations at")
2733 if idx > 0: 2733 ↛ 2734line 2733 didn't jump to line 2734 because the condition on line 2733 was never true
2734 wxr.wtp.debug(
2735 "Skipping translation see also: {}".format(sense),
2736 sortid="page/2361",
2737 )
2738 sense = sense[:idx].strip()
2739 if sense.endswith(":"): 2739 ↛ 2740line 2739 didn't jump to line 2740 because the condition on line 2739 was never true
2740 sense = sense[:-1].strip()
2741 if sense.endswith("—"): 2741 ↛ 2742line 2741 didn't jump to line 2742 because the condition on line 2741 was never true
2742 sense = sense[:-1].strip()
2743 translations_from_template: list[str] = []
2745 def translation_item_template_fn(
2746 name: str, ht: TemplateArgs
2747 ) -> Optional[str]:
2748 nonlocal langcode
2749 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2750 if is_panel_template(wxr, name):
2751 return ""
2752 if name in ("t+check", "t-check", "t-needed"):
2753 # We ignore these templates. They seem to have outright
2754 # garbage in some entries, and widely varying formatting in
2755 # others. These should be transitory and unreliable
2756 # anyway.
2757 return "__IGNORE__"
2758 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2759 code = ht.get(1)
2760 if code: 2760 ↛ 2770line 2760 didn't jump to line 2770 because the condition on line 2760 was always true
2761 if langcode and code != langcode:
2762 wxr.wtp.debug(
2763 "inconsistent language codes {} vs "
2764 "{} in translation item: {!r} {}".format(
2765 langcode, code, name, ht
2766 ),
2767 sortid="page/2386",
2768 )
2769 langcode = code
2770 tr = ht.get(2)
2771 if tr:
2772 tr = clean_node(wxr, None, [tr])
2773 translations_from_template.append(tr)
2774 return None
2775 if name == "t-egy":
2776 langcode = "egy"
2777 return None
2778 if name == "ttbc":
2779 code = ht.get(1)
2780 if code: 2780 ↛ 2782line 2780 didn't jump to line 2782 because the condition on line 2780 was always true
2781 langcode = code
2782 return None
2783 if name == "trans-see": 2783 ↛ 2784line 2783 didn't jump to line 2784 because the condition on line 2783 was never true
2784 wxr.wtp.error(
2785 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2786 )
2787 return ""
2788 if name.endswith("-top"): 2788 ↛ 2789line 2788 didn't jump to line 2789 because the condition on line 2788 was never true
2789 return ""
2790 if name.endswith("-bottom"): 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true
2791 return ""
2792 if name.endswith("-mid"): 2792 ↛ 2793line 2792 didn't jump to line 2793 because the condition on line 2792 was never true
2793 return ""
2794 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2795 # .format(name),
2796 # sortid="page/2414")
2797 return None
2799 sublists = list(
2800 x
2801 for x in contents
2802 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2803 )
2804 contents = list(
2805 x
2806 for x in contents
2807 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2808 )
2810 item = clean_node(
2811 wxr, data, contents, template_fn=translation_item_template_fn
2812 )
2813 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2815 # Parse the translation item.
2816 if item: 2816 ↛ exitline 2816 didn't return from function 'parse_translation_item' because the condition on line 2816 was always true
2817 lang = parse_translation_item_text(
2818 wxr,
2819 word,
2820 data,
2821 item,
2822 sense,
2823 lang,
2824 langcode,
2825 translations_from_template,
2826 is_reconstruction,
2827 )
2829 # Handle sublists. They are frequently used for different
2830 # scripts for the language and different variants of the
2831 # language. We will include the lower-level header as a
2832 # tag in those cases.
2833 for listnode in sublists:
2834 assert listnode.kind == NodeKind.LIST
2835 for node in listnode.children:
2836 if not isinstance(node, WikiNode): 2836 ↛ 2837line 2836 didn't jump to line 2837 because the condition on line 2836 was never true
2837 continue
2838 if node.kind == NodeKind.LIST_ITEM: 2838 ↛ 2835line 2838 didn't jump to line 2835 because the condition on line 2838 was always true
2839 parse_translation_item(node.children, lang=lang)
2841 def parse_translation_template(node: WikiNode) -> None:
2842 assert isinstance(node, WikiNode)
2844 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2845 nonlocal sense_parts
2846 nonlocal sense
2847 if is_panel_template(wxr, name):
2848 return ""
2849 if name == "see also":
2850 # XXX capture
2851 # XXX for example, "/" has top-level list containing
2852 # see also items. So we should also parse those.
2853 return ""
2854 if name == "trans-see":
2855 # XXX capture
2856 return ""
2857 if name == "see translation subpage": 2857 ↛ 2858line 2857 didn't jump to line 2858 because the condition on line 2857 was never true
2858 sense_parts = []
2859 sense = None
2860 sub = ht.get(1, "")
2861 if sub:
2862 m = re.match(
2863 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2864 )
2865 else:
2866 m = None
2867 etym = ""
2868 etym_numbered = ""
2869 pos = ""
2870 if m:
2871 etym_numbered = m.group(1)
2872 etym = m.group(2)
2873 pos = m.group(3)
2874 if not sub:
2875 wxr.wtp.debug(
2876 "no part-of-speech in "
2877 "{{see translation subpage|...}}, "
2878 "defaulting to just wxr.wtp.section "
2879 "(= language)",
2880 sortid="page/2468",
2881 )
2882 # seq sent to get_subpage_section without sub and pos
2883 seq = [
2884 language,
2885 TRANSLATIONS_TITLE,
2886 ]
2887 elif (
2888 m
2889 and etym.lower().strip() in ETYMOLOGY_TITLES
2890 and pos.lower() in POS_TITLES
2891 ):
2892 seq = [
2893 language,
2894 etym_numbered,
2895 pos,
2896 TRANSLATIONS_TITLE,
2897 ]
2898 elif sub.lower() in POS_TITLES:
2899 # seq with sub but not pos
2900 seq = [
2901 language,
2902 sub,
2903 TRANSLATIONS_TITLE,
2904 ]
2905 else:
2906 # seq with sub and pos
2907 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2908 if pos.lower() not in POS_TITLES:
2909 wxr.wtp.debug(
2910 "unhandled see translation subpage: "
2911 "language={} sub={} "
2912 "wxr.wtp.subsection={}".format(
2913 language, sub, wxr.wtp.subsection
2914 ),
2915 sortid="page/2478",
2916 )
2917 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2918 subnode = get_subpage_section(
2919 wxr.wtp.title or "MISSING_TITLE",
2920 TRANSLATIONS_TITLE,
2921 [seq],
2922 )
2923 if subnode is None or not isinstance(subnode, WikiNode):
2924 # Failed to find the normal subpage section
2925 # seq with sub and pos
2926 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2927 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2928 seqs: list[list[str] | tuple[str, ...]] = [
2929 [TRANSLATIONS_TITLE],
2930 [language, pos],
2931 ]
2932 subnode = get_subpage_section(
2933 wxr.wtp.title or "MISSING_TITLE",
2934 TRANSLATIONS_TITLE,
2935 seqs,
2936 )
2937 if subnode is not None and isinstance(subnode, WikiNode):
2938 parse_translations(data, subnode)
2939 return ""
2940 if name in (
2941 "c",
2942 "C",
2943 "categorize",
2944 "cat",
2945 "catlangname",
2946 "topics",
2947 "top",
2948 "qualifier",
2949 "cln",
2950 ):
2951 # These are expanded in the default way
2952 return None
2953 if name in (
2954 "trans-top",
2955 "trans-top-see",
2956 ):
2957 # XXX capture id from trans-top? Capture sense here
2958 # instead of trying to parse it from expanded content?
2959 if ht.get(1):
2960 sense_parts = []
2961 sense = ht.get(1)
2962 else:
2963 sense_parts = []
2964 sense = None
2965 return None
2966 if name in (
2967 "trans-bottom",
2968 "trans-mid",
2969 "checktrans-mid",
2970 "checktrans-bottom",
2971 ):
2972 return None
2973 if name == "checktrans-top":
2974 sense_parts = []
2975 sense = None
2976 return ""
2977 if name == "trans-top-also":
2978 # XXX capture?
2979 sense_parts = []
2980 sense = None
2981 return ""
2982 wxr.wtp.error(
2983 "UNIMPLEMENTED parse_translation_template: {} {}".format(
2984 name, ht
2985 ),
2986 sortid="page/2517",
2987 )
2988 return ""
2990 wxr.wtp.expand(
2991 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
2992 )
2994 def parse_translation_recurse(xlatnode: WikiNode) -> None:
2995 nonlocal sense
2996 nonlocal sense_parts
2997 for node in xlatnode.children:
2998 # print(node)
2999 if isinstance(node, str):
3000 if sense:
3001 if not node.isspace():
3002 wxr.wtp.debug(
3003 "skipping string in the middle of "
3004 "translations: {}".format(node),
3005 sortid="page/2530",
3006 )
3007 continue
3008 # Add a part to the sense
3009 sense_parts.append(node)
3010 sense = None
3011 continue
3012 assert isinstance(node, WikiNode)
3013 kind = node.kind
3014 if kind == NodeKind.LIST:
3015 for item in node.children:
3016 if not isinstance(item, WikiNode): 3016 ↛ 3017line 3016 didn't jump to line 3017 because the condition on line 3016 was never true
3017 continue
3018 if item.kind != NodeKind.LIST_ITEM: 3018 ↛ 3019line 3018 didn't jump to line 3019 because the condition on line 3018 was never true
3019 continue
3020 if item.sarg == ":": 3020 ↛ 3021line 3020 didn't jump to line 3021 because the condition on line 3020 was never true
3021 continue
3022 parse_translation_item(item.children)
3023 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3023 ↛ 3027line 3023 didn't jump to line 3027 because the condition on line 3023 was never true
3024 # Silently skip list items that are just indented; these
3025 # are used for text between translations, such as indicating
3026 # translations that need to be checked.
3027 pass
3028 elif kind == NodeKind.TEMPLATE:
3029 parse_translation_template(node)
3030 elif kind in ( 3030 ↛ 3035line 3030 didn't jump to line 3035 because the condition on line 3030 was never true
3031 NodeKind.TABLE,
3032 NodeKind.TABLE_ROW,
3033 NodeKind.TABLE_CELL,
3034 ):
3035 parse_translation_recurse(node)
3036 elif kind == NodeKind.HTML:
3037 if node.attrs.get("class") == "NavFrame": 3037 ↛ 3043line 3037 didn't jump to line 3043 because the condition on line 3037 was never true
3038 # Reset ``sense_parts`` (and force recomputing
3039 # by clearing ``sense``) as each NavFrame specifies
3040 # its own sense. This helps eliminate garbage coming
3041 # from text at the beginning at the translations
3042 # section.
3043 sense_parts = []
3044 sense = None
3045 # for item in node.children:
3046 # if not isinstance(item, WikiNode):
3047 # continue
3048 # parse_translation_recurse(item)
3049 parse_translation_recurse(node)
3050 elif kind in LEVEL_KINDS: 3050 ↛ 3052line 3050 didn't jump to line 3052 because the condition on line 3050 was never true
3051 # Sub-levels will be recursed elsewhere
3052 pass
3053 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3054 parse_translation_recurse(node)
3055 elif kind == NodeKind.PREFORMATTED: 3055 ↛ 3056line 3055 didn't jump to line 3056 because the condition on line 3055 was never true
3056 print("parse_translation_recurse: PREFORMATTED:", node)
3057 elif kind == NodeKind.LINK: 3057 ↛ 3111line 3057 didn't jump to line 3111 because the condition on line 3057 was always true
3058 arg0 = node.largs[0]
3059 # Kludge: I've seen occasional normal links to translation
3060 # subpages from main pages (e.g., language/English/Noun
3061 # in July 2021) instead of the normal
3062 # {{see translation subpage|...}} template. This should
3063 # handle them. Note: must be careful not to read other
3064 # links, particularly things like in "human being":
3065 # "a human being -- see [[man/translations]]" (group title)
3066 if ( 3066 ↛ 3074line 3066 didn't jump to line 3074 because the condition on line 3066 was never true
3067 isinstance(arg0, (list, tuple))
3068 and arg0
3069 and isinstance(arg0[0], str)
3070 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3071 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3072 == wxr.wtp.title
3073 ):
3074 wxr.wtp.debug(
3075 "translations subpage link found on main "
3076 "page instead "
3077 "of normal {{see translation subpage|...}}",
3078 sortid="page/2595",
3079 )
3080 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3081 if sub.lower() in POS_TITLES:
3082 seq = [
3083 language,
3084 sub,
3085 TRANSLATIONS_TITLE,
3086 ]
3087 subnode = get_subpage_section(
3088 wxr.wtp.title,
3089 TRANSLATIONS_TITLE,
3090 [seq],
3091 )
3092 if subnode is not None and isinstance(
3093 subnode, WikiNode
3094 ):
3095 parse_translations(data, subnode)
3096 else:
3097 wxr.wtp.error(
3098 "/translations link outside part-of-speech"
3099 )
3101 if (
3102 len(arg0) >= 1
3103 and isinstance(arg0[0], str)
3104 and not arg0[0].lower().startswith("category:")
3105 ):
3106 for x in node.largs[-1]:
3107 if isinstance(x, str): 3107 ↛ 3110line 3107 didn't jump to line 3110 because the condition on line 3107 was always true
3108 sense_parts.append(x)
3109 else:
3110 parse_translation_recurse(x)
3111 elif not sense:
3112 sense_parts.append(node)
3113 else:
3114 wxr.wtp.debug(
3115 "skipping text between translation items/senses: "
3116 "{}".format(node),
3117 sortid="page/2621",
3118 )
3120 # Main code of parse_translations(). We want ``sense`` to be assigned
3121 # regardless of recursion levels, and thus the code is structured
3122 # to define at this level and recurse in parse_translation_recurse().
3123 parse_translation_recurse(xlatnode)
3125 def parse_etymology(data: WordData, node: LevelNode) -> None:
3126 """Parses an etymology section."""
3127 assert isinstance(data, dict)
3128 assert isinstance(node, WikiNode)
3130 templates: list[TemplateData] = []
3132 # Counter for preventing the capture of etymology templates
3133 # when we are inside templates that we want to ignore (i.e.,
3134 # not capture).
3135 ignore_count = 0
3137 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3138 nonlocal ignore_count
3139 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3140 return ""
3141 if re.match(ignored_etymology_templates_re, name):
3142 ignore_count += 1
3143 return None
3145 # CONTINUE_HERE
3147 def etym_post_template_fn(
3148 name: str, ht: TemplateArgs, expansion: str
3149 ) -> None:
3150 nonlocal ignore_count
3151 if name in wikipedia_templates:
3152 parse_wikipedia_template(wxr, data, ht)
3153 return None
3154 if re.match(ignored_etymology_templates_re, name):
3155 ignore_count -= 1
3156 return None
3157 if ignore_count == 0: 3157 ↛ 3163line 3157 didn't jump to line 3163 because the condition on line 3157 was always true
3158 ht = clean_template_args(wxr, ht)
3159 expansion = clean_node(wxr, None, expansion)
3160 templates.append(
3161 {"name": name, "args": ht, "expansion": expansion}
3162 )
3163 return None
3165 # Remove any subsections
3166 contents = list(
3167 x
3168 for x in node.children
3169 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3170 )
3171 # Convert to text, also capturing templates using post_template_fn
3172 text = clean_node(
3173 wxr,
3174 None,
3175 contents,
3176 template_fn=etym_template_fn,
3177 post_template_fn=etym_post_template_fn,
3178 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3179 # Save the collected information.
3180 if len(text) > 0:
3181 data["etymology_text"] = text
3182 if len(templates) > 0:
3183 # Some etymology templates, like Template:root, do not generate
3184 # text, so they should be added here. Elsewhere, we check
3185 # for Template:root and add some text to the expansion to please
3186 # the validation.
3187 data["etymology_templates"] = templates
3189 for child_node in node.find_child_recursively( 3189 ↛ exitline 3189 didn't return from function 'parse_etymology' because the loop on line 3189 didn't complete
3190 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3191 ):
3192 if child_node.kind in LEVEL_KIND_FLAGS:
3193 break
3194 elif isinstance( 3194 ↛ 3197line 3194 didn't jump to line 3197 because the condition on line 3194 was never true
3195 child_node, TemplateNode
3196 ) and child_node.template_name in ["zh-x", "zh-q"]:
3197 if "etymology_examples" not in data:
3198 data["etymology_examples"] = []
3199 data["etymology_examples"].extend(
3200 extract_template_zh_x(
3201 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3202 )
3203 )
3205 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3206 """This recurses into a subtree in the parse tree for a page."""
3207 nonlocal etym_data
3208 nonlocal pos_data
3209 nonlocal inside_level_four
3211 redirect_list: list[str] = [] # for `zh-see` template
3213 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3214 """This is called for otherwise unprocessed parts of the page.
3215 We still expand them so that e.g. Category links get captured."""
3216 if name in wikipedia_templates:
3217 data = select_data()
3218 parse_wikipedia_template(wxr, data, ht)
3219 return None
3220 if is_panel_template(wxr, name):
3221 return ""
3222 return None
3224 for node in treenode.children:
3225 if not isinstance(node, WikiNode):
3226 # print(" X{}".format(repr(node)[:40]))
3227 continue
3228 if isinstance(node, TemplateNode):
3229 if process_soft_redirect_template(wxr, node, redirect_list):
3230 continue
3231 elif node.template_name == "zh-forms":
3232 extract_zh_forms_template(wxr, node, select_data())
3233 elif (
3234 node.template_name.endswith("-kanjitab")
3235 or node.template_name == "ja-kt"
3236 ):
3237 extract_ja_kanjitab_template(wxr, node, select_data())
3239 if not isinstance(node, LevelNode):
3240 # XXX handle e.g. wikipedia links at the top of a language
3241 # XXX should at least capture "also" at top of page
3242 if node.kind in (
3243 NodeKind.HLINE,
3244 NodeKind.LIST,
3245 NodeKind.LIST_ITEM,
3246 ):
3247 continue
3248 # print(" UNEXPECTED: {}".format(node))
3249 # Clean the node to collect category links
3250 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3251 continue
3252 t = clean_node(
3253 wxr, etym_data, node.sarg if node.sarg else node.largs
3254 )
3255 t = t.lower()
3256 # XXX these counts were never implemented fully, and even this
3257 # gets discarded: Search STATISTICS_IMPLEMENTATION
3258 wxr.config.section_counts[t] += 1
3259 # print("PROCESS_CHILDREN: T:", repr(t))
3260 if t in IGNORED_TITLES:
3261 pass
3262 elif t.startswith(PRONUNCIATION_TITLE):
3263 # Chinese Pronunciation section kludge; we demote these to
3264 # be level 4 instead of 3 so that they're part of a larger
3265 # etymology hierarchy; usually the data here is empty and
3266 # acts as an intermediary between POS and Etymology data
3267 if lang_code in ("zh",):
3268 inside_level_four = True
3269 if t.startswith(PRONUNCIATION_TITLE + " "):
3270 # Pronunciation 1, etc, are used in Chinese Glyphs,
3271 # and each of them may have senses under Definition
3272 push_level_four_section(True)
3273 wxr.wtp.start_subsection(None)
3274 if wxr.config.capture_pronunciation: 3274 ↛ 3382line 3274 didn't jump to line 3382 because the condition on line 3274 was always true
3275 data = select_data()
3276 parse_pronunciation(
3277 wxr,
3278 node,
3279 data,
3280 etym_data,
3281 have_etym,
3282 base_data,
3283 lang_code,
3284 )
3285 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3286 push_etym()
3287 wxr.wtp.start_subsection(None)
3288 if wxr.config.capture_etymologies: 3288 ↛ 3382line 3288 didn't jump to line 3382 because the condition on line 3288 was always true
3289 m = re.search(r"\s(\d+)$", t)
3290 if m:
3291 etym_data["etymology_number"] = int(m.group(1))
3292 parse_etymology(etym_data, node)
3293 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3294 data = select_data()
3295 extract_descendant_section(wxr, data, node, False)
3296 elif (
3297 t in PROTO_ROOT_DERIVED_TITLES
3298 and pos == "root"
3299 and is_reconstruction
3300 and wxr.config.capture_descendants
3301 ):
3302 data = select_data()
3303 extract_descendant_section(wxr, data, node, True)
3304 elif t == TRANSLATIONS_TITLE:
3305 data = select_data()
3306 parse_translations(data, node)
3307 elif t in INFLECTION_TITLES:
3308 parse_inflection(node, t, pos)
3309 elif t == "alternative forms":
3310 extract_alt_form_section(wxr, select_data(), node)
3311 else:
3312 lst = t.split()
3313 while len(lst) > 1 and lst[-1].isdigit(): 3313 ↛ 3314line 3313 didn't jump to line 3314 because the condition on line 3313 was never true
3314 lst = lst[:-1]
3315 t_no_number = " ".join(lst).lower()
3316 if t_no_number in POS_TITLES:
3317 push_pos()
3318 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3319 pos = dt["pos"] or "MISSING_POS"
3320 wxr.wtp.start_subsection(t)
3321 if "debug" in dt:
3322 wxr.wtp.debug(
3323 "{} in section {}".format(dt["debug"], t),
3324 sortid="page/2755",
3325 )
3326 if "warning" in dt: 3326 ↛ 3327line 3326 didn't jump to line 3327 because the condition on line 3326 was never true
3327 wxr.wtp.wiki_notice(
3328 "{} in section {}".format(dt["warning"], t),
3329 sortid="page/2759",
3330 )
3331 if "error" in dt: 3331 ↛ 3332line 3331 didn't jump to line 3332 because the condition on line 3331 was never true
3332 wxr.wtp.error(
3333 "{} in section {}".format(dt["error"], t),
3334 sortid="page/2763",
3335 )
3336 if "note" in dt: 3336 ↛ 3337line 3336 didn't jump to line 3337 because the condition on line 3336 was never true
3337 wxr.wtp.note(
3338 "{} in section {}".format(dt["note"], t),
3339 sortid="page/20251017a",
3340 )
3341 if "wiki_notice" in dt: 3341 ↛ 3342line 3341 didn't jump to line 3342 because the condition on line 3341 was never true
3342 wxr.wtp.wiki_notice(
3343                            "{} in section {}".format(dt["wiki_notice"], t),
3344 sortid="page/20251017b",
3345 )
3346 # Parse word senses for the part-of-speech
3347 parse_part_of_speech(node, pos)
3348 if "tags" in dt:
3349 for pdata in sense_datas:
3350 data_extend(pdata, "tags", dt["tags"])
3351 elif t_no_number in LINKAGE_TITLES:
3352 # print(f"LINKAGE_TITLES NODE {node=}")
3353 rel = LINKAGE_TITLES[t_no_number]
3354 data = select_data()
3355 parse_linkage(
3356 wxr,
3357 data,
3358 rel,
3359 node,
3360 word,
3361 sense_datas,
3362 is_reconstruction,
3363 )
3364 elif t_no_number == COMPOUNDS_TITLE:
3365 data = select_data()
3366 if wxr.config.capture_compounds: 3366 ↛ 3382line 3366 didn't jump to line 3382 because the condition on line 3366 was always true
3367 parse_linkage(
3368 wxr,
3369 data,
3370 "derived",
3371 node,
3372 word,
3373 sense_datas,
3374 is_reconstruction,
3375 )
3377 # XXX parse interesting templates also from other sections. E.g.,
3378 # {{Letter|...}} in ===See also===
3379 # Also <gallery>
3381 # Recurse to children of this node, processing subtitles therein
3382 stack.append(t)
3383 process_children(node, pos)
3384 stack.pop()
3386 if len(redirect_list) > 0:
3387 if len(pos_data) > 0:
3388 pos_data["redirects"] = redirect_list
3389 if "pos" not in pos_data: 3389 ↛ 3390line 3389 didn't jump to line 3390 because the condition on line 3389 was never true
3390 pos_data["pos"] = "soft-redirect"
3391 else:
3392 new_page_data = copy.deepcopy(base_data)
3393 new_page_data["redirects"] = redirect_list
3394 if "pos" not in new_page_data: 3394 ↛ 3396line 3394 didn't jump to line 3396 because the condition on line 3394 was always true
3395 new_page_data["pos"] = "soft-redirect"
3396 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3397 page_datas.append(new_page_data)
3399 def extract_examples(
3400 others: list[WikiNode], sense_base: SenseData
3401 ) -> list[ExampleData]:
3402 """Parses through a list of definitions and quotes to find examples.
3403 Returns a list of example dicts to be added to sense data. Adds
3404 meta-data, mostly categories, into sense_base."""
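        # Rough sketch of the intended mapping (hypothetical input, assuming
        # {{ux}} is listed in usex_templates): a list item such as
        #   #: {{ux|fi|Tämä on esimerkki.|This is an example.}}
        # should come out as an ExampleData dict roughly like
        #   {"text": "Tämä on esimerkki.", "english": "This is an example.",
        #    "translation": "This is an example.", "type": "example"},
        # with the exact fields depending on the heuristics below.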
3405 assert isinstance(others, list)
3406 examples: list[ExampleData] = []
3408 for sub in others:
3409 if not sub.sarg.endswith((":", "*")): 3409 ↛ 3410line 3409 didn't jump to line 3410 because the condition on line 3409 was never true
3410 continue
3411 for item in sub.children:
3412 if not isinstance(item, WikiNode): 3412 ↛ 3413line 3412 didn't jump to line 3413 because the condition on line 3412 was never true
3413 continue
3414 if item.kind != NodeKind.LIST_ITEM: 3414 ↛ 3415line 3414 didn't jump to line 3415 because the condition on line 3414 was never true
3415 continue
3416 usex_type = None
3417 example_template_args = []
3418 example_template_names = []
3419 taxons = set()
3421                # Chinese, Japanese and quotation templates are handled by
3422                # extract_example_list_item below, bypassing the rest of
                # this function.
3423 new_example_lists = extract_example_list_item(
3424 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3425 )
3426 if len(new_example_lists) > 0:
3427 examples.extend(new_example_lists)
3428 continue
3430 def usex_template_fn(
3431 name: str, ht: TemplateArgs
3432 ) -> Optional[str]:
3433 nonlocal usex_type
3434 if is_panel_template(wxr, name):
3435 return ""
3436 if name in usex_templates:
3437 usex_type = "example"
3438 example_template_args.append(ht)
3439 example_template_names.append(name)
3440 elif name in quotation_templates:
3441 usex_type = "quotation"
3442 elif name in taxonomy_templates: 3442 ↛ 3443line 3442 didn't jump to line 3443 because the condition on line 3442 was never true
3443 taxons.update(ht.get(1, "").split())
3444 for prefix in template_linkages_to_ignore_in_examples:
3445 if re.search(
3446 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3447 ):
3448 return ""
3449 return None
3451 # bookmark
3452 ruby: list[tuple[str, str]] = []
3453 contents = item.children
3454 if lang_code == "ja":
3455 # Capture ruby contents if this is a Japanese language
3456 # example.
3457 # print(contents)
3458 if ( 3458 ↛ 3463line 3458 didn't jump to line 3463 because the condition on line 3458 was never true
3459 contents
3460                        and isinstance(contents[0], str)
3461 and re.match(r"\s*$", contents[0])
3462 ):
3463 contents = contents[1:]
3464 exp = wxr.wtp.parse(
3465 wxr.wtp.node_to_wikitext(contents),
3466 # post_template_fn=head_post_template_fn,
3467 expand_all=True,
3468 )
3469 rub, rest = extract_ruby(wxr, exp.children)
3470 if rub:
3471 for rtup in rub:
3472 ruby.append(rtup)
3473 contents = rest
3474 subtext = clean_node(
3475 wxr, sense_base, contents, template_fn=usex_template_fn
3476 )
3478 frozen_taxons = frozenset(taxons)
3479 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3481 # print(f"{subtext=}")
3482 subtext = re.sub(
3483 r"\s*\(please add an English "
3484 r"translation of this "
3485 r"(example|usage example|quote)\)",
3486 "",
3487 subtext,
3488 ).strip()
3489 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3490 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3491 # print("subtext:", repr(subtext))
3493 lines = subtext.splitlines()
3494 # print(lines)
3496 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3497 lines = list(
3498 x
3499 for x in lines
3500 if not re.match(
3501 r"(Synonyms: |Antonyms: |Hyponyms: |"
3502 r"Synonym: |Antonym: |Hyponym: |"
3503 r"Hypernyms: |Derived terms: |"
3504 r"Related terms: |"
3505 r"Hypernym: |Derived term: |"
3506 r"Coordinate terms:|"
3507 r"Related term: |"
3508 r"For more quotations using )",
3509 x,
3510 )
3511 )
3512 tr = ""
3513 ref = ""
3514 roman = ""
3515 # for line in lines:
3516 # print("LINE:", repr(line))
3517 # print(classify_desc(line))
3518 if len(lines) == 1 and lang_code != "en":
3519 parts = example_splitter_re.split(lines[0])
3520 if ( 3520 ↛ 3528line 3520 didn't jump to line 3528 because the condition on line 3520 was never true
3521 len(parts) > 2
3522 and len(example_template_args) == 1
3523 and any(
3524 ("―" in s) or ("—" in s)
3525 for s in example_template_args[0].values()
3526 )
3527 ):
3528 if nparts := synch_splits_with_args(
3529 lines[0], example_template_args[0]
3530 ):
3531 parts = nparts
3532 if ( 3532 ↛ 3537line 3532 didn't jump to line 3537 because the condition on line 3532 was never true
3533 len(example_template_args) == 1
3534 and "lit" in example_template_args[0]
3535 ):
3536 # ugly brute-force kludge in case there's a lit= arg
3537 literally = example_template_args[0].get("lit", "")
3538 if literally:
3539 literally = (
3540 " (literally, “"
3541 + clean_value(wxr, literally)
3542 + "”)"
3543 )
3544 else:
3545 literally = ""
3546 if ( 3546 ↛ 3585line 3546 didn't jump to line 3585 because the condition on line 3546 was never true
3547 len(example_template_args) == 1
3548 and len(parts) == 2
3549 and len(example_template_args[0])
3550 - (
3551 # horrible kludge to ignore these arguments
3552 # when calculating how many there are
3553 sum(
3554 s in example_template_args[0]
3555 for s in (
3556 "lit", # generates text, but we handle it
3557 "inline",
3558 "noenum",
3559 "nocat",
3560 "sort",
3561 )
3562 )
3563 )
3564 == 3
3565 and clean_value(
3566 wxr, example_template_args[0].get(2, "")
3567 )
3568 == parts[0].strip()
3569 and clean_value(
3570 wxr,
3571 (
3572 example_template_args[0].get(3)
3573 or example_template_args[0].get("translation")
3574 or example_template_args[0].get("t", "")
3575 )
3576 + literally, # in case there's a lit= argument
3577 )
3578 == parts[1].strip()
3579 ):
3580 # {{exampletemplate|ex|Foo bar baz|English translation}}
3581 # is a pretty reliable 'heuristic', so we use it here
3582 # before the others. To be extra sure the template
3583 # doesn't do anything weird, we compare the arguments
3584 # and the output to each other.
3585 lines = [parts[0].strip()]
3586 tr = parts[1].strip()
3587 elif (
3588 len(parts) == 2
3589 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3590 ):
3591 # These other branches just do some simple heuristics w/
3592 # the expanded output of the template (if applicable).
3593 lines = [parts[0].strip()]
3594 tr = parts[1].strip()
3595 elif ( 3595 ↛ 3601line 3595 didn't jump to line 3601 because the condition on line 3595 was never true
3596 len(parts) == 3
3597 and classify_desc2(parts[1])
3598 in ("romanization", "english")
3599 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3600 ):
3601 lines = [parts[0].strip()]
3602 roman = parts[1].strip()
3603 tr = parts[2].strip()
3604 else:
3605 parts = re.split(r"\s+-\s+", lines[0])
3606 if ( 3606 ↛ 3610line 3606 didn't jump to line 3610 because the condition on line 3606 was never true
3607 len(parts) == 2
3608 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3609 ):
3610 lines = [parts[0].strip()]
3611 tr = parts[1].strip()
3612 elif len(lines) > 1:
3613 if any(
3614 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3615 ) and not (len(example_template_names) == 1):
3616 refs: list[str] = []
3617 for i in range(len(lines)): 3617 ↛ 3623line 3617 didn't jump to line 3623 because the loop on line 3617 didn't complete
3618 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3618 ↛ 3619line 3618 didn't jump to line 3619 because the condition on line 3618 was never true
3619 break
3620 refs.append(lines[i].strip())
3621 if re.search(r"[]\d:)]\s*$", lines[i]):
3622 break
3623 ref = " ".join(refs)
3624 lines = lines[i + 1 :]
3625 if (
3626 lang_code != "en"
3627 and len(lines) >= 2
3628 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3629 ):
3630 i = len(lines) - 1
3631 while ( 3631 ↛ 3636line 3631 didn't jump to line 3636 because the condition on line 3631 was never true
3632 i > 1
3633 and classify_desc2(lines[i - 1])
3634 in ENGLISH_TEXTS
3635 ):
3636 i -= 1
3637 tr = "\n".join(lines[i:])
3638 lines = lines[:i]
3639 if len(lines) >= 2:
3640 if classify_desc2(lines[-1]) == "romanization":
3641 roman = lines[-1].strip()
3642 lines = lines[:-1]
3644 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3645 ref = lines[0]
3646 lines = lines[1:]
3647 elif lang_code != "en" and len(lines) == 2:
3648 cls1 = classify_desc2(lines[0])
3649 cls2 = classify_desc2(lines[1])
3650 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3651 tr = lines[1]
3652 lines = [lines[0]]
3653 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3653 ↛ 3654line 3653 didn't jump to line 3654 because the condition on line 3653 was never true
3654 tr = lines[0]
3655 lines = [lines[1]]
3656 elif ( 3656 ↛ 3663line 3656 didn't jump to line 3663 because the condition on line 3656 was never true
3657 re.match(r"^[#*]*:+", lines[1])
3658 and classify_desc2(
3659 re.sub(r"^[#*:]+\s*", "", lines[1])
3660 )
3661 in ENGLISH_TEXTS
3662 ):
3663 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3664 lines = [lines[0]]
3665 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3666 # Both were classified as English, but
3667 # presumably one is not. Assume first is
3668 # non-English, as that seems more common.
3669 tr = lines[1]
3670 lines = [lines[0]]
3671 elif (
3672 usex_type != "quotation"
3673 and lang_code != "en"
3674 and len(lines) == 3
3675 ):
3676 cls1 = classify_desc2(lines[0])
3677 cls2 = classify_desc2(lines[1])
3678 cls3 = classify_desc2(lines[2])
3679 if (
3680 cls3 == "english"
3681 and cls2 in ("english", "romanization")
3682 and cls1 != "english"
3683 ):
3684 tr = lines[2].strip()
3685 roman = lines[1].strip()
3686 lines = [lines[0].strip()]
3687 elif ( 3687 ↛ 3695line 3687 didn't jump to line 3695 because the condition on line 3687 was never true
3688 usex_type == "quotation"
3689 and lang_code != "en"
3690 and len(lines) > 2
3691 ):
3692 # for x in lines:
3693 # print(" LINE: {}: {}"
3694 # .format(classify_desc2(x), x))
3695 if re.match(r"^[#*]*:+\s*$", lines[1]):
3696 ref = lines[0]
3697 lines = lines[2:]
3698 cls1 = classify_desc2(lines[-1])
3699 if cls1 == "english":
3700 i = len(lines) - 1
3701 while (
3702 i > 1
3703 and classify_desc2(lines[i - 1])
3704                            in ENGLISH_TEXTS
3705 ):
3706 i -= 1
3707 tr = "\n".join(lines[i:])
3708 lines = lines[:i]
3710 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3711 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3712 tr = re.sub(r"^[#*:]+\s*", "", tr)
3713 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3714 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3715 ref = re.sub(r"^[#*:]+\s*", "", ref)
3716 ref = re.sub(
3717 r", (volume |number |page )?“?"
3718 r"\(please specify ([^)]|\(s\))*\)”?|"
3719 ", text here$",
3720 "",
3721 ref,
3722 )
3723 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3724 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3725 subtext = "\n".join(x for x in lines if x)
3726 if not tr and lang_code != "en":
3727 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3728 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3728 ↛ 3729line 3728 didn't jump to line 3729 because the condition on line 3728 was never true
3729 tr = m.group(2)
3730 subtext = subtext[: m.start()] + m.group(1)
3731 elif lines:
3732 parts = re.split(r"\s*[―—]+\s*", lines[0])
3733 if ( 3733 ↛ 3737line 3733 didn't jump to line 3737 because the condition on line 3733 was never true
3734 len(parts) == 2
3735 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3736 ):
3737 subtext = parts[0].strip()
3738 tr = parts[1].strip()
3739 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3740 subtext = re.sub(
3741 r"(please add an English translation of "
3742 r"this (quote|usage example))",
3743 "",
3744 subtext,
3745 )
3746 subtext = re.sub(
3747 r"\s*→New International Version " "translation$",
3748 "",
3749 subtext,
3750 ) # e.g. pis/Tok Pisin (Bible)
3751 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3752 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3753 note = None
3754 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3755 if ( 3755 ↛ 3763line 3755 didn't jump to line 3763 because the condition on line 3755 was never true
3756 m is not None
3757 and lang_code != "en"
3758 and (
3759 m.group(1).startswith("with ")
3760 or classify_desc2(m.group(1)) == "english"
3761 )
3762 ):
3763 note = m.group(1)
3764 subtext = subtext[m.end() :]
3765 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3766 ref = re.sub(r",\s*→ISBN", "", ref)
3767 ref = ref.strip()
3768 if ref.endswith(":") or ref.endswith(","):
3769 ref = ref[:-1].strip()
3770 ref = re.sub(r"\s+,\s+", ", ", ref)
3771 ref = re.sub(r"\s+", " ", ref)
3772 if ref and not subtext: 3772 ↛ 3773line 3772 didn't jump to line 3773 because the condition on line 3772 was never true
3773 subtext = ref
3774 ref = ""
3775 if subtext:
3776 dt: ExampleData = {"text": subtext}
3777 if ref:
3778 dt["ref"] = ref
3779 if tr:
3780 dt["english"] = tr # DEPRECATED for "translation"
3781 dt["translation"] = tr
3782 if usex_type:
3783 dt["type"] = usex_type
3784 if note: 3784 ↛ 3785line 3784 didn't jump to line 3785 because the condition on line 3784 was never true
3785 dt["note"] = note
3786 if roman:
3787 dt["roman"] = roman
3788 if ruby:
3789 dt["ruby"] = ruby
3790 examples.append(dt)
3792 return examples
3794 # Main code of parse_language()
3795 # Process the section
3796 stack.append(language)
3797 process_children(langnode, None)
3798 stack.pop()
3800    # Finalize word entries
3801 push_etym()
3802 ret = []
3803 for data in page_datas:
3804 merge_base(data, base_data)
3805 ret.append(data)
3807 # Copy all tags to word senses
3808 for data in ret:
3809 if "senses" not in data: 3809 ↛ 3810line 3809 didn't jump to line 3810 because the condition on line 3809 was never true
3810 continue
3811        # WordData should not have a 'tags' field, but if it does, it is
3812        # deleted and its contents are moved into each sense; that's why
3813        # the type: ignore comments below.
3814 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3815 if "tags" in data:
3816 del data["tags"] # type: ignore[typeddict-item]
3817 for sense in data["senses"]:
3818 data_extend(sense, "tags", tags)
3820 return ret
3823def parse_wikipedia_template(
3824 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
3825) -> None:
3826 """Helper function for parsing {{wikipedia|...}} and related templates."""
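    # For example (hypothetical arguments), {{wikipedia|lang=fr|Paris}} on the
    # page "Paris" appends "fr:Paris" to data["wikipedia"], while a bare
    # {{wikipedia}} appends just the page title.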
3827 assert isinstance(wxr, WiktextractContext)
3828 assert isinstance(data, dict)
3829 assert isinstance(ht, dict)
3830 langid = clean_node(wxr, data, ht.get("lang", ()))
3831 pagename = (
3832 clean_node(wxr, data, ht.get(1, ()))
3833 or wxr.wtp.title
3834 or "MISSING_PAGE_TITLE"
3835 )
3836 if langid:
3837 data_append(data, "wikipedia", langid + ":" + pagename)
3838 else:
3839 data_append(data, "wikipedia", pagename)
3842def parse_top_template(
3843 wxr: WiktextractContext, node: WikiNode, data: WordData
3844) -> None:
3845 """Parses a template that occurs on the top-level in a page, before any
3846 language subtitles."""
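    # Typical top-of-page templates seen here are {{also|...}}, {{wikipedia}},
    # {{character info}} and {{wikidata|...}}; e.g. a hypothetical
    # {{wikidata|Q64}} would append "Q64" to data["wikidata"], while most of
    # the others are currently swallowed (expanded to "").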
3847 assert isinstance(wxr, WiktextractContext)
3848 assert isinstance(node, WikiNode)
3849 assert isinstance(data, dict)
3851 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3852 if name in wikipedia_templates:
3853 parse_wikipedia_template(wxr, data, ht)
3854 return None
3855 if is_panel_template(wxr, name):
3856 return ""
3857 if name in ("reconstruction",): 3857 ↛ 3858line 3857 didn't jump to line 3858 because the condition on line 3857 was never true
3858 return ""
3859 if name.lower() == "also" or name.lower().startswith("also/"):
3860 # XXX shows related words that might really have been the intended
3861 # word, capture them
3862 return ""
3863 if name == "see also": 3863 ↛ 3865line 3863 didn't jump to line 3865 because the condition on line 3863 was never true
3864 # XXX capture
3865 return ""
3866 if name == "cardinalbox": 3866 ↛ 3868line 3866 didn't jump to line 3868 because the condition on line 3866 was never true
3867 # XXX capture
3868 return ""
3869 if name == "character info": 3869 ↛ 3871line 3869 didn't jump to line 3871 because the condition on line 3869 was never true
3870 # XXX capture
3871 return ""
3872 if name == "commonscat": 3872 ↛ 3874line 3872 didn't jump to line 3874 because the condition on line 3872 was never true
3873 # XXX capture link to Wikimedia commons
3874 return ""
3875 if name == "wrongtitle": 3875 ↛ 3878line 3875 didn't jump to line 3878 because the condition on line 3875 was never true
3876 # XXX this should be captured to replace page title with the
3877 # correct title. E.g. ⿰亻革家
3878 return ""
3879 if name == "wikidata": 3879 ↛ 3880line 3879 didn't jump to line 3880 because the condition on line 3879 was never true
3880 arg = clean_node(wxr, data, ht.get(1, ()))
3881 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
3882 data_append(data, "wikidata", arg)
3883 return ""
3884 wxr.wtp.debug(
3885 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
3886 sortid="page/2870",
3887 )
3888 return ""
3890 clean_node(wxr, None, [node], template_fn=top_template_fn)
3893def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
3894    """Fix the subtitle hierarchy to be strictly Language -> Etymology ->
3895    Pronunciation -> Part-of-Speech -> Translation/Linkage. Also merge
3896    Etymology sections that are next to each other."""
3898 # Wiktextract issue #620, Chinese Glyph Origin before an etymology
3899    # section gets overwritten. In this case, let's just combine the two.
3901 # In Chinese entries, Pronunciation can be preceded on the
3902 # same level 3 by its Etymology *and* Glyph Origin sections:
3903 # ===Glyph Origin===
3904 # ===Etymology===
3905 # ===Pronunciation===
3906 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
3907 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
3908 # are now level 6
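    # Sketch of the remapping on a hypothetical Chinese entry (assuming
    # "Glyph origin" is listed in ETYMOLOGY_TITLES):
    #   ==Chinese==            stays level 2
    #   ===Glyph origin===     retitled ===Etymology=== and merged with the
    #   ===Etymology===        following Etymology section (heading dropped)
    #   ===Pronunciation===    becomes ====Pronunciation====       (level 4)
    #   ===Noun===             becomes =====Noun=====              (level 5)
    #   ====Translations====   becomes ======Translations======    (level 6)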
3910    # Known lowercase PoS names are in POS_TITLES
3911    # Known lowercase linkage section names are in LINKAGE_TITLES
3913 old = re.split(
3914 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
3915 )
3917 parts = []
3918 npar = 4 # Number of parentheses in above expression
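    # E.g. re.split over "intro\n===Noun===\nbody" yields roughly
    #   ["intro\n", "===", "Noun", "n", "===", "\nbody"],
    # so old[i] is the left "=" run, old[i + 1] the raw title, old[i + 3] the
    # right "=" run, and old[i + npar] the section body following the heading
    # (old[i + 2] is only the pattern's inner helper group).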
3919 parts.append(old[0])
3920 prev_level = None
3921 level = None
3922 skip_level_title = False # When combining etymology sections
3923 for i in range(1, len(old), npar + 1):
3924 left = old[i]
3925 right = old[i + npar - 1]
3926 # remove Wikilinks in title
3927 title = re.sub(r"^\[\[", "", old[i + 1])
3928 title = re.sub(r"\]\]$", "", title)
3929 prev_level = level
3930 level = len(left)
3931 part = old[i + npar]
3932 if level != len(right): 3932 ↛ 3933line 3932 didn't jump to line 3933 because the condition on line 3932 was never true
3933 wxr.wtp.debug(
3934 "subtitle has unbalanced levels: "
3935 "{!r} has {} on the left and {} on the right".format(
3936 title, left, right
3937 ),
3938 sortid="page/2904",
3939 )
3940 lc = title.lower()
3941 if name_to_code(title, "en") != "":
3942 if level > 2: 3942 ↛ 3943line 3942 didn't jump to line 3943 because the condition on line 3942 was never true
3943 wxr.wtp.debug(
3944 "subtitle has language name {} at level {}".format(
3945 title, level
3946 ),
3947 sortid="page/2911",
3948 )
3949 level = 2
3950 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
3951 if level > 3: 3951 ↛ 3952line 3951 didn't jump to line 3952 because the condition on line 3951 was never true
3952 wxr.wtp.debug(
3953 "etymology section {} at level {}".format(title, level),
3954 sortid="page/2917",
3955 )
3956 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
3957 # sections cheek-to-cheek
3958 skip_level_title = True
3959 # Modify the title of previous ("Glyph Origin") section, in
3960 # case we have a meaningful title like "Etymology 1"
3961 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
3962 level = 3
3963 elif lc.startswith(PRONUNCIATION_TITLE):
3964 # Pronunciation is now a level between POS and Etymology, so
3965 # we need to shift everything down by one
3966 level = 4
3967 elif lc in POS_TITLES:
3968 level = 5
3969 elif lc == TRANSLATIONS_TITLE:
3970 level = 6
3971 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
3972 level = 6
3973 elif lc in INFLECTION_TITLES:
3974 level = 6
3975 elif lc == DESCENDANTS_TITLE:
3976 level = 6
3977 elif title in PROTO_ROOT_DERIVED_TITLES: 3977 ↛ 3978line 3977 didn't jump to line 3978 because the condition on line 3977 was never true
3978 level = 6
3979 elif lc in IGNORED_TITLES:
3980 level = 6
3981 else:
3982 level = 6
3983 if skip_level_title:
3984 skip_level_title = False
3985 parts.append(part)
3986 else:
3987 parts.append("{}{}{}".format("=" * level, title, "=" * level))
3988 parts.append(part)
3989 # print("=" * level, title)
3990 # if level != len(left):
3991 # print(" FIXED LEVEL OF {} {} -> {}"
3992 # .format(title, len(left), level))
3994 text = "".join(parts)
3995 # print(text)
3996 return text
3999def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4000 # Skip translation pages
4001 if word.endswith("/" + TRANSLATIONS_TITLE): 4001 ↛ 4002line 4001 didn't jump to line 4002 because the condition on line 4001 was never true
4002 return []
4004 if wxr.config.verbose: 4004 ↛ 4005line 4004 didn't jump to line 4005 because the condition on line 4004 was never true
4005 logger.info(f"Parsing page: {word}")
4007 wxr.config.word = word
4008 wxr.wtp.start_page(word)
4010 # Remove <noinclude> and similar tags from main pages. They
4011 # should not appear there, but at least net/Elfdala has one and it
4012 # is probably not the only one.
4013 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4014 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4015 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4017 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4018 # pages that have, for example, Translations section under Linkage, or
4019 # Translations section on the same level as Noun. Enforce a proper
4020 # hierarchy by manipulating the subtitle levels in certain cases.
4021 text = fix_subtitle_hierarchy(wxr, text)
4023 # Parse the page, pre-expanding those templates that are likely to
4024 # influence parsing
4025 tree = wxr.wtp.parse(
4026 text,
4027 pre_expand=True,
4028 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4029 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4030 )
4031 # from wikitextprocessor.parser import print_tree
4032 # print("PAGE PARSE:", print_tree(tree))
4034 top_data: WordData = {}
4036 # Iterate over top-level titles, which should be languages for normal
4037 # pages
4038 by_lang = defaultdict(list)
4039 for langnode in tree.children:
4040 if not isinstance(langnode, WikiNode):
4041 continue
4042 if langnode.kind == NodeKind.TEMPLATE:
4043 parse_top_template(wxr, langnode, top_data)
4044 continue
4045 if langnode.kind == NodeKind.LINK:
4046 # Some pages have links at top level, e.g., "trees" in Wiktionary
4047 continue
4048 if langnode.kind != NodeKind.LEVEL2: 4048 ↛ 4049line 4048 didn't jump to line 4049 because the condition on line 4048 was never true
4049 wxr.wtp.debug(
4050 f"unexpected top-level node: {langnode}", sortid="page/3014"
4051 )
4052 continue
4053 lang = clean_node(
4054 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4055 )
4056 lang_code = name_to_code(lang, "en")
4057 if lang_code == "": 4057 ↛ 4058line 4057 didn't jump to line 4058 because the condition on line 4057 was never true
4058 wxr.wtp.debug(
4059 f"unrecognized language name: {lang}", sortid="page/3019"
4060 )
4061 if (
4062 wxr.config.capture_language_codes
4063 and lang_code not in wxr.config.capture_language_codes
4064 ):
4065 continue
4066 wxr.wtp.start_section(lang)
4068 # Collect all words from the page.
4069 # print(f"{langnode=}")
4070 datas = parse_language(wxr, langnode, lang, lang_code)
4072 # Propagate fields resulting from top-level templates to this
4073 # part-of-speech.
4074 for data in datas:
4075 if "lang" not in data: 4075 ↛ 4076line 4075 didn't jump to line 4076 because the condition on line 4075 was never true
4076 wxr.wtp.debug(
4077 "internal error -- no lang in data: {}".format(data),
4078 sortid="page/3034",
4079 )
4080 continue
4081 for k, v in top_data.items():
4082 assert isinstance(v, (list, tuple))
4083 data_extend(data, k, v)
4084 by_lang[data["lang"]].append(data)
4086 # XXX this code is clearly out of date. There is no longer a "conjugation"
4087 # field. FIX OR REMOVE.
4088 # Do some post-processing on the words. For example, we may distribute
4089 # conjugation information to all the words.
4090 ret = []
4091 for lang, lang_datas in by_lang.items():
4092 ret.extend(lang_datas)
4094 for x in ret:
4095 if x["word"] != word:
4096 if word.startswith("Unsupported titles/"):
4097 wxr.wtp.debug(
4098 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4099 sortid="20231101/3578page.py",
4100 )
4101 else:
4102 wxr.wtp.debug(
4103 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4104 sortid="20231101/3582page.py",
4105 )
4106 x["original_title"] = word
4107 # validate tag data
4108 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4109 return ret
4112def recursively_separate_raw_tags(
4113 wxr: WiktextractContext, data: dict[str, Any]
4114) -> None:
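    # Split each "tags" list into tags recognized in valid_tags and everything
    # else, which is moved to "raw_tags", recursing into any list-of-dict
    # fields. For example (hypothetical input),
    # {"tags": ["plural", "dialectal-label"]} becomes
    # {"tags": ["plural"], "raw_tags": ["dialectal-label"]} when
    # "dialectal-label" is not in valid_tags.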
4115 if not isinstance(data, dict): 4115 ↛ 4116line 4115 didn't jump to line 4116 because the condition on line 4115 was never true
4116 wxr.wtp.error(
4117 "'data' is not dict; most probably "
4118 "data has a list that contains at least one dict and "
4119 "at least one non-dict item",
4120 sortid="en/page-4016/20240419",
4121 )
4122 return
4123 new_tags: list[str] = []
4124 raw_tags: list[str] = data.get("raw_tags", [])
4125 for field, val in data.items():
4126 if field == "tags":
4127 for tag in val:
4128 if tag not in valid_tags:
4129 raw_tags.append(tag)
4130 else:
4131 new_tags.append(tag)
4132 if isinstance(val, list):
4133 if len(val) > 0 and isinstance(val[0], dict):
4134 for d in val:
4135 recursively_separate_raw_tags(wxr, d)
4136 if "tags" in data and not new_tags:
4137 del data["tags"]
4138 elif new_tags:
4139 data["tags"] = new_tags
4140 if raw_tags:
4141 data["raw_tags"] = raw_tags
4144def process_soft_redirect_template(
4145 wxr: WiktextractContext,
4146 template_node: TemplateNode,
4147 redirect_pages: list[str],
4148) -> bool:
4149    # Return `True` if the template is a soft redirect template.
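    # E.g. a hypothetical {{zh-see|話}} appends "話" to redirect_pages and
    # returns True; {{ja-see|大人|おとな}} collects every positional parameter
    # the same way.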
4150 if template_node.template_name == "zh-see":
4151 # https://en.wiktionary.org/wiki/Template:zh-see
4152 title = clean_node(
4153 wxr, None, template_node.template_parameters.get(1, "")
4154 )
4155 if title != "": 4155 ↛ 4157line 4155 didn't jump to line 4157 because the condition on line 4155 was always true
4156 redirect_pages.append(title)
4157 return True
4158 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4159 # https://en.wiktionary.org/wiki/Template:ja-see
4160 for key, value in template_node.template_parameters.items():
4161 if isinstance(key, int): 4161 ↛ 4160line 4161 didn't jump to line 4160 because the condition on line 4161 was always true
4162 title = clean_node(wxr, None, value)
4163 if title != "": 4163 ↛ 4160line 4163 didn't jump to line 4160 because the condition on line 4163 was always true
4164 redirect_pages.append(title)
4165 return True
4166 return False
4169ZH_FORMS_TAGS = {
4170 "trad.": "Traditional-Chinese",
4171 "simp.": "Simplified-Chinese",
4172 "alternative forms": "alternative",
4173 "2nd round simp.": "Second-Round-Simplified-Chinese",
4174}
4177def extract_zh_forms_template(
4178 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4179):
4180 # https://en.wiktionary.org/wiki/Template:zh-forms
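    # The expanded template is a small table whose header cells carry labels
    # like "trad." or "simp." and whose data cells hold the written forms; a
    # hypothetical row labelled "trad." with the form 漢語 would yield
    # {"form": "漢語", "tags": ["Traditional-Chinese"]} via ZH_FORMS_TAGS.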
4181 lit_meaning = clean_node(
4182 wxr, None, t_node.template_parameters.get("lit", "")
4183 )
4184 if lit_meaning != "":
4185 base_data["literal_meaning"] = lit_meaning
4186 expanded_node = wxr.wtp.parse(
4187 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4188 )
4189 for table in expanded_node.find_child(NodeKind.TABLE):
4190 for row in table.find_child(NodeKind.TABLE_ROW):
4191 row_header = ""
4192 row_header_tags: list[str] = []
4193 header_has_span = False
4194 for cell in row.find_child(
4195 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4196 ):
4197 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4198 row_header, row_header_tags, header_has_span = (
4199 extract_zh_forms_header_cell(wxr, base_data, cell)
4200 )
4201 elif not header_has_span:
4202 extract_zh_forms_data_cell(
4203 wxr, base_data, cell, row_header, row_header_tags
4204 )
4206 if "forms" in base_data and len(base_data["forms"]) == 0: 4206 ↛ 4207line 4206 didn't jump to line 4207 because the condition on line 4206 was never true
4207 del base_data["forms"]
4210def extract_zh_forms_header_cell(
4211 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4212) -> tuple[str, list[str], bool]:
4213 row_header = ""
4214 row_header_tags = []
4215 header_has_span = False
4216 first_span_index = len(header_cell.children)
4217 for index, span_tag in header_cell.find_html("span", with_index=True):
4218 if index < first_span_index: 4218 ↛ 4220line 4218 didn't jump to line 4220 because the condition on line 4218 was always true
4219 first_span_index = index
4220 header_has_span = True
4221 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4222 for raw_tag in row_header.split(" and "):
4223 raw_tag = raw_tag.strip()
4224 if raw_tag != "":
4225 row_header_tags.append(raw_tag)
4226 for span_tag in header_cell.find_html_recursively("span"):
4227 span_lang = span_tag.attrs.get("lang", "")
4228 form_nodes = []
4229 sup_title = ""
4230 for node in span_tag.children:
4231 if isinstance(node, HTMLNode) and node.tag == "sup": 4231 ↛ 4232line 4231 didn't jump to line 4232 because the condition on line 4231 was never true
4232 for sup_span in node.find_html("span"):
4233 sup_title = sup_span.attrs.get("title", "")
4234 else:
4235 form_nodes.append(node)
4236 if span_lang in ["zh-Hant", "zh-Hans"]:
4237 for word in clean_node(wxr, None, form_nodes).split("/"):
4238 if word not in [wxr.wtp.title, ""]:
4239 form = {"form": word}
4240 for raw_tag in row_header_tags:
4241 if raw_tag in ZH_FORMS_TAGS: 4241 ↛ 4244line 4241 didn't jump to line 4244 because the condition on line 4241 was always true
4242 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4243 else:
4244 data_append(form, "raw_tags", raw_tag)
4245 if sup_title != "": 4245 ↛ 4246line 4245 didn't jump to line 4246 because the condition on line 4245 was never true
4246 data_append(form, "raw_tags", sup_title)
4247 data_append(base_data, "forms", form)
4248 return row_header, row_header_tags, header_has_span
4251TagLiteral = Literal["tags", "raw_tags"]
4252TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
4255def extract_zh_forms_data_cell(
4256 wxr: WiktextractContext,
4257 base_data: WordData,
4258 cell: WikiNode,
4259 row_header: str,
4260 row_header_tags: list[str],
4261) -> None:
4262 from .zh_pron_tags import ZH_PRON_TAGS
4264 forms: list[FormData] = []
4265 for top_span_tag in cell.find_html("span"):
4266 span_style = top_span_tag.attrs.get("style", "")
4267 span_lang = top_span_tag.attrs.get("lang", "")
4268 if span_style == "white-space:nowrap;":
4269 extract_zh_forms_data_cell(
4270 wxr, base_data, top_span_tag, row_header, row_header_tags
4271 )
4272 elif "font-size:80%" in span_style:
4273 raw_tag = clean_node(wxr, None, top_span_tag)
4274 if raw_tag != "": 4274 ↛ 4265line 4274 didn't jump to line 4265 because the condition on line 4274 was always true
4275 for form in forms:
4276 if raw_tag in ZH_PRON_TAGS: 4276 ↛ 4282line 4276 didn't jump to line 4282 because the condition on line 4276 was always true
4277 tr_tag = ZH_PRON_TAGS[raw_tag]
4278 if isinstance(tr_tag, list): 4278 ↛ 4279line 4278 didn't jump to line 4279 because the condition on line 4278 was never true
4279 data_extend(form, "tags", tr_tag)
4280 elif isinstance(tr_tag, str): 4280 ↛ 4275line 4280 didn't jump to line 4275 because the condition on line 4280 was always true
4281 data_append(form, "tags", tr_tag)
4282 elif raw_tag in valid_tags:
4283 data_append(form, "tags", raw_tag)
4284 else:
4285 data_append(form, "raw_tags", raw_tag)
4286 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]: 4286 ↛ 4265line 4286 didn't jump to line 4265 because the condition on line 4286 was always true
4287 word = clean_node(wxr, None, top_span_tag)
4288 if word not in ["", "/", wxr.wtp.title]:
4289 form = {"form": word}
4290 if row_header != "anagram": 4290 ↛ 4296line 4290 didn't jump to line 4296 because the condition on line 4290 was always true
4291 for raw_tag in row_header_tags:
4292 if raw_tag in ZH_FORMS_TAGS: 4292 ↛ 4295line 4292 didn't jump to line 4295 because the condition on line 4292 was always true
4293 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4294 else:
4295 data_append(form, "raw_tags", raw_tag)
4296 if span_lang == "zh-Hant":
4297 data_append(form, "tags", "Traditional-Chinese")
4298 elif span_lang == "zh-Hans":
4299 data_append(form, "tags", "Simplified-Chinese")
4300 forms.append(form)
4302 if row_header == "anagram": 4302 ↛ 4303line 4302 didn't jump to line 4303 because the condition on line 4302 was never true
4303 for form in forms:
4304 l_data: LinkageData = {"word": form["form"]}
4305 for key in TAG_LITERALS_TUPLE:
4306 if key in form:
4307 l_data[key] = form[key]
4308 data_append(base_data, "anagrams", l_data)
4309 else:
4310 data_extend(base_data, "forms", forms)
4313def extract_ja_kanjitab_template(
4314 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4315):
4316 # https://en.wiktionary.org/wiki/Template:ja-kanjitab
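    # The kanjitab table may include an "Alternative spelling" section; a
    # hypothetical cell like <span>空気</span> <small>(kyūjitai)</small> would
    # produce {"form": "空気", "tags": ["alternative", "kanji"]}, with the
    # parenthesized label appended to "tags" or "raw_tags" depending on
    # whether it is in valid_tags.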
4317 expanded_node = wxr.wtp.parse(
4318 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4319 )
4320 for table in expanded_node.find_child(NodeKind.TABLE):
4321 is_alt_form_table = False
4322 for row in table.find_child(NodeKind.TABLE_ROW):
4323 for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
4324 header_text = clean_node(wxr, None, header_node)
4325 if header_text.startswith("Alternative spelling"):
4326 is_alt_form_table = True
4327 if not is_alt_form_table:
4328 continue
4329 forms = []
4330 for row in table.find_child(NodeKind.TABLE_ROW):
4331 for cell_node in row.find_child(NodeKind.TABLE_CELL):
4332 for child_node in cell_node.children:
4333 if isinstance(child_node, HTMLNode):
4334 if child_node.tag == "span":
4335 word = clean_node(wxr, None, child_node)
4336 if word != "": 4336 ↛ 4332line 4336 didn't jump to line 4332 because the condition on line 4336 was always true
4337 forms.append(
4338 {
4339 "form": word,
4340 "tags": ["alternative", "kanji"],
4341 }
4342 )
4343 elif child_node.tag == "small":
4344 raw_tag = clean_node(wxr, None, child_node).strip(
4345 "()"
4346 )
4347 if raw_tag != "" and len(forms) > 0: 4347 ↛ 4332line 4347 didn't jump to line 4332 because the condition on line 4347 was always true
4348 data_append(
4349 forms[-1],
4350 "tags"
4351 if raw_tag in valid_tags
4352 else "raw_tags",
4353 raw_tag,
4354 )
4355 data_extend(base_data, "forms", forms)
4356 for link_node in expanded_node.find_child(NodeKind.LINK):
4357 clean_node(wxr, base_data, link_node)