Coverage for src/wiktextract/extractor/en/page.py: 44%
1905 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8import sys
9from collections import defaultdict
10from functools import partial
11from typing import (
12 TYPE_CHECKING,
13 Any,
14 Iterable,
15 Iterator,
16 Optional,
17 Set,
18 Union,
19 cast,
20)
22from mediawiki_langcodes import get_all_names, name_to_code
23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
24from wikitextprocessor.parser import (
25 LEVEL_KIND_FLAGS,
26 GeneralNode,
27 NodeKind,
28 TemplateNode,
29 WikiNode,
30)
32from ...clean import clean_template_args, clean_value
33from ...datautils import (
34 data_append,
35 data_extend,
36 ns_title_prefix_tuple,
37)
38from ...page import (
39 LEVEL_KINDS,
40 clean_node,
41 is_panel_template,
42 recursively_extract,
43)
44from ...tags import valid_tags
45from ...wxr_context import WiktextractContext
46from ...wxr_logging import logger
47from ..ruby import extract_ruby, parse_ruby
48from ..share import strip_nodes
49from .example import extract_example_list_item, extract_template_zh_x
50from .form_descriptions import (
51 classify_desc,
52 decode_tags,
53 distw,
54 parse_alt_or_inflection_of,
55 parse_sense_qualifier,
56 parse_word_head,
57)
58from .inflection import TableContext, parse_inflection_section
59from .info_templates import (
60 INFO_TEMPLATE_FUNCS,
61 parse_info_template_arguments,
62 parse_info_template_node,
63)
64from .linkages import parse_linkage_item_text
65from .parts_of_speech import PARTS_OF_SPEECH
66from .section_titles import (
67 COMPOUNDS_TITLE,
68 DESCENDANTS_TITLE,
69 ETYMOLOGY_TITLES,
70 IGNORED_TITLES,
71 INFLECTION_TITLES,
72 LINKAGE_TITLES,
73 POS_TITLES,
74 PRONUNCIATION_TITLE,
75 PROTO_ROOT_DERIVED_TITLES,
76 TRANSLATIONS_TITLE,
77)
78from .translations import parse_translation_item_text
79from .type_utils import (
80 DescendantData,
81 ExampleData,
82 FormData,
83 LinkageData,
84 SenseData,
85 SoundData,
86 TemplateData,
87 WordData,
88)
89from .unsupported_titles import unsupported_title_map
# classify_desc() results that we accept as "English" text.  Besides
# "english" itself, "taxonomic" is included because taxonomic names
# are English text 99% of the time.
ENGLISH_TEXTS = ("english", "taxonomic")
# Matches head tags, i.e. names of templates that generate the bold
# head line of an entry.  Two alternatives: a small fixed set of
# literal template names, or "<lang>-<pos-ish>" names where <lang> is
# "latin" or any language code known to mediawiki_langcodes and
# <pos-ish> is one of the part-of-speech-like suffixes listed below,
# optionally followed by "-", "/" or "+" and further text.
HEAD_TAG_RE = re.compile(
    r"^(head|Han char|arabic-noun|arabic-noun-form|"
    r"hangul-symbol|syllable-hangul)$|"
    + r"^(latin|"
    # get_all_names("en") yields tuples whose first element is a
    # language code
    + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
    + r")-("
    + "|".join(
        [
            "abbr", "adj", "adjective", "adjective form", "adjective-form",
            "adv", "adverb", "affix", "animal command", "art", "article",
            "aux", "bound pronoun", "bound-pronoun", "Buyla", "card num",
            "card-num", "cardinal", "chunom", "classifier", "clitic",
            "cls", "cmene", "cmavo", "colloq-verb", "colverbform",
            "combining form", "combining-form", "comparative", "con",
            "concord", "conj", "conjunction", "conjug", "cont", "contr",
            "converb", "daybox", "decl", "decl noun", "def", "dem", "det",
            "determ", "Deva", "ending", "entry", "form", "fuhivla",
            "gerund", "gismu", "hanja", "hantu", "hanzi", "head",
            "ideophone", "idiom", "inf", "indef", "infixed pronoun",
            "infixed-pronoun", "infl", "inflection", "initialism", "int",
            "interfix", "interj", "interjection", "jyut", "latin",
            "letter", "locative", "lujvo", "monthbox", "mutverb", "name",
            "nisba", "nom", "noun", "noun form", "noun-form",
            "noun plural", "noun-plural", "nounprefix", "num", "number",
            "numeral", "ord", "ordinal", "par", "part", "part form",
            "part-form", "participle", "particle", "past", "past neg",
            "past-neg", "past participle", "past-participle",
            "perfect participle", "perfect-participle",
            "personal pronoun", "personal-pronoun", "pref", "prefix",
            "phrase", "pinyin", "plural noun", "plural-noun", "pos",
            "poss-noun", "post", "postp", "postposition", "PP", "pp",
            "ppron", "pred", "predicative", "prep", "prep phrase",
            "prep-phrase", "preposition", "present participle",
            "present-participle", "pron", "prondem", "pronindef",
            "pronoun", "prop", "proper noun", "proper-noun",
            "proper noun form", "proper-noun form", "proper noun-form",
            "proper-noun-form", "prov", "proverb", "prpn", "prpr",
            "punctuation mark", "punctuation-mark", "regnoun", "rel",
            "rom", "romanji", "root", "sign", "suff", "suffix",
            "syllable", "symbol", "verb", "verb form", "verb-form",
            "verbal noun", "verbal-noun", "verbnec", "vform",
        ]
    )
    + r")(-|/|\+|$)"
)
# Templates (e.g. az-suffix-forms) that create a style=floatright div
# which would otherwise be deleted by cleaning; by keeping them in
# do_not_pre_expand we can intercept the unexpanded templates in
# parse_part_of_speech instead.
FLOATING_TABLE_TEMPLATES: set[str] = {
    "az-inf-p",
    "az-suffix-forms",
    "kk-suffix-forms",
    "ky-suffix-forms",
    "tr-inf-p",
    "tr-suffix-forms",
    "tt-suffix-forms",
    "uz-suffix-forms",
}

# Template names that should NOT be pre-expanded when first processing
# the tree, so that they remain in place with their identifying name
# intact for later filtering.
DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set(FLOATING_TABLE_TEMPLATES)
# Additional templates to be expanded in the pre-expand phase.  The
# column templates col1..col5 and col1-u..col5-u are generated rather
# than listed by hand.
ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
    "multitrans",
    "multitrans-nowiki",
    "trans-top",
    "trans-top-also",
    "trans-bottom",
    "checktrans-top",
    "checktrans-bottom",
    "check deprecated lang param usage",
    "deprecated code",
    "ru-verb-alt-ё",
    "ru-noun-alt-ё",
    "ru-adj-alt-ё",
    "ru-proper noun-alt-ё",
    "ru-pos-alt-ё",
    "ru-alt-ё",
    "inflection of",
    "no deprecated lang param usage",
} | {f"col{i}{suffix}" for i in range(1, 6) for suffix in ("", "-u")}
# Inverse linkage relation for those linkage fields that have one.
# XXX this is not currently used; move to post-processing.
linkage_inverses: dict[str, str] = {
    "antonyms": "antonyms",
    "coordinate_terms": "coordinate_terms",
    "derived": "derived_from",
    "holonyms": "meronyms",
    "hypernyms": "hyponyms",
    "hyponyms": "hypernyms",
    "instances": "instance_of",
    "meronyms": "holonyms",
    "related": "related",
    "synonyms": "synonyms",
    "troponyms": "hypernyms",
}
# Templates that are used to form panels on pages and that
# should be ignored in various positions.  Note that both capitalized
# and lowercase variants are listed where Wiktionary has both
# (e.g. "Character info" / "character info").
PANEL_TEMPLATES: set[str] = {
    "Character info", "CJKV", "French personal pronouns",
    "French possessive adjectives", "French possessive pronouns",
    "Han etym", "Japanese demonstratives", "Latn-script", "LDL",
    "MW1913Abbr", "Number-encoding", "Nuttall",
    "Spanish possessive adjectives", "Spanish possessive pronouns",
    "USRegionDisputed", "Webster 1913", "ase-rfr", "attention", "attn",
    "beer", "broken ref", "ca-compass", "character info",
    "character info/var", "checksense", "compass-fi",
    "copyvio suspected", "delete",
    "dial syn",  # Currently ignore these, but could be useful in Chinese/Korean
    "etystub", "examples", "hu-corr", "hu-suff-pron", "interwiktionary",
    "ja-kanjitab", "ko-hanja-search", "look", "maintenance box",
    "maintenance line", "mediagenic terms", "merge", "missing template",
    "morse links", "move", "multiple images", "no inline", "picdic",
    "picdicimg", "picdiclabel", "polyominoes", "predidential nomics",
    "punctuation",  # This actually gets pre-expanded
    "reconstructed", "request box", "rf-sound example", "rfaccents",
    "rfap", "rfaspect", "rfc", "rfc-auto", "rfc-header", "rfc-level",
    "rfc-pron-n", "rfc-sense", "rfclarify", "rfd", "rfd-redundant",
    "rfd-sense", "rfdate", "rfdatek", "rfdef", "rfe", "rfe/dowork",
    "rfex", "rfexp", "rfform", "rfgender", "rfi", "rfinfl", "rfm",
    "rfm-sense", "rfp", "rfp-old", "rfquote", "rfquote-sense",
    "rfquotek", "rfref", "rfscript", "rft2", "rftaxon", "rftone",
    "rftranslit", "rfv", "rfv-etym", "rfv-pron", "rfv-quote",
    "rfv-sense", "selfref", "split",
    "stroke order",  # XXX consider capturing this?
    "stub entry", "t-needed", "tbot entry", "tea room",
    "tea room sense",
    # "ttbc", - XXX needed in at least on/Preposition/Translation page
    "unblock", "unsupportedpage", "video frames", "was wotd",
    "wrongtitle", "zh-forms", "zh-hanzi-box",
}
# Lookup table mapping the labels used for Chinese dialectal synonyms
# to the tag lists we emit for them; an empty list means the label is
# recognized but yields no tags.
zh_tag_lookup: dict[str, list[str]] = {
    "Formal": ["formal"],
    "historical or Internet slang": ["historical", "internet-slang"],
    "lofty": [],
    "now usually derogatory or offensive": ["offensive", "derogatory"],
    "Written-Standard-Chinese": ["Standard-Chinese"],
}
# Template-name prefixes used for language-specific panel templates
# (i.e., templates that create side boxes or notice boxes or that
# should generally be ignored).
PANEL_PREFIXES: set[str] = {
    "RQ:",
    "list:Gregorian calendar months/",
    "list:compass points/",
}
# Names of templates that produce links to Wikipedia articles.
wikipedia_templates: set[str] = {
    "W",
    "Wikipedia",
    "slim-wikipedia",
    "swp",
    "w",
    "wiki",
    "wikipedia",
    "wtorw",
}
# Import-time sanity check: a template name must not be handled both as
# a panel template and as a wikipedia-link template, since the two get
# different treatment.  The original code intersected PANEL_PREFIXES
# (name *prefixes* ending in ":" or "/") with wikipedia_templates,
# which can never match a full template name; the warning text shows
# PANEL_TEMPLATES was intended.
for x in PANEL_TEMPLATES & wikipedia_templates:
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )
# Mapping from a template name (without language prefix) for the main word
# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
# it could validly occur.  This is used as just a sanity check to give
# warnings about probably incorrect coding in Wiktionary.
# Every value listed here must be a member of PARTS_OF_SPEECH (this is
# verified by the import-time check below the mapping).
template_allowed_pos_map: dict[str, list[str]] = {
    "abbr": ["abbrev"],
    "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
    "plural noun": ["noun", "name"],
    "plural-noun": ["noun", "name"],
    "proper noun": ["noun", "name"],
    "proper-noun": ["name", "noun"],
    "prop": ["name", "noun"],
    "verb": ["verb", "phrase"],
    "gerund": ["verb"],
    "particle": ["adv", "particle"],
    "adj": ["adj", "adj_noun"],
    "pron": ["pron", "noun"],
    "name": ["name", "noun"],
    "adv": ["adv", "intj", "conj", "particle"],
    "phrase": ["phrase", "prep_phrase"],
    "noun phrase": ["phrase"],
    "ordinal": ["num"],
    "number": ["num"],
    "pos": ["affix", "name", "num"],
    "suffix": ["suffix", "affix"],
    "character": ["character"],
    "letter": ["character"],
    "kanji": ["character"],
    "cont": ["abbrev"],
    "interj": ["intj"],
    "con": ["conj"],
    "part": ["particle"],
    "prep": ["prep", "postp"],
    "postp": ["postp"],
    "misspelling": ["noun", "adj", "verb", "adv"],
    "part-form": ["verb"],
}
# Import-time sanity check: every part-of-speech listed in
# template_allowed_pos_map must be a known one.  Raise explicitly with
# the diagnostic message instead of print() + bare `assert False`,
# which would be silently stripped when Python runs with -O.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            raise AssertionError(
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
# Templates ignored during etymology extraction, i.e., these will not be listed
# in the extracted etymology templates.  (Duplicates such as "cite news"
# are harmless: the list only feeds the anchored alternation below.)
ignored_etymology_templates: list[str] = [
    "...", "IPAchar", "ipachar", "ISBN", "isValidPageName",
    "redlink category", "deprecated code",
    "check deprecated lang param usage", "para", "p", "cite",
    "Cite news", "Cite newsgroup", "cite paper", "cite MLLM 1976",
    "cite journal", "cite news/documentation",
    "cite paper/documentation", "cite video game",
    "cite video game/documentation", "cite newsgroup",
    "cite newsgroup/documentation", "cite web/documentation",
    "cite news", "Cite book", "Cite-book", "cite book", "cite web",
    "cite-usenet", "cite-video/documentation", "Cite-journal", "rfe",
    "catlangname", "cln", "langname-lite",
    "no deprecated lang param usage", "mention", "m", "m-self", "link",
    "l", "ll", "l-self",
]

# Regexp for matching ignored etymology template names.  This adds certain
# prefixes (cite-, R:, RQ:) to the names listed above.  The pattern is
# anchored with ^...$, so only complete template names match.
ignored_etymology_templates_re = re.compile(
    r"^((cite-|R:|RQ:).*|"
    + r"|".join(re.escape(x) for x in ignored_etymology_templates)
    + r")$"
)

# Regexp for matching ignored descendants template names. Right now we just
# copy the ignored etymology templates
ignored_descendants_templates_re = ignored_etymology_templates_re
# Set of template names that are used to define usage examples.  If a usage
# example contains one of these templates, then its type is set to
# "example".
usex_templates: set[str] = {
    "afex", "affixusex",
    "co",  # {{collocation}} acts like an example template, specifically for
    # pairs of combinations of words that are more common than you'd
    # expect randomly; hlavní#Czech
    "coi", "collocation", "el-example", "el-x", "example", "examples",
    "he-usex", "he-x", "hi-usex", "hi-x", "ja-usex-inline", "ja-usex",
    "ja-x", "jbo-example", "jbo-x", "km-usex", "km-x", "ko-usex",
    "ko-x", "lo-usex", "lo-x", "ne-x", "ne-usex", "prefixusex",
    "ryu-usex", "ryu-x", "shn-usex", "shn-x", "suffixusex", "th-usex",
    "th-x", "ur-usex", "ur-x", "usex", "usex-suffix", "ux", "uxi",
}
# Template names at which parsing of the head line stops (category and
# topic templates that follow the actual head content).
stop_head_at_these_templates: set[str] = {
    "C",
    "c",
    "cat",
    "category",
    "catlangname",
    "cln",
    "top",
    "topics",
}
# Set of template names that are used to define quotation examples.  If
# a usage example contains one of these templates, then its type is set
# to "quotation".
quotation_templates: set[str] = {
    "Wikiquote",
    "collapse-quote",
    "quote",
    "quote-GYLD",
    "quote-av",
    "quote-book",
    "quote-hansard",
    "quote-journal",
    "quote-mailing list",
    "quote-meta",
    "quote-newsgroup",
    "quote-song",
    "quote-text",
    "quote-us-patent",
    "quote-video game",
    "quote-web",
    "quote-wikipedia",
    "quotei",
    "quotelite",
    "wikiquote",
}
# Templates whose first argument is a taxonomic name, e.g.
# "Lupus lupus".
taxonomy_templates: set[str] = {
    "taxfmt",
    "taxlink",
    "taxlink2",
    "taxlinknew",
    "taxlook",
}
# Template name component to linkage section listing.  An integer
# section means the default section, starting at that argument.
# XXX not used anymore, except for the first elements: moved to
# template_linkages
# template_linkage_mappings: list[list[Union[str, int]]] = [
#     ["syn", "synonyms"],
#     ["synonyms", "synonyms"],
#     ["ant", "antonyms"],
#     ["antonyms", "antonyms"],
#     ["hyp", "hyponyms"],
#     ["hyponyms", "hyponyms"],
#     ["der", "derived"],
#     ["derived terms", "derived"],
#     ["coordinate terms", "coordinate_terms"],
#     ["rel", "related"],
#     ["col", 2],
# ]

# Template names; this was extracted from template_linkage_mappings,
# because the code using template_linkage_mappings was actually not
# used (but not removed).
template_linkages: set[str] = {
    "ant",
    "antonyms",
    "col",
    "coordinate terms",
    "der",
    "derived terms",
    "hyp",
    "hyponyms",
    "rel",
    "syn",
    "synonyms",
}
# Maps a template name used inside a word sense to the linkage field
# that its contents are appended to.
sense_linkage_templates: dict[str, str] = {
    "ant": "antonyms",
    "antonyms": "antonyms",
    "hyp": "hyponyms",
    "hyponyms": "hyponyms",
    "syn": "synonyms",
    "synonyms": "synonyms",
}
def decode_html_entities(v: Union[str, int]) -> str:
    """Decode HTML entities in *v* and return the resulting string.

    Integers are simply converted with str(): html.unescape() would not
    do anything meaningful with them anyway, since it only acts on
    entity escapes such as "&amp;".
    """
    return str(v) if isinstance(v, int) else html.unescape(v)
def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
) -> None:
    """Parses a sense-level linkage template ({{syn}}, {{ant}}, ...) and
    appends the extracted entries to the corresponding field of ``data``
    (the sense dictionary).  Linked words are in positional arguments
    starting at 2; per-word qualifiers and English glosses are in
    ``q1``/``q2``/... and ``t1``/``t2``/... (numbered from 1, hence the
    ``i - 1`` below)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    # Template name determines which linkage field we append to.
    field = sense_linkage_templates[name]
    # Words are in positional arguments 2..19; stop at the first empty one.
    for i in range(2, 20):
        w = ht.get(i) or ""
        w = clean_node(wxr, data, w)
        # Strip a "Thesaurus:" namespace prefix (or localized alias).
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                w = w[len(alias) :]
                break
        if not w:
            break
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # Multiple English qualifiers are joined with "; ".
                if english:
                    english += "; " + q
                else:
                    english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt form at the end,
        # e.g. "word (alt)"; if so, move it into the "alt" field.
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        dt = {"word": w}
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)
# Horizontal-bar / em-dash sequences (with surrounding whitespace) that
# separate the parts of an inline example line.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# Same pattern wrapped in a capturing group so re.split() keeps the
# separator strings in its output.
captured_splitters_re = re.compile("({})".format(EXAMPLE_SPLITTERS))


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """Re-split an example line when the naive dash-split looks wrong.

    Counts occurrences of the splitter regex inside the two main
    template arguments holding the original-language text (argument 2)
    and the English translation (argument 3 / "translation" / "t"),
    then regroups the dash-separated pieces of ``line`` accordingly.
    Returns the regrouped parts, or None if any main part came out
    empty.
    """
    # Split while keeping the separators so the pieces can be glued
    # back together without losing the original dashes.
    pieces = captured_splitters_re.split(line)
    regrouped: list[str] = []
    # Pieces belonging to the original-language argument: each splitter
    # inside it accounts for two extra entries (separator + text), e.g.
    # ["First", " – ", "second", " – ", "third..."].
    end_orig = 1 + 2 * len(example_splitter_re.findall(targs.get(2, "")))
    regrouped.append("".join(pieces[:end_orig]))
    # The translation may be in positional argument 3 or in a named
    # parameter.
    tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
    # +2 = +1 to skip the "expected" separator between the two parts,
    # +1 for the same off-by-one as above.
    end_tr = end_orig + 2 + 2 * len(example_splitter_re.findall(tr_arg))
    regrouped.append("".join(pieces[end_orig + 1 : end_tr]))

    if not all(regrouped):  # an empty string slipped through: give up
        return None
    # Keep any remaining text pieces, skipping the separator entries.
    regrouped.extend(pieces[end_tr + 1 :: 2])
    return regrouped
# Matches a leading parenthesized qualifier, i.e. "(...): ..." or
# "(...(...)...): ...", allowing one level of nested parentheses;
# group 1 captures the qualifier text without the outer parentheses.
QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
QUALIFIERS_RE = re.compile(QUALIFIERS)
854def parse_language(
855 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
856) -> list[WordData]:
857 """Iterates over the text of the page, returning words (parts-of-speech)
858 defined on the page one at a time. (Individual word senses for the
859 same part-of-speech are typically encoded in the same entry.)"""
860 # imported here to avoid circular import
861 from .pronunciation import parse_pronunciation
863 assert isinstance(wxr, WiktextractContext)
864 assert isinstance(langnode, WikiNode)
865 assert isinstance(language, str)
866 assert isinstance(lang_code, str)
867 # print("parse_language", language)
869 is_reconstruction = False
870 word: str = wxr.wtp.title # type: ignore[assignment]
871 unsupported_prefix = "Unsupported titles/"
872 if word.startswith(unsupported_prefix):
873 w = word[len(unsupported_prefix) :]
874 if w in unsupported_title_map: 874 ↛ 877line 874 didn't jump to line 877 because the condition on line 874 was always true
875 word = unsupported_title_map[w]
876 else:
877 wxr.wtp.error(
878 "Unimplemented unsupported title: {}".format(word),
879 sortid="page/870",
880 )
881 word = w
882 elif word.startswith("Reconstruction:"): 882 ↛ 883line 882 didn't jump to line 883 because the condition on line 882 was never true
883 word = word[word.find("/") + 1 :]
884 is_reconstruction = True
886 base_data: WordData = {
887 "word": word,
888 "lang": language,
889 "lang_code": lang_code,
890 }
891 if is_reconstruction: 891 ↛ 892line 891 didn't jump to line 892 because the condition on line 891 was never true
892 data_append(base_data, "tags", "reconstruction")
893 sense_data: SenseData = {}
894 pos_data: WordData = {} # For a current part-of-speech
895 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
896 etym_data: WordData = {} # For one etymology
897 pos_datas: list[SenseData] = []
898 level_four_datas: list[WordData] = []
899 etym_datas: list[WordData] = []
900 page_datas: list[WordData] = []
901 have_etym = False
902 inside_level_four = False # This is for checking if the etymology section
903 # or article has a Pronunciation section, for Chinese mostly; because
904 # Chinese articles can have three level three sections (two etymology
905 # sections and pronunciation sections) one after another, we need a kludge
906 # to better keep track of whether we're in a normal "etym" or inside a
907 # "level four" (which is what we've turned the level three Pron sections
908 # into in the fix_subtitle_hierarchy(); all other sections are demoted by
909 # a step.
910 stack: list[str] = [] # names of items on the "stack"
912 def merge_base(data: WordData, base: WordData) -> None:
913 for k, v in base.items():
914 # Copy the value to ensure that we don't share lists or
915 # dicts between structures (even nested ones).
916 v = copy.deepcopy(v)
917 if k not in data:
918 # The list was copied above, so this will not create shared ref
919 data[k] = v # type: ignore[literal-required]
920 continue
921 if data[k] == v: # type: ignore[literal-required] 921 ↛ 923line 921 didn't jump to line 923 because the condition on line 921 was always true
922 continue
923 if (
924 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
925 or isinstance(
926 v,
927 (list, tuple), # Should this be "and"?
928 )
929 ):
930 data[k] = list(data[k]) + list(v) # type: ignore
931 elif data[k] != v: # type: ignore[literal-required]
932 wxr.wtp.warning(
933 "conflicting values for {} in merge_base: "
934 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
935 sortid="page/904",
936 )
938 def complementary_pop(pron: SoundData, key: str) -> SoundData:
939 """Remove unnecessary keys from dict values
940 in a list comprehension..."""
941 if key in pron:
942 pron.pop(key) # type: ignore
943 return pron
945 # If the result has sounds, eliminate sounds that have a prefix that
946 # does not match "word" or one of "forms"
947 if "sounds" in data and "word" in data: 947 ↛ 948line 947 didn't jump to line 948 because the condition on line 947 was never true
948 accepted = [data["word"]]
949 accepted.extend(f["form"] for f in data.get("forms", dict()))
950 data["sounds"] = list(
951 s
952 for s in data["sounds"]
953 if "form" not in s or s["form"] in accepted
954 )
955 # If the result has sounds, eliminate sounds that have a pos that
956 # does not match "pos"
957 if "sounds" in data and "pos" in data: 957 ↛ 958line 957 didn't jump to line 958 because the condition on line 957 was never true
958 data["sounds"] = list(
959 complementary_pop(s, "pos")
960 for s in data["sounds"]
961 # "pos" is not a field of SoundData, correctly, so we're
962 # removing it here. It's a kludge on a kludge on a kludge.
963 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
964 )
966 def push_sense() -> bool:
967 """Starts collecting data for a new word sense. This returns True
968 if a sense was added."""
969 nonlocal sense_data
970 tags = sense_data.get("tags", ())
971 if (
972 not sense_data.get("glosses")
973 and "translation-hub" not in tags
974 and "no-gloss" not in tags
975 ):
976 return False
978 if ( 978 ↛ 988line 978 didn't jump to line 988
979 (
980 "participle" in sense_data.get("tags", ())
981 or "infinitive" in sense_data.get("tags", ())
982 )
983 and "alt_of" not in sense_data
984 and "form_of" not in sense_data
985 and "etymology_text" in etym_data
986 and etym_data["etymology_text"] != ""
987 ):
988 etym = etym_data["etymology_text"]
989 etym = etym.split(". ")[0]
990 ret = parse_alt_or_inflection_of(wxr, etym, set())
991 if ret is not None:
992 tags, lst = ret
993 assert isinstance(lst, (list, tuple))
994 if "form-of" in tags:
995 data_extend(sense_data, "form_of", lst)
996 data_extend(sense_data, "tags", tags)
997 elif "alt-of" in tags:
998 data_extend(sense_data, "alt_of", lst)
999 data_extend(sense_data, "tags", tags)
1001 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1001 ↛ 1004line 1001 didn't jump to line 1004 because the condition on line 1001 was never true
1002 "tags", ()
1003 ):
1004 data_append(sense_data, "tags", "no-gloss")
1006 pos_datas.append(sense_data)
1007 sense_data = {}
1008 return True
1010 def push_pos() -> None:
1011 """Starts collecting data for a new part-of-speech."""
1012 nonlocal pos_data
1013 nonlocal pos_datas
1014 push_sense()
1015 if wxr.wtp.subsection:
1016 data: WordData = {"senses": pos_datas}
1017 merge_base(data, pos_data)
1018 level_four_datas.append(data)
1019 pos_data = {}
1020 pos_datas = []
1021 wxr.wtp.start_subsection(None)
1023 def push_level_four_section() -> None:
1024 """Starts collecting data for a new level four sections, which
1025 is usually virtual and empty, unless the article has Chinese
1026 'Pronunciation' sections that are etymology-section-like but
1027 under etymology, and at the same level in the source. We modify
1028 the source to demote Pronunciation sections like that to level
1029 4, and other sections one step lower."""
1030 nonlocal level_four_data
1031 nonlocal level_four_datas
1032 nonlocal etym_datas
1033 push_pos()
1034 # print(f"======\n{etym_data=}")
1035 # print(f"======\n{etym_datas=}")
1036 # print(f"======\n{level_four_data=}")
1037 # print(f"======\n{level_four_datas=}")
1038 for data in level_four_datas:
1039 merge_base(data, level_four_data)
1040 etym_datas.append(data)
1041 for data in etym_datas:
1042 merge_base(data, etym_data)
1043 page_datas.append(data)
1044 level_four_data = {}
1045 level_four_datas = []
1046 etym_datas = []
1048 def push_etym() -> None:
1049 """Starts collecting data for a new etymology."""
1050 nonlocal etym_data
1051 nonlocal etym_datas
1052 nonlocal have_etym
1053 nonlocal inside_level_four
1054 have_etym = True
1055 push_level_four_section()
1056 inside_level_four = False
1057 etym_data = {}
1059 def select_data() -> WordData:
1060 """Selects where to store data (pos or etym) based on whether we
1061 are inside a pos (part-of-speech)."""
1062 # print(f"{wxr.wtp.subsection=}")
1063 # print(f"{stack=}")
1064 if wxr.wtp.subsection is not None: 1064 ↛ 1066line 1064 didn't jump to line 1066 because the condition on line 1064 was always true
1065 return pos_data
1066 if stack[-1] == language:
1067 return base_data
1068 if inside_level_four is False:
1069 return etym_data
1070 return level_four_data
1072 def head_post_template_fn(
1073 name: str, ht: TemplateArgs, expansion: str
1074 ) -> Optional[str]:
1075 """Handles special templates in the head section of a word. Head
1076 section is the text after part-of-speech subtitle and before word
1077 sense list. Typically it generates the bold line for the word, but
1078 may also contain other useful information that often ends in
1079 side boxes. We want to capture some of that additional information."""
1080 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1081 if is_panel_template(wxr, name): 1081 ↛ 1084line 1081 didn't jump to line 1084 because the condition on line 1081 was never true
1082 # Completely ignore these templates (not even recorded in
1083 # head_templates)
1084 return ""
1085 if name == "head":
1086 # XXX are these also captured in forms? Should this special case
1087 # be removed?
1088 t = ht.get(2, "")
1089 if t == "pinyin": 1089 ↛ 1090line 1089 didn't jump to line 1090 because the condition on line 1089 was never true
1090 data_append(pos_data, "tags", "Pinyin")
1091 elif t == "romanization": 1091 ↛ 1092line 1091 didn't jump to line 1092 because the condition on line 1091 was never true
1092 data_append(pos_data, "tags", "romanization")
1093 if HEAD_TAG_RE.fullmatch(name) is not None: 1093 ↛ 1102line 1093 didn't jump to line 1102 because the condition on line 1093 was always true
1094 args_ht = clean_template_args(wxr, ht)
1095 cleaned_expansion = clean_node(wxr, None, expansion)
1096 dt = {"name": name, "args": args_ht, "expansion": cleaned_expansion}
1097 data_append(pos_data, "head_templates", dt)
1099 # The following are both captured in head_templates and parsed
1100 # separately
1102 if name in wikipedia_templates: 1102 ↛ 1105line 1102 didn't jump to line 1105 because the condition on line 1102 was never true
1103 # Note: various places expect to have content from wikipedia
1104 # templates, so cannot convert this to empty
1105 parse_wikipedia_template(wxr, pos_data, ht)
1106 return None
1108 if name == "number box": 1108 ↛ 1110line 1108 didn't jump to line 1110 because the condition on line 1108 was never true
1109 # XXX extract numeric value?
1110 return ""
1111 if name == "enum": 1111 ↛ 1113line 1111 didn't jump to line 1113 because the condition on line 1111 was never true
1112 # XXX extract?
1113 return ""
1114 if name == "cardinalbox": 1114 ↛ 1117line 1114 didn't jump to line 1117 because the condition on line 1114 was never true
1115 # XXX extract similar to enum?
1116 # XXX this can also occur in top-level under language
1117 return ""
1118 if name == "Han simplified forms": 1118 ↛ 1120line 1118 didn't jump to line 1120 because the condition on line 1118 was never true
1119 # XXX extract?
1120 return ""
1121 # if name == "ja-kanji forms":
1122 # # XXX extract?
1123 # return ""
1124 # if name == "vi-readings":
1125 # # XXX extract?
1126 # return ""
1127 # if name == "ja-kanji":
1128 # # XXX extract?
1129 # return ""
1130 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 1130 ↛ 1132line 1130 didn't jump to line 1132 because the condition on line 1130 was never true
1131 # XXX extract?
1132 return ""
1134 return None
def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parses the subsection for a part-of-speech under a language on
    a page.

    Splits the section into parallel `pre`/`lists` pairs: `pre[i]` holds
    the "head" material (head templates and surrounding text) for the
    i:th head in this PoS section, and `lists[i]` holds the gloss lists
    that belong to that head. Closure state of the enclosing function
    (pos_data, sense_data, pos_datas, push_sense, wxr, word, language)
    is read and mutated here.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    # print("parse_part_of_speech", pos)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    # XXX bookmark
    # print(posnode.children)

    # Pull floating inflection-table templates out of the PoS body so
    # they don't pollute head/gloss parsing; feed them to the
    # inflection parser under a synthetic "Inflection" level-6 node.
    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and x.kind == NodeKind.TEMPLATE
            and x.largs[0][0] in FLOATING_TABLE_TEMPLATES
        ),
    )
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)
    # print(poschildren)
    # XXX new above

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    for node in poschildren:
        if isinstance(node, str):
            # Walk runs of newlines vs. runs of other text; a blank
            # line ("\n\n") ends the first paragraph (head area).
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                if p.startswith("\n\n") and pre:
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            if len(node.largs[0]) >= 1 and isinstance(
                node.largs[0][0], str
            ):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].extend(node.largs[-1])
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                # A <br> between heads starts a new pre/lists pair.
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.
            # There's head_tag_re that seems like a regex meant
            # to identify head templates. Too bad it's None.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # skip these templates; panel_templates is already used
            # to skip certain templates else, but it also applies to
            # head parsing quite well.
            # node.largs[0][0] should always be str, but can't type-check
            # that.
            if is_panel_template(wxr, node.template_name):
                continue
            # skip these templates
            # if node.largs[0][0] in skip_these_templates_in_head:
            #     first_head_tmplt = False # no first_head_tmplt at all
            #     start_of_paragraph = False
            #     continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                # A template at the start of a new paragraph after a
                # non-empty head begins a new head.
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not. Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists wiktextract issue #314
    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if pre1 and not ls:
            # remember a head that (so far) has no list, so a later
            # list-only pair can be re-attached to it
            pairless_pre_index = len(cleaned_pre)
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1:
            if pairless_pre_index is not None:
                cleaned_lists[pairless_pre_index] = ls
                pairless_pre_index = None
                continue
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []

    if not any(g for g in lists):
        process_gloss_without_list(poschildren, pos, pos_data, header_tags)
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            # if len(ls) == 0:
            #     # don't have gloss list
            #     # XXX add code here to filter out 'garbage', like text
            #     # that isn't a head template or head.
            #     continue
            if all(not sl for sl in lists[i:]):
                # NOTE(review): `node` below is the leftover loop
                # variable from the poschildren loop above (the last
                # node seen there), used only for debug output.
                if i == 0:
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "first head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1689/20221215",
                        )
                    if isinstance(node, WikiNode):
                        if node.largs and node.largs[0][0] in [
                            "Han char",
                        ]:
                            # just ignore these templates
                            pass
                        else:
                            wxr.wtp.debug(
                                "first head without "
                                "list of senses, "
                                "template node "
                                "{}, {}/{}".format(
                                    node.largs, word, language
                                ),
                                sortid="page/1694/20221215",
                            )
                    else:
                        wxr.wtp.debug(
                            "first head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1700/20221215",
                        )
                    # no break here so that the first head always
                    # gets processed.
                else:
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1708/20221215",
                        )
                    if isinstance(node, WikiNode):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "template node "
                            "{}, {}/{}".format(
                                node.sarg if node.sarg else node.largs,
                                word,
                                language,
                            ),
                            sortid="page/1713/20221215",
                        )
                    else:
                        wxr.wtp.debug(
                            "later head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1719/20221215",
                        )
                    break
            head_group = i + 1 if there_are_many_heads else None
            # print("parse_part_of_speech: {}: {}: pre={}"
            #       .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags
            )
            for ln in ls:
                # Parse each list associated with this head.
                for node in ln.children:
                    # Parse nodes in l.children recursively.
                    # The recursion function uses push_sense() to
                    # add stuff into pos_data, and returns True or
                    # False if something is added, which bubbles upward.
                    # If the bubble is "True", then higher levels of
                    # the recursion will not push_sense(), because
                    # the data is already pushed into a sub-gloss
                    # downstream, unless the higher level has examples
                    # that need to be put somewhere.
                    common_data: SenseData = {"tags": list(header_tags)}
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]

    # If there are no senses extracted, add a dummy sense. We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(pos_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()
def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
) -> None:
    """Process the head line of a part-of-speech section.

    Extracts info-template data, link texts (for words containing
    non-alphanumeric characters) and ruby annotations (Japanese), then
    cleans the head nodes to text and hands it to parse_word_head().
    Any "tags" that end up in pos_data are relocated into header_tags;
    otherwise header_tags is cleared.
    """
    ruby = []
    links: list[str] = []

    # Separate "info" templates from the rest of the head nodes;
    # whatever output they leave behind stays in the node stream.
    collected_info = []
    remaining_nodes = []
    for hn in header_nodes:
        info_data, info_out = parse_info_template_node(wxr, hn, "head")
        if not info_data and not info_out:
            remaining_nodes.append(hn)
            continue
        if info_data:
            collected_info.append(info_data)
        if info_out:  # including just the original node
            remaining_nodes.append(info_out)
    header_nodes = remaining_nodes

    if collected_info:
        if "info_templates" in pos_data:
            pos_data["info_templates"].extend(collected_info)
        else:
            pos_data["info_templates"] = collected_info

    if not word.isalnum():
        # Words containing non-letter/non-number characters may trip up
        # split-at-semi-comma later; collect link texts so splitting
        # can avoid breaking them apart.
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        link_nodes, _ = recursively_extract(
            expanded.children,
            lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.LINK,
        )
        for lnode in link_nodes:
            ltext = clean_node(wxr, None, lnode.largs[-1])  # type: ignore[union-attr]
            if not ltext.isalnum():
                links.append(ltext)
        if word not in links:
            links.append(word)

    if lang_code == "ja":
        # Collect <ruby> (furigana) elements from the expanded head.
        expanded = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            expanded.children,
            lambda x: isinstance(x, WikiNode)
            and x.kind == NodeKind.HTML
            and x.sarg == "ruby",
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # the predicate passed to recursively_extract above
                    # only lets WikiNodes through
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)

    header_text = clean_node(
        wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
    )
    header_text = re.sub(r"\s+", " ", header_text)
    # print(f"{header_text=}")
    parse_word_head(
        wxr,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        ruby=ruby,
        links=links,
    )
    if "tags" in pos_data:
        # Tags may have been stored in pos_data during head parsing;
        # move them to header_tags so they apply to following senses.
        # The type-checker dislikes this key, hence the ignores.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    else:
        header_tags.clear()
def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
) -> None:
    """Handle a PoS section whose gloss text is not inside a list.

    Splits `nodes` into head-line nodes and gloss nodes, then runs
    the normal header and gloss processing on each group.
    """
    heads: list[Union[str, WikiNode]] = []
    glosses: list[Union[str, WikiNode]] = []
    for item in strip_nodes(nodes):
        if isinstance(item, TemplateNode):
            tname = item.template_name
            if tname in ("zh-see", "ja-see", "ja-see-kango"):
                # soft redirect
                continue
            if tname == "head" or tname.startswith(f"{lang_code}-"):
                heads.append(item)
                continue
        elif isinstance(item, WikiNode) and item.kind in LEVEL_KINDS:
            # a new section header starts; following nodes are not gloss
            break
        glosses.append(item)

    if heads:
        process_gloss_header(heads, pos_type, None, pos_data, header_tags)
    if glosses:
        process_gloss_contents(
            glosses, pos_type, {"tags": list(header_tags)}
        )
def parse_sense_node(
    node: Union[str, WikiNode],  # never receives str
    sense_base: SenseData,
    pos: str,
) -> bool:
    """Recursively (depth first) parse LIST_ITEM nodes for sense data.
    Uses push_sense() to attempt adding data to pos_data in the scope
    of parse_language() when it reaches deep in the recursion. push_sense()
    returns True if it succeeds, and that is bubbled up the stack; if
    a sense was added downstream, the higher levels (whose shared data
    was already added by a subsense) do not push_sense(), unless it
    has examples that need to be put somewhere.
    """
    assert isinstance(sense_base, dict)  # Added to every sense deeper in
    if not isinstance(node, WikiNode):
        # This doesn't seem to ever happen in practice.
        wxr.wtp.debug(
            "{}: parse_sense_node called with"
            "something that isn't a WikiNode".format(pos),
            sortid="page/1287/20230119",
        )
        return False

    if node.kind != NodeKind.LIST_ITEM:
        wxr.wtp.debug(
            "{}: non-list-item inside list".format(pos), sortid="page/1678"
        )
        return False

    if node.sarg == ":":
        # Skip example entries at the highest level, ones without
        # a sense ("...#") above them.
        # If node.sarg is exactly and only ":", then it's at
        # the highest level; lower levels would have more
        # "indentation", like "#:" or "##:"
        return False

    # If a recursion call succeeds in push_sense(), bubble it up with
    # `added`.
    # added |= push_sense() or added |= parse_sense_node(...) to OR.
    added = False

    # Link texts / template arguments seen in the gloss; collected by
    # process_gloss_contents() and used downstream for form_of/alt_of.
    gloss_template_args: set[str] = set()

    # For LISTs and LIST_ITEMS, their argument is something like
    # "##" or "##:", and using that we can rudimentally determine
    # list 'depth' if need be, and also what kind of list or
    # entry it is; # is for normal glosses, : for examples (indent)
    # and * is used for quotations on wiktionary.
    current_depth = node.sarg

    children = node.children

    # subentries, (presumably) a list
    # of subglosses below this. The list's
    # argument ends with #, and its depth should
    # be bigger than parent node.
    subentries = [
        x
        for x in children
        if isinstance(x, WikiNode)
        and x.kind == NodeKind.LIST
        and x.sarg == current_depth + "#"
    ]

    # sublists of examples and quotations. .sarg
    # does not end with "#".
    others = [
        x
        for x in children
        if isinstance(x, WikiNode)
        and x.kind == NodeKind.LIST
        and x.sarg != current_depth + "#"
    ]

    # the actual contents of this particular node.
    # can be a gloss (or a template that expands into
    # many glosses which we can't easily pre-expand)
    # or could be an "outer gloss" with more specific
    # subglosses, or could be a qualfier for the subglosses.
    contents = [
        x
        for x in children
        if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
    ]
    # If this entry has sublists of entries, we should combine
    # gloss information from both the "outer" and sublist content.
    # Sometimes the outer gloss
    # is more non-gloss or tags, sometimes it is a coarse sense
    # and the inner glosses are more specific. The outer one
    # does not seem to have qualifiers.

    # If we have one sublist with one element, treat it
    # specially as it may be a Wiktionary error; raise
    # that nested element to the same level.
    # XXX If need be, this block can be easily removed in
    # the current recursive logicand the result is one sense entry
    # with both glosses in the glosses list, as you would
    # expect. If the higher entry has examples, there will
    # be a higher entry with some duplicated data.
    if len(subentries) == 1:
        slc = subentries[0].children
        if len(slc) == 1:
            # copy current node and modify it so it doesn't
            # loop infinitely.
            # (shallow copy: only `children` is rebound below)
            cropped_node = copy.copy(node)
            cropped_node.children = [
                x
                for x in children
                if not (
                    isinstance(x, WikiNode)
                    and x.kind == NodeKind.LIST
                    and x.sarg == current_depth + "#"
                )
            ]
            added |= parse_sense_node(cropped_node, sense_base, pos)
            nonlocal sense_data  # this kludge causes duplicated raw_
            # glosses data if this is not done;
            # if the top-level (cropped_node)
            # does not push_sense() properly or
            # parse_sense_node() returns early,
            # sense_data is not reset. This happens
            # for example when you have a no-gloss
            # string like "(intransitive)":
            # no gloss, push_sense() returns early
            # and sense_data has duplicate data with
            # sense_base
            sense_data = {}
            added |= parse_sense_node(slc[0], sense_base, pos)
            return added

    return process_gloss_contents(
        contents,
        pos,
        sense_base,
        subentries,
        others,
        gloss_template_args,
        added,
    )
1701 def process_gloss_contents(
1702 contents: list[Union[str, WikiNode]],
1703 pos: str,
1704 sense_base: SenseData,
1705 subentries: list[WikiNode] = [],
1706 others: list[WikiNode] = [],
1707 gloss_template_args: Set[str] = set(),
1708 added: bool = False,
1709 ) -> bool:
1710 def sense_template_fn(
1711 name: str, ht: TemplateArgs, is_gloss: bool = False
1712 ) -> Optional[str]:
1713 # print(f"sense_template_fn: {name}, {ht}")
1714 if name in wikipedia_templates: 1714 ↛ 1716line 1714 didn't jump to line 1716 because the condition on line 1714 was never true
1715 # parse_wikipedia_template(wxr, pos_data, ht)
1716 return None
1717 if is_panel_template(wxr, name): 1717 ↛ 1718line 1717 didn't jump to line 1718 because the condition on line 1717 was never true
1718 return ""
1719 if name in INFO_TEMPLATE_FUNCS:
1720 info_data, info_exp = parse_info_template_arguments(
1721 wxr, name, ht, "sense"
1722 )
1723 if info_data or info_exp: 1723 ↛ 1729line 1723 didn't jump to line 1729 because the condition on line 1723 was always true
1724 if info_data: 1724 ↛ 1726line 1724 didn't jump to line 1726 because the condition on line 1724 was always true
1725 data_append(sense_base, "info_templates", info_data)
1726 if info_exp and isinstance(info_exp, str): 1726 ↛ 1728line 1726 didn't jump to line 1728 because the condition on line 1726 was always true
1727 return info_exp
1728 return ""
1729 if name in ("defdate",): 1729 ↛ 1730line 1729 didn't jump to line 1730 because the condition on line 1729 was never true
1730 return ""
1731 if name == "senseid": 1731 ↛ 1732line 1731 didn't jump to line 1732 because the condition on line 1731 was never true
1732 langid = clean_node(wxr, None, ht.get(1, ()))
1733 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1734 if re.match(r"Q\d+$", arg):
1735 data_append(sense_base, "wikidata", arg)
1736 data_append(sense_base, "senseid", langid + ":" + arg)
1737 if name in sense_linkage_templates: 1737 ↛ 1739line 1737 didn't jump to line 1739 because the condition on line 1737 was never true
1738 # print(f"SENSE_TEMPLATE_FN: {name}")
1739 parse_sense_linkage(wxr, sense_base, name, ht)
1740 return ""
1741 if name == "†" or name == "zh-obsolete": 1741 ↛ 1742line 1741 didn't jump to line 1742 because the condition on line 1741 was never true
1742 data_append(sense_base, "tags", "obsolete")
1743 return ""
1744 if name in {
1745 "ux",
1746 "uxi",
1747 "usex",
1748 "afex",
1749 "prefixusex",
1750 "ko-usex",
1751 "ko-x",
1752 "hi-x",
1753 "ja-usex-inline",
1754 "ja-x",
1755 "quotei",
1756 "he-x",
1757 "hi-x",
1758 "km-x",
1759 "ne-x",
1760 "shn-x",
1761 "th-x",
1762 "ur-x",
1763 }:
1764 # Usage examples are captured separately below. We don't
1765 # want to expand them into glosses even when unusual coding
1766 # is used in the entry.
1767 # These templates may slip through inside another item, but
1768 # currently we're separating out example entries (..#:)
1769 # well enough that there seems to very little contamination.
1770 if is_gloss: 1770 ↛ 1776line 1770 didn't jump to line 1776 because the condition on line 1770 was always true
1771 wxr.wtp.warning(
1772 "Example template is used for gloss text",
1773 sortid="extractor.en.page.sense_template_fn/1415",
1774 )
1775 else:
1776 return ""
1777 if name == "w": 1777 ↛ 1778line 1777 didn't jump to line 1778 because the condition on line 1777 was never true
1778 if ht.get(2) == "Wp":
1779 return ""
1780 for k, v in ht.items():
1781 v = v.strip()
1782 if v and "<" not in v: 1782 ↛ 1780line 1782 didn't jump to line 1780 because the condition on line 1782 was always true
1783 gloss_template_args.add(v)
1784 return None
1786 def extract_link_texts(item: GeneralNode) -> None:
1787 """Recursively extracts link texts from the gloss source. This
1788 information is used to select whether to remove final "." from
1789 form_of/alt_of (e.g., ihm/Hunsrik)."""
1790 if isinstance(item, (list, tuple)):
1791 for x in item:
1792 extract_link_texts(x)
1793 return
1794 if isinstance(item, str):
1795 # There seem to be HTML sections that may futher contain
1796 # unparsed links.
1797 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1797 ↛ 1798line 1797 didn't jump to line 1798 because the loop on line 1797 never started
1798 print("ITER:", m.group(0))
1799 v = m.group(1).split("|")[-1].strip()
1800 if v:
1801 gloss_template_args.add(v)
1802 return
1803 if not isinstance(item, WikiNode): 1803 ↛ 1804line 1803 didn't jump to line 1804 because the condition on line 1803 was never true
1804 return
1805 if item.kind == NodeKind.LINK:
1806 v = item.largs[-1]
1807 if ( 1807 ↛ 1813line 1807 didn't jump to line 1813
1808 isinstance(v, list)
1809 and len(v) == 1
1810 and isinstance(v[0], str)
1811 ):
1812 gloss_template_args.add(v[0].strip())
1813 for x in item.children:
1814 extract_link_texts(x)
1816 extract_link_texts(contents)
1818 # get the raw text of non-list contents of this node, and other stuff
1819 # like tag and category data added to sense_base
1820 # cast = no-op type-setter for the type-checker
1821 partial_template_fn = cast(
1822 TemplateFnCallable,
1823 partial(sense_template_fn, is_gloss=True),
1824 )
1825 rawgloss = clean_node(
1826 wxr,
1827 sense_base,
1828 contents,
1829 template_fn=partial_template_fn,
1830 collect_links=True,
1831 )
1833 if not rawgloss:
1834 return False
1836 # remove manually typed ordered list text at the start("1. ")
1837 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
1839 # get stuff like synonyms and categories from "others",
1840 # maybe examples and quotations
1841 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
1843 # The gloss could contain templates that produce more list items.
1844 # This happens commonly with, e.g., {{inflection of|...}}. Split
1845 # to parts. However, e.g. Interlingua generates multiple glosses
1846 # in HTML directly without Wikitext markup, so we must also split
1847 # by just newlines.
1848 subglosses = rawgloss.splitlines()
1850 if len(subglosses) == 0: 1850 ↛ 1851line 1850 didn't jump to line 1851 because the condition on line 1850 was never true
1851 return False
1853 if any(s.startswith("#") for s in subglosses):
1854 subtree = wxr.wtp.parse(rawgloss)
1855 # from wikitextprocessor.parser import print_tree
1856 # print("SUBTREE GENERATED BY TEMPLATE:")
1857 # print_tree(subtree)
1858 new_subentries = [
1859 x
1860 for x in subtree.children
1861 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
1862 ]
1864 new_others = [
1865 x
1866 for x in subtree.children
1867 if isinstance(x, WikiNode)
1868 and x.kind == NodeKind.LIST
1869 and not x.sarg.endswith("#")
1870 ]
1872 new_contents = [
1873 clean_node(wxr, [], x)
1874 for x in subtree.children
1875 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1876 ]
1878 subentries = subentries or new_subentries
1879 others = others or new_others
1880 subglosses = new_contents
1881 rawgloss = "".join(subglosses)
1882 # Generate no gloss for translation hub pages, but add the
1883 # "translation-hub" tag for them
1884 if rawgloss == "(This entry is a translation hub.)": 1884 ↛ 1885line 1884 didn't jump to line 1885 because the condition on line 1884 was never true
1885 data_append(sense_data, "tags", "translation-hub")
1886 return push_sense()
1888 # Remove certain substrings specific to outer glosses
1889 strip_ends = [", particularly:"]
1890 for x in strip_ends:
1891 if rawgloss.endswith(x): 1891 ↛ 1892line 1891 didn't jump to line 1892 because the condition on line 1891 was never true
1892 rawgloss = rawgloss[: -len(x)].strip()
1893 break
1895 # A single gloss, or possibly an outer gloss.
1896 # Check if the possible outer gloss starts with
1897 # parenthesized tags/topics
1899 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 1899 ↛ 1901line 1899 didn't jump to line 1901 because the condition on line 1899 was always true
1900 data_append(sense_base, "raw_glosses", subglosses[0].strip())
1901 m = QUALIFIERS_RE.match(rawgloss)
1902 # (...): ... or (...(...)...): ...
1903 if m:
1904 q = m.group(1)
1905 rawgloss = rawgloss[m.end() :].strip()
1906 parse_sense_qualifier(wxr, q, sense_base)
1907 if rawgloss == "A pejorative:": 1907 ↛ 1908line 1907 didn't jump to line 1908 because the condition on line 1907 was never true
1908 data_append(sense_base, "tags", "pejorative")
1909 rawgloss = ""
1910 elif rawgloss == "Short forms.": 1910 ↛ 1911line 1910 didn't jump to line 1911 because the condition on line 1910 was never true
1911 data_append(sense_base, "tags", "abbreviation")
1912 rawgloss = ""
1913 elif rawgloss == "Technical or specialized senses.": 1913 ↛ 1914line 1913 didn't jump to line 1914 because the condition on line 1913 was never true
1914 rawgloss = ""
1915 elif rawgloss.startswith("inflection of "):
1916 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
1917 if parsed is not None: 1917 ↛ 1926line 1917 didn't jump to line 1926 because the condition on line 1917 was always true
1918 tags, origins = parsed
1919 if origins is not None: 1919 ↛ 1921line 1919 didn't jump to line 1921 because the condition on line 1919 was always true
1920 data_extend(sense_base, "form_of", origins)
1921 if tags is not None: 1921 ↛ 1924line 1921 didn't jump to line 1924 because the condition on line 1921 was always true
1922 data_extend(sense_base, "tags", tags)
1923 else:
1924 data_append(sense_base, "tags", "form-of")
1925 else:
1926 data_append(sense_base, "tags", "form-of")
1927 if rawgloss: 1927 ↛ 1958line 1927 didn't jump to line 1958 because the condition on line 1927 was always true
1928 # Code duplicating a lot of clean-up operations from later in
1929 # this block. We want to clean up the "supergloss" as much as
1930 # possible, in almost the same way as a normal gloss.
1931 supergloss = rawgloss
1933 if supergloss.startswith("; "): 1933 ↛ 1934line 1933 didn't jump to line 1934 because the condition on line 1933 was never true
1934 supergloss = supergloss[1:].strip()
1936 if supergloss.startswith(("^†", "†")):
1937 data_append(sense_base, "tags", "obsolete")
1938 supergloss = supergloss[2:].strip()
1939 elif supergloss.startswith("^‡"): 1939 ↛ 1940line 1939 didn't jump to line 1940 because the condition on line 1939 was never true
1940 data_extend(sense_base, "tags", ["obsolete", "historical"])
1941 supergloss = supergloss[2:].strip()
1943 # remove [14th century...] style brackets at the end
1944 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
1946 if supergloss.startswith((",", ":")): 1946 ↛ 1947line 1946 didn't jump to line 1947 because the condition on line 1946 was never true
1947 supergloss = supergloss[1:]
1948 supergloss = supergloss.strip()
1949 if supergloss.startswith("N. of "): 1949 ↛ 1950line 1949 didn't jump to line 1950 because the condition on line 1949 was never true
1950 supergloss = "Name of " + supergloss[6:]
1951 supergloss = supergloss[2:]
1952 data_append(sense_base, "glosses", supergloss)
1953 if supergloss in ("A person:",): 1953 ↛ 1954line 1953 didn't jump to line 1954 because the condition on line 1953 was never true
1954 data_append(sense_base, "tags", "g-person")
1956 # The main recursive call (except for the exceptions at the
1957 # start of this function).
1958 for sublist in subentries:
1959 if not ( 1959 ↛ 1962line 1959 didn't jump to line 1962 because the condition on line 1959 was never true
1960 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
1961 ):
1962 wxr.wtp.debug(
1963 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
1964 f"with items that are not LISTs",
1965 sortid="page/1511/20230119",
1966 )
1967 continue
1968 for item in sublist.children:
1969 if not ( 1969 ↛ 1973line 1969 didn't jump to line 1973 because the condition on line 1969 was never true
1970 isinstance(item, WikiNode)
1971 and item.kind == NodeKind.LIST_ITEM
1972 ):
1973 continue
1974 # copy sense_base to prevent cross-contamination between
1975 # subglosses and other subglosses and superglosses
1976 sense_base2 = copy.deepcopy(sense_base)
1977 if parse_sense_node(item, sense_base2, pos): 1977 ↛ 1968line 1977 didn't jump to line 1968 because the condition on line 1977 was always true
1978 added = True
1980 # Capture examples.
1981 # This is called after the recursive calls above so that
1982 # sense_base is not contaminated with meta-data from
1983 # example entries for *this* gloss.
1984 examples = []
1985 if wxr.config.capture_examples: 1985 ↛ 1989line 1985 didn't jump to line 1989 because the condition on line 1985 was always true
1986 examples = extract_examples(others, sense_base)
1988 # push_sense() succeeded somewhere down-river, so skip this level
1989 if added:
1990 if examples:
1991 # this higher-up gloss has examples that we do not want to skip
1992 wxr.wtp.debug(
1993 "'{}[...]' gloss has examples we want to keep, "
1994 "but there are subglosses.".format(repr(rawgloss[:30])),
1995 sortid="page/1498/20230118",
1996 )
1997 else:
1998 return True
2000 # Some entries, e.g., "iacebam", have weird sentences in quotes
2001 # after the gloss, but these sentences don't seem to be intended
2002 # as glosses. Skip them.
2003 indexed_subglosses = list(
2004 (i, gl)
2005 for i, gl in enumerate(subglosses)
2006 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2007 )
2009 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2009 ↛ 2010line 2009 didn't jump to line 2010 because the condition on line 2009 was never true
2010 gl = indexed_subglosses[0][1].strip()
2011 if gl.endswith(":"):
2012 gl = gl[:-1].strip()
2013 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2014 if parsed is not None:
2015 infl_tags, infl_dts = parsed
2016 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2017 # Interpret others as a particular form under
2018 # "inflection of"
2019 data_extend(sense_base, "tags", infl_tags)
2020 data_extend(sense_base, "form_of", infl_dts)
2021 indexed_subglosses = indexed_subglosses[1:]
2022 elif not infl_dts:
2023 data_extend(sense_base, "tags", infl_tags)
2024 indexed_subglosses = indexed_subglosses[1:]
2026 # Create senses for remaining subglosses
2027 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2028 gloss = gloss.strip()
2029 if not gloss and len(indexed_subglosses) > 1: 2029 ↛ 2030line 2029 didn't jump to line 2030 because the condition on line 2029 was never true
2030 continue
2031 # Push a new sense (if the last one is not empty)
2032 if push_sense(): 2032 ↛ 2033line 2032 didn't jump to line 2033 because the condition on line 2032 was never true
2033 added = True
2034 # if gloss not in sense_data.get("raw_glosses", ()):
2035 # data_append(sense_data, "raw_glosses", gloss)
2036 if i == 0 and examples:
2037 # In a multi-line gloss, associate examples
2038 # with only one of them.
2039 # XXX or you could use gloss_i == len(indexed_subglosses)
2040 # to associate examples with the *last* one.
2041 data_extend(sense_data, "examples", examples)
2042 if gloss.startswith("; ") and gloss_i > 0: 2042 ↛ 2043line 2042 didn't jump to line 2043 because the condition on line 2042 was never true
2043 gloss = gloss[1:].strip()
2044 # If the gloss starts with †, mark as obsolete
2045 if gloss.startswith("^†"): 2045 ↛ 2046line 2045 didn't jump to line 2046 because the condition on line 2045 was never true
2046 data_append(sense_data, "tags", "obsolete")
2047 gloss = gloss[2:].strip()
2048 elif gloss.startswith("^‡"): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the condition on line 2048 was never true
2049 data_extend(sense_data, "tags", ["obsolete", "historical"])
2050 gloss = gloss[2:].strip()
2051 # Copy data for all senses to this sense
2052 for k, v in sense_base.items():
2053 if isinstance(v, (list, tuple)):
2054 if k != "tags":
2055 # Tags handled below (countable/uncountable special)
2056 data_extend(sense_data, k, v)
2057 else:
2058 assert k not in ("tags", "categories", "topics")
2059 sense_data[k] = v # type:ignore[literal-required]
2060 # Parse the gloss for this particular sense
2061 m = QUALIFIERS_RE.match(gloss)
2062 # (...): ... or (...(...)...): ...
2063 if m:
2064 parse_sense_qualifier(wxr, m.group(1), sense_data)
2065 gloss = gloss[m.end() :].strip()
2067 # Remove common suffix "[from 14th c.]" and similar
2068 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2070 # Check to make sure we don't have unhandled list items in gloss
2071 ofs = max(gloss.find("#"), gloss.find("* "))
2072 if ofs > 10 and "(#)" not in gloss: 2072 ↛ 2073line 2072 didn't jump to line 2073 because the condition on line 2072 was never true
2073 wxr.wtp.debug(
2074 "gloss may contain unhandled list items: {}".format(gloss),
2075 sortid="page/1412",
2076 )
2077 elif "\n" in gloss: 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true
2078 wxr.wtp.debug(
2079 "gloss contains newline: {}".format(gloss),
2080 sortid="page/1416",
2081 )
2083 # Kludge, some glosses have a comma after initial qualifiers in
2084 # parentheses
2085 if gloss.startswith((",", ":")): 2085 ↛ 2086line 2085 didn't jump to line 2086 because the condition on line 2085 was never true
2086 gloss = gloss[1:]
2087 gloss = gloss.strip()
2088 if gloss.endswith(":"): 2088 ↛ 2089line 2088 didn't jump to line 2089 because the condition on line 2088 was never true
2089 gloss = gloss[:-1].strip()
2090 if gloss.startswith("N. of "): 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true
2091 gloss = "Name of " + gloss[6:]
2092 if gloss.startswith("†"): 2092 ↛ 2093line 2092 didn't jump to line 2093 because the condition on line 2092 was never true
2093 data_append(sense_data, "tags", "obsolete")
2094 gloss = gloss[1:]
2095 elif gloss.startswith("^†"): 2095 ↛ 2096line 2095 didn't jump to line 2096 because the condition on line 2095 was never true
2096 data_append(sense_data, "tags", "obsolete")
2097 gloss = gloss[2:]
2099 # Copy tags from sense_base if any. This will not copy
2100 # countable/uncountable if either was specified in the sense,
2101 # as sometimes both are specified in word head but only one
2102 # in individual senses.
2103 countability_tags = []
2104 base_tags = sense_base.get("tags", ())
2105 sense_tags = sense_data.get("tags", ())
2106 for tag in base_tags:
2107 if tag in ("countable", "uncountable"): 2107 ↛ 2108line 2107 didn't jump to line 2108 because the condition on line 2107 was never true
2108 if tag not in countability_tags:
2109 countability_tags.append(tag)
2110 continue
2111 if tag not in sense_tags:
2112 data_append(sense_data, "tags", tag)
2113 if countability_tags: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true
2114 if (
2115 "countable" not in sense_tags
2116 and "uncountable" not in sense_tags
2117 ):
2118 data_extend(sense_data, "tags", countability_tags)
2120 # If outer gloss specifies a form-of ("inflection of", see
2121 # aquamarine/German), try to parse the inner glosses as
2122 # tags for an inflected form.
2123 if "form-of" in sense_base.get("tags", ()):
2124 parsed = parse_alt_or_inflection_of(
2125 wxr, gloss, gloss_template_args
2126 )
2127 if parsed is not None: 2127 ↛ 2133line 2127 didn't jump to line 2133 because the condition on line 2127 was always true
2128 infl_tags, infl_dts = parsed
2129 if not infl_dts and infl_tags: 2129 ↛ 2133line 2129 didn't jump to line 2133 because the condition on line 2129 was always true
2130 # Interpret as a particular form under "inflection of"
2131 data_extend(sense_data, "tags", infl_tags)
2133 if not gloss: 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true
2134 data_append(sense_data, "tags", "empty-gloss")
2135 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true
2136 if (
2137 gloss_i == 0
2138 and len(sense_data.get("glosses", tuple())) >= 1
2139 ):
2140 # If we added a "high-level gloss" from rawgloss, but this
2141 # is that same gloss_i, add this instead of the raw_gloss
2142 # from before if they're different: the rawgloss was not
2143 # cleaned exactly the same as this later gloss
2144 sense_data["glosses"][-1] = gloss
2145 else:
2146 # Add the gloss for the sense.
2147 data_append(sense_data, "glosses", gloss)
2149 # Kludge: there are cases (e.g., etc./Swedish) where there are
2150 # two abbreviations in the same sense, both generated by the
2151 # {{abbreviation of|...}} template. Handle these with some magic.
2152 position = 0
2153 split_glosses = []
2154 for m in re.finditer(r"Abbreviation of ", gloss): 2154 ↛ 2155line 2154 didn't jump to line 2155 because the loop on line 2154 never started
2155 if m.start() != position:
2156 split_glosses.append(gloss[position : m.start()])
2157 position = m.start()
2158 split_glosses.append(gloss[position:])
2159 for gloss in split_glosses:
2160 # Check if this gloss describes an alt-of or inflection-of
2161 if (
2162 lang_code != "en"
2163 and " " not in gloss
2164 and distw([word], gloss) < 0.3
2165 ):
2166 # Don't try to parse gloss if it is one word
2167 # that is close to the word itself for non-English words
2168 # (probable translations of a tag/form name)
2169 continue
2170 parsed = parse_alt_or_inflection_of(
2171 wxr, gloss, gloss_template_args
2172 )
2173 if parsed is None:
2174 continue
2175 tags, dts = parsed
2176 if not dts and tags: 2176 ↛ 2179line 2176 didn't jump to line 2179 because the condition on line 2176 was always true
2177 data_extend(sense_data, "tags", tags)
2178 continue
2179 for dt in dts: # type:ignore[union-attr]
2180 ftags = list(tag for tag in tags if tag != "form-of")
2181 if "alt-of" in tags:
2182 data_extend(sense_data, "tags", ftags)
2183 data_append(sense_data, "alt_of", dt)
2184 elif "compound-of" in tags:
2185 data_extend(sense_data, "tags", ftags)
2186 data_append(sense_data, "compound_of", dt)
2187 elif "synonym-of" in tags:
2188 data_extend(dt, "tags", ftags)
2189 data_append(sense_data, "synonyms", dt)
2190 elif tags and dt.get("word", "").startswith("of "):
2191 dt["word"] = dt["word"][3:]
2192 data_append(sense_data, "tags", "form-of")
2193 data_extend(sense_data, "tags", ftags)
2194 data_append(sense_data, "form_of", dt)
2195 elif "form-of" in tags:
2196 data_extend(sense_data, "tags", tags)
2197 data_append(sense_data, "form_of", dt)
2199 if len(sense_data) == 0:
2200 if len(sense_base.get("tags", [])) == 0: 2200 ↛ 2202line 2200 didn't jump to line 2202 because the condition on line 2200 was always true
2201 del sense_base["tags"]
2202 sense_data.update(sense_base)
2203 if push_sense(): 2203 ↛ 2207line 2203 didn't jump to line 2207 because the condition on line 2203 was always true
2204 # push_sense succeded in adding a sense to pos_data
2205 added = True
2206 # print("PARSE_SENSE DONE:", pos_datas[-1])
2207 return added
    def parse_inflection(
        node: WikiNode, section: str, pos: Optional[str]
    ) -> None:
        """Parses inflection data (declension, conjugation) from the given
        page. This retrieves the actual inflection template
        parameters, which are very useful for applications that need
        to learn the inflection classes and generate inflected
        forms.

        ``node`` is the section subtree to process, ``section`` the section
        title (used only in error messages here), and ``pos`` the current
        part-of-speech; without a POS there is nothing to attach the data to,
        so the call is a no-op with a debug message."""
        assert isinstance(node, WikiNode)
        assert isinstance(section, str)
        assert pos is None or isinstance(pos, str)
        # print("parse_inflection:", node)

        if pos is None:
            wxr.wtp.debug(
                "inflection table outside part-of-speech", sortid="page/1812"
            )
            return

        def inflection_template_fn(
            name: str, ht: TemplateArgs
        ) -> Optional[str]:
            # Template callback: records inflection-looking templates into
            # pos_data["inflection_templates"] while letting expansion proceed
            # (returning None means "expand normally"; "" suppresses output).
            # print("decl_conj_template_fn", name, ht)
            if is_panel_template(wxr, name):
                return ""
            if name in ("is-u-mutation",):
                # These are not to be captured as an exception to the
                # generic code below
                return None
            # Heuristic: template names containing these suffix markers are
            # treated as inflection templates and captured with their args.
            m = re.search(
                r"-(conj|decl|ndecl|adecl|infl|conjugation|"
                r"declension|inflection|mut|mutation)($|-)",
                name,
            )
            if m:
                args_ht = clean_template_args(wxr, ht)
                dt = {"name": name, "args": args_ht}
                data_append(pos_data, "inflection_templates", dt)

            return None

        # Convert the subtree back to Wikitext, then expand all and parse,
        # capturing templates in the process
        text = wxr.wtp.node_to_wikitext(node.children)

        # Split text into separate sections for each top-level template
        brace_matches = re.split("({{+|}}+)", text)  # ["{{", "template", "}}"]
        template_sections = []
        template_nesting = 0  # depth of SINGLE BRACES { { nesting } }
        # Because there is the possibility of triple curly braces
        # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
        # count nesting depth using pairs of two brackets, but
        # instead use singular braces ("{ }").
        # Because template delimiters should be balanced, regardless
        # of whether {{ or {{{ is used, and because we only care
        # about the outer-most delimiters (the highest level template)
        # we can just count the single braces when those single
        # braces are part of a group.

        # print(text)
        # print(repr(brace_matches))
        if len(brace_matches) > 1:
            tsection: list[str] = []
            after_templates = False  # kludge to keep any text
            # before first template
            # with the first template;
            # otherwise, text
            # goes with preceding template
            for m in brace_matches:
                if m.startswith("{{"):
                    # A new top-level template starts only at nesting depth 0;
                    # anything deeper belongs to the current section.
                    if template_nesting == 0 and after_templates:
                        template_sections.append(tsection)
                        tsection = []
                        # start new section
                    after_templates = True
                    template_nesting += len(m)
                    tsection.append(m)
                elif m.startswith("}}"):
                    template_nesting -= len(m)
                    if template_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested braces, "
                            "couldn't split inflection templates, "
                            "{}/{} section {}".format(word, language, section),
                            sortid="page/1871",
                        )
                        template_sections = []  # use whole text
                        break
                    tsection.append(m)
                else:
                    tsection.append(m)
            if tsection:  # dangling tsection
                template_sections.append(tsection)
            # Why do it this way around? The parser has a preference
            # to associate bits outside of tables with the preceding
            # table (`after`-variable), so a new tsection begins
            # at {{ and everything before it belongs to the previous
            # template.

        texts = []
        if not template_sections:
            texts = [text]
        else:
            for tsection in template_sections:
                texts.append("".join(tsection))
        if template_nesting != 0:
            # Unbalanced braces: fall back to parsing the whole text at once.
            wxr.wtp.error(
                "Template nesting error: "
                "template_nesting = {} "
                "couldn't split inflection templates, "
                "{}/{} section {}".format(
                    template_nesting, word, language, section
                ),
                sortid="page/1896",
            )
            texts = [text]
        for text in texts:
            tree = wxr.wtp.parse(
                text, expand_all=True, template_fn=inflection_template_fn
            )

            # Parse inflection tables from the section. The data is stored
            # under "forms".
            if wxr.config.capture_inflections:
                tablecontext = None
                # The first template name in this chunk (if any) provides
                # context for interpreting the rendered table.
                m = re.search(r"{{([^}{|]+)\|?", text)
                if m:
                    template_name = m.group(1)
                    tablecontext = TableContext(template_name)

                parse_inflection_section(
                    wxr,
                    pos_data,
                    word,
                    language,
                    pos,
                    section,
                    tree,
                    tablecontext=tablecontext,
                )
2350 def get_subpage_section(
2351 title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]]
2352 ) -> Optional[Union[WikiNode, str]]:
2353 """Loads a subpage of the given page, and finds the section
2354 for the given language, part-of-speech, and section title. This
2355 is used for finding translations and other sections on subpages."""
2356 assert isinstance(language, str)
2357 assert isinstance(title, str)
2358 assert isinstance(subtitle, str)
2359 assert isinstance(seq, (list, tuple))
2360 for x in seq:
2361 assert isinstance(x, str)
2362 subpage_title = word + "/" + subtitle
2363 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2364 if subpage_content is None:
2365 wxr.wtp.error(
2366 "/translations not found despite "
2367 "{{see translation subpage|...}}",
2368 sortid="page/1934",
2369 )
2370 return None
2372 def recurse(
2373 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2374 ) -> Optional[Union[str, WikiNode]]:
2375 # print(f"seq: {seq}")
2376 if not seq:
2377 return node
2378 if not isinstance(node, WikiNode):
2379 return None
2380 # print(f"node.kind: {node.kind}")
2381 if node.kind in LEVEL_KINDS:
2382 t = clean_node(wxr, None, node.largs[0])
2383 # print(f"t: {t} == seq[0]: {seq[0]}?")
2384 if t.lower() == seq[0].lower():
2385 seq = seq[1:]
2386 if not seq:
2387 return node
2388 for n in node.children:
2389 ret = recurse(n, seq)
2390 if ret is not None:
2391 return ret
2392 return None
2394 tree = wxr.wtp.parse(
2395 subpage_content,
2396 pre_expand=True,
2397 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2398 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2399 )
2400 assert tree.kind == NodeKind.ROOT
2401 ret = recurse(tree, seq)
2402 if ret is None:
2403 wxr.wtp.debug(
2404 "Failed to find subpage section {}/{} seq {}".format(
2405 title, subtitle, seq
2406 ),
2407 sortid="page/1963",
2408 )
2409 return ret
    def parse_linkage(
        data: WordData, field: str, linkagenode: WikiNode
    ) -> None:
        """Parses a linkage section (e.g. synonyms, antonyms) and appends
        the extracted linkage entries to ``data[field]``.

        ``field`` is the key under which entries are stored (the zh-specific
        branch below shows "synonyms" as one such value). The section subtree
        ``linkagenode`` is re-expanded and walked recursively; list items,
        table cells, and certain HTML containers all yield linkage items."""
        assert isinstance(data, dict)
        assert isinstance(field, str)
        assert isinstance(linkagenode, WikiNode)
        # if field == "synonyms":
        #     print("field", field)
        #     print("data", data)
        #     print("children:")
        #     print(linkagenode.children)
        if not wxr.config.capture_linkages:
            return
        have_panel_template = False
        toplevel_text = []
        next_navframe_sense = None  # Used for "(sense):" before NavFrame

        def parse_linkage_item(
            contents: list[Union[str, WikiNode]],
            field: str,
            sense: Optional[str] = None,
        ):
            # Process a single linkage item (e.g. one list item or table
            # cell): flatten it to text (collecting ruby, URLs, and link
            # texts along the way) and hand the cleaned text to
            # parse_linkage_item_text() for splitting into entries.
            assert isinstance(contents, (list, tuple))
            assert isinstance(field, str)
            assert sense is None or isinstance(sense, str)

            # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
            #       .format(field, sense, contents))

            parts: list[str] = []
            ruby: list[tuple[str, str]] = []
            urls: list[str] = []
            # data about link text; this is used to skip splitting on
            # linkage text items that contain stuff like commas; for
            # example "Hunde, die bellen, beißen nicht" in article
            # beißen is split into "Hunde", "die bellen" etc.
            # We take that link text and use it, eventually,
            # in split_at_comma_semi to skip splitting on those
            # commas.
            links_that_should_not_be_split: list[str] = []

            def item_recurse(
                contents: list[Union[str, WikiNode]], italic: bool = False
            ) -> None:
                # Recursively flatten the item's node tree into ``parts``,
                # dispatching on node kind; nested lists are routed back to
                # parse_linkage_recurse() with any accumulated text used as
                # their sense.
                assert isinstance(contents, (list, tuple))
                nonlocal sense
                nonlocal ruby
                nonlocal parts
                # print("ITEM_RECURSE:", contents)
                for node in contents:
                    if isinstance(node, str):
                        parts.append(node)
                        continue
                    kind = node.kind
                    # print("ITEM_RECURSE KIND:", kind,
                    #       node.sarg if node.sarg else node.largs)
                    if kind == NodeKind.LIST:
                        if parts:
                            sense1: Optional[str]
                            sense1 = clean_node(wxr, None, parts)
                            if sense1.endswith(":"):
                                sense1 = sense1[:-1].strip()
                            if sense1.startswith("(") and sense1.endswith(")"):
                                sense1 = sense1[1:-1].strip()
                            if sense1.lower() == TRANSLATIONS_TITLE:
                                sense1 = None
                            # print("linkage item_recurse LIST sense1:", sense1)
                            parse_linkage_recurse(
                                node.children, field, sense=sense1 or sense
                            )
                            parts = []
                        else:
                            parse_linkage_recurse(node.children, field, sense)
                    elif kind in (
                        NodeKind.TABLE,
                        NodeKind.TABLE_ROW,
                        NodeKind.TABLE_CELL,
                    ):
                        parse_linkage_recurse(node.children, field, sense)
                    elif kind in (
                        NodeKind.TABLE_HEADER_CELL,
                        NodeKind.TABLE_CAPTION,
                    ):
                        continue
                    elif kind == NodeKind.HTML:
                        classes = (node.attrs.get("class") or "").split()
                        if node.sarg in ("gallery", "ref", "cite", "caption"):
                            continue
                        elif node.sarg == "ruby":
                            # Ruby annotation: keep the base text in parts and
                            # record the (base, ruby) pair separately.
                            rb = parse_ruby(wxr, node)
                            if rb:
                                ruby.append(rb)
                                parts.append(rb[0])
                            continue
                        elif node.sarg == "math":
                            parts.append(clean_node(wxr, None, node))
                            continue
                        elif "interProject" in classes:
                            continue  # These do not seem to be displayed
                        if "NavFrame" in classes:
                            parse_linkage_recurse(node.children, field, sense)
                        else:
                            item_recurse(node.children, italic=italic)
                    elif kind == NodeKind.ITALIC:
                        item_recurse(node.children, italic=True)
                    elif kind == NodeKind.LINK:
                        # Skip Category:/File: links; for others, descend into
                        # the display text (last link argument).
                        ignore = False
                        if isinstance(node.largs[0][0], str):
                            v1 = node.largs[0][0].strip().lower()
                            if v1.startswith(
                                ns_title_prefix_tuple(wxr, "Category", True)
                                + ns_title_prefix_tuple(wxr, "File", True)
                            ):
                                ignore = True
                        if not ignore:
                            v = node.largs[-1]
                            if (
                                len(node.largs) == 1
                                and len(v) > 0
                                and isinstance(v[0], str)
                                and v[0][0] == ":"
                            ):
                                # Strip leading ":" used to link to pages like
                                # "/" without special interpretation.
                                v = [v[0][1:]] + list(v[1:])  # type:ignore
                            if isinstance(v[0], str) and not v[0].isalnum():
                                links_that_should_not_be_split.append(
                                    "".join(v[0])
                                )  # type: ignore
                            item_recurse(v, italic=italic)
                    elif kind == NodeKind.URL:
                        if len(node.largs) < 2 and node.largs:
                            # Naked url captured
                            urls.extend(node.largs[-1])  # type:ignore[arg-type]
                            continue
                        if len(node.largs) == 2:
                            # Url from link with text
                            urls.append(node.largs[0][-1])  # type:ignore[arg-type]
                        # print(f"{node.largs=!r}")
                        # print("linkage recurse URL {}".format(node))
                        item_recurse(node.largs[-1], italic=italic)
                    elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD):
                        item_recurse(node.children, italic=italic)
                    else:
                        wxr.wtp.debug(
                            "linkage item_recurse unhandled {}: {}".format(
                                node.kind, node
                            ),
                            sortid="page/2073",
                        )

            # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
            #       .format(contents))

            item_recurse(contents)
            item = clean_node(wxr, None, parts)
            # print("LINKAGE ITEM CONTENTS:", parts)
            # print("CLEANED ITEM: {!r}".format(item))
            # print(f"URLS {urls=!r}")

            return parse_linkage_item_text(
                wxr,
                word,
                data,
                field,
                item,
                sense,
                ruby,
                pos_datas,
                is_reconstruction,
                urls or None,
                links_that_should_not_be_split or None,
            )

        def parse_linkage_recurse(
            contents: list[Union[WikiNode, str]],
            field: str,
            sense: Optional[str],
        ) -> None:
            # Walk the linkage section tree, dispatching list items and table
            # cells to parse_linkage_item() and descending into containers.
            # Top-level bare text is accumulated in toplevel_text as a
            # fallback source of linkages (see end of parse_linkage).
            assert isinstance(contents, (list, tuple))
            assert sense is None or isinstance(sense, str)
            nonlocal next_navframe_sense
            # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))
            for node in contents:
                if isinstance(node, str):
                    # Ignore top-level text, generally comments before the
                    # linkages list. However, if no linkages are found, then
                    # use this for linkages (not all words use bullet points
                    # for linkages).
                    toplevel_text.append(node)
                    continue
                assert isinstance(node, WikiNode)
                kind = node.kind
                # print("PARSE_LINKAGE_RECURSE CHILD", kind)
                if kind == NodeKind.LIST:
                    parse_linkage_recurse(node.children, field, sense)
                elif kind == NodeKind.LIST_ITEM:
                    v = parse_linkage_item(node.children, field, sense)
                    if v:
                        # parse_linkage_item() can return a value that should
                        # be used as the sense for the follow-on linkages,
                        # which are typically provided in a table (see 滿)
                        next_navframe_sense = v
                elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
                    parse_linkage_recurse(node.children, field, sense)
                elif kind == NodeKind.TABLE_CELL:
                    parse_linkage_item(node.children, field, sense)
                elif kind in (
                    NodeKind.TABLE_CAPTION,
                    NodeKind.TABLE_HEADER_CELL,
                    NodeKind.PREFORMATTED,
                    NodeKind.BOLD,
                ):
                    continue
                elif kind == NodeKind.HTML:
                    # Recurse to process inside the HTML for most tags
                    if node.sarg in ("gallery", "ref", "cite", "caption"):
                        continue
                    classes = (node.attrs.get("class") or "").split()
                    if node.sarg == "li":
                        # duplicates code from if kind == NodeKind.LIST_ITEM ⇑
                        v = parse_linkage_item(node.children, field, sense)
                        if v:
                            next_navframe_sense = v
                    elif "qualifier-content" in classes:
                        sense1 = clean_node(wxr, None, node.children)
                        if sense1.endswith(":"):
                            sense1 = sense1[:-1].strip()
                        if sense and sense1:
                            wxr.wtp.debug(
                                "linkage qualifier-content on multiple "
                                "levels: {!r} and {!r}".format(sense, sense1),
                                sortid="page/2170",
                            )
                        parse_linkage_recurse(node.children, field, sense1)
                    elif "NavFrame" in classes:
                        # NavFrame uses previously assigned next_navframe_sense
                        # (from a "(sense):" item) and clears it afterwards
                        parse_linkage_recurse(
                            node.children, field, sense or next_navframe_sense
                        )
                        next_navframe_sense = None
                    else:
                        parse_linkage_recurse(node.children, field, sense)
                elif kind in LEVEL_KINDS:
                    # Just recurse to any possible subsections
                    parse_linkage_recurse(node.children, field, sense)
                elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
                    # Skip these on top level; at least sometimes bold is
                    # used for indicating a subtitle
                    continue
                elif kind == NodeKind.LINK:
                    # Recurse into the last argument
                    # Apparently ":/" is used as a link to "/", so strip
                    # initial value
                    parse_linkage_recurse(node.largs[-1], field, sense)
                else:
                    wxr.wtp.debug(
                        "parse_linkage_recurse unhandled {}: {}".format(
                            kind, node
                        ),
                        sortid="page/2196",
                    )

        def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
            # Template callback for the initial expansion: suppress panel
            # templates and remember that one was present (used to decide
            # whether the toplevel-text fallback below is appropriate).
            nonlocal have_panel_template
            if is_panel_template(wxr, name):
                have_panel_template = True
                return ""
            return None

        def parse_zh_synonyms(
            parsed: list[Union[WikiNode, str]],
            data: list[LinkageData],
            hdrs: list[str],
            root_word: str,
        ) -> None:
            """Parses Chinese dialectal synonyms tables"""
            # Table rows look like header lines followed by a comma-separated
            # word list; header text is mapped to tags via valid_tags /
            # zh_tag_lookup. Results are appended to ``data`` (mutated).
            for item in parsed:
                if isinstance(item, WikiNode):
                    if item.kind == NodeKind.TABLE_ROW:
                        cleaned = clean_node(wxr, None, item.children)
                        # print("cleaned:", repr(cleaned))
                        if any(
                            [
                                "Variety" in cleaned,
                                "Location" in cleaned,
                                "Words" in cleaned,
                            ]
                        ):
                            # Skip the table's header row.
                            pass
                        else:
                            split = cleaned.split("\n")
                            new_hdrs = split[:-1]
                            if len(new_hdrs) == 2:
                                hdrs = [new_hdrs[0]]
                                new_hdrs.pop(0)
                            combined_hdrs = [x.strip() for x in hdrs + new_hdrs]
                            tags = []
                            words = split[-1].split(",")
                            for hdr in combined_hdrs:
                                # Normalize header text into comma-separated
                                # candidate tags (expanding N./S. shorthand).
                                hdr = hdr.replace("(", ",")
                                hdr = hdr.replace(")", "")
                                hdr = hdr.replace("N.", "Northern,")
                                hdr = hdr.replace("S.", "Southern,")
                                new = hdr.split(",")
                                for tag in sorted(new):
                                    tag = tag.strip()
                                    tag = tag.replace(" ", "-")
                                    if tag in valid_tags:
                                        tags.append(tag)
                                    else:
                                        if tag in zh_tag_lookup:
                                            tags.extend(zh_tag_lookup[tag])
                                        else:
                                            print(
                                                f"MISSING ZH SYNONYM TAG for "
                                                f"root {root_word}, word "
                                                f"{words}: {tag}"
                                            )
                                            sys.stdout.flush()

                            for word in words:
                                data.append(
                                    {"word": word.strip(), "tags": tags}
                                )
                    elif item.kind == NodeKind.HTML:
                        cleaned = clean_node(wxr, None, item.children)
                        if "Synonyms of" in cleaned:
                            # The table title names the root word whose
                            # synonyms follow.
                            cleaned = cleaned.replace("Synonyms of ", "")
                            root_word = cleaned
                        parse_zh_synonyms(item.children, data, hdrs, root_word)
                    else:
                        parse_zh_synonyms(item.children, data, hdrs, root_word)

        def parse_zh_synonyms_list(
            parsed: list[Union[WikiNode, str]],
            data: list[LinkageData],
            hdrs: list[str],
            root_word: str,
        ) -> None:
            """Parses Chinese dialectal synonyms tables (list format)"""
            # Like parse_zh_synonyms() but for list items of the form
            # "word/word (tag, tag, ...)"; also extracts a romanization
            # when a tag classifies as one. Results appended to ``data``.
            for item in parsed:
                if isinstance(item, WikiNode):
                    if item.kind == NodeKind.LIST_ITEM:
                        cleaned = clean_node(wxr, None, item.children)
                        # print("cleaned:", repr(cleaned))
                        if any(
                            [
                                "Variety" in cleaned,
                                "Location" in cleaned,
                                "Words" in cleaned,
                            ]
                        ):
                            # Skip header-like items.
                            pass
                        else:
                            cleaned = cleaned.replace("(", ",")
                            cleaned = cleaned.replace(")", "")
                            split = cleaned.split(",")
                            # skip empty words / titles
                            if split[0] == "":
                                continue
                            words = split[0].split("/")
                            new_hdrs = [x.strip() for x in split[1:]]
                            tags = []
                            roman = None
                            for tag in sorted(new_hdrs):
                                if tag in valid_tags:
                                    tags.append(tag)
                                elif tag in zh_tag_lookup:
                                    tags.extend(zh_tag_lookup[tag])
                                elif (
                                    classify_desc(tag) == "romanization"
                                    and roman is None
                                ):
                                    roman = tag
                                else:
                                    print(
                                        f"MISSING ZH SYNONYM TAG "
                                        f"(possibly pinyin) - root "
                                        f"{root_word}, word {words}: {tag}"
                                    )
                                    sys.stdout.flush()

                            for word in words:
                                dt: LinkageData = {"word": word.strip()}
                                if tags:
                                    dt["tags"] = tags
                                if roman is not None:
                                    dt["roman"] = roman
                                data.append(dt)
                    elif item.kind == NodeKind.HTML:
                        cleaned = clean_node(wxr, None, item.children)
                        if cleaned.find("Synonyms of") >= 0:
                            cleaned = cleaned.replace("Synonyms of ", "")
                            root_word = cleaned
                            parse_zh_synonyms_list(
                                item.children, data, hdrs, root_word
                            )
                        else:
                            parse_zh_synonyms_list(
                                item.children, data, hdrs, root_word
                            )

        def contains_kind(
            children: list[Union[WikiNode, str]], nodekind: NodeKind
        ) -> bool:
            # True if any node of the given kind occurs anywhere in the
            # subtree rooted at ``children``.
            assert isinstance(children, list)
            for item in children:
                if not isinstance(item, WikiNode):
                    continue
                if item.kind == nodekind:
                    return True
                elif contains_kind(item.children, nodekind):
                    return True
            return False

        # Main body of parse_linkage()
        text = wxr.wtp.node_to_wikitext(linkagenode.children)
        parsed = wxr.wtp.parse(
            text, expand_all=True, template_fn=linkage_template_fn1
        )
        if field == "synonyms" and lang_code == "zh":
            synonyms: list[LinkageData] = []
            # Dialectal synonym data comes in two layouts; pick the parser
            # by whether the expanded content contains a LIST node.
            if contains_kind(parsed.children, NodeKind.LIST):
                parse_zh_synonyms_list(parsed.children, synonyms, [], "")
            else:
                parse_zh_synonyms(parsed.children, synonyms, [], "")
            # print(json.dumps(synonyms, indent=4, ensure_ascii=False))
            data_extend(data, "synonyms", synonyms)
        parse_linkage_recurse(parsed.children, field, None)
        if not data.get(field) and not have_panel_template:
            # Fallback: no structured linkages found; if the top-level text
            # is a single comma-rich line (and not a "See ..." pointer),
            # treat it as the linkage list itself.
            text = "".join(toplevel_text).strip()
            if "\n" not in text and "," in text and text.count(",") > 3:
                if not text.startswith("See "):
                    parse_linkage_item([text], field, None)
2846 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2847 """Parses translations for a word. This may also pull in translations
2848 from separate translation subpages."""
2849 assert isinstance(data, dict)
2850 assert isinstance(xlatnode, WikiNode)
2851 # print("===== PARSE_TRANSLATIONS {} {} {}"
2852 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2853 # print("parse_translations xlatnode={}".format(xlatnode))
2854 if not wxr.config.capture_translations: 2854 ↛ 2855line 2854 didn't jump to line 2855 because the condition on line 2854 was never true
2855 return
2856 sense_parts: list[Union[WikiNode, str]] = []
2857 sense: Optional[str] = None
2859 def parse_translation_item(
2860 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2861 ) -> None:
2862 nonlocal sense
2863 assert isinstance(contents, list)
2864 assert lang is None or isinstance(lang, str)
2865 # print("PARSE_TRANSLATION_ITEM:", contents)
2867 langcode: Optional[str] = None
2868 if sense is None:
2869 sense = clean_node(wxr, data, sense_parts).strip()
2870 # print("sense <- clean_node: ", sense)
2871 idx = sense.find("See also translations at")
2872 if idx > 0: 2872 ↛ 2873line 2872 didn't jump to line 2873 because the condition on line 2872 was never true
2873 wxr.wtp.debug(
2874 "Skipping translation see also: {}".format(sense),
2875 sortid="page/2361",
2876 )
2877 sense = sense[:idx].strip()
2878 if sense.endswith(":"): 2878 ↛ 2879line 2878 didn't jump to line 2879 because the condition on line 2878 was never true
2879 sense = sense[:-1].strip()
2880 if sense.endswith("—"): 2880 ↛ 2881line 2880 didn't jump to line 2881 because the condition on line 2880 was never true
2881 sense = sense[:-1].strip()
2882 translations_from_template: list[str] = []
2884 def translation_item_template_fn(
2885 name: str, ht: TemplateArgs
2886 ) -> Optional[str]:
2887 nonlocal langcode
2888 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2889 if is_panel_template(wxr, name):
2890 return ""
2891 if name in ("t+check", "t-check", "t-needed"):
2892 # We ignore these templates. They seem to have outright
2893 # garbage in some entries, and very varying formatting in
2894 # others. These should be transitory and unreliable
2895 # anyway.
2896 return "__IGNORE__"
2897 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2898 code = ht.get(1)
2899 if code:
2900 if langcode and code != langcode:
2901 wxr.wtp.debug(
2902 "inconsistent language codes {} vs "
2903 "{} in translation item: {!r} {}".format(
2904 langcode, code, name, ht
2905 ),
2906 sortid="page/2386",
2907 )
2908 langcode = code
2909 tr = ht.get(2)
2910 if tr:
2911 tr = clean_node(wxr, None, [tr])
2912 translations_from_template.append(tr)
2913 return None
2914 if name == "t-egy":
2915 langcode = "egy"
2916 return None
2917 if name == "ttbc":
2918 code = ht.get(1)
2919 if code:
2920 langcode = code
2921 return None
2922 if name == "trans-see":
2923 wxr.wtp.error(
2924 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2925 )
2926 return ""
2927 if name.endswith("-top"):
2928 return ""
2929 if name.endswith("-bottom"):
2930 return ""
2931 if name.endswith("-mid"):
2932 return ""
2933 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2934 # .format(name),
2935 # sortid="page/2414")
2936 return None
2938 sublists = list(
2939 x
2940 for x in contents
2941 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2942 )
2943 contents = list(
2944 x
2945 for x in contents
2946 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2947 )
2949 item = clean_node(
2950 wxr, data, contents, template_fn=translation_item_template_fn
2951 )
2952 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2954 # Parse the translation item.
2955 if item: 2955 ↛ exitline 2955 didn't return from function 'parse_translation_item' because the condition on line 2955 was always true
2956 lang = parse_translation_item_text(
2957 wxr,
2958 word,
2959 data,
2960 item,
2961 sense,
2962 lang,
2963 langcode,
2964 translations_from_template,
2965 is_reconstruction,
2966 )
2968 # Handle sublists. They are frequently used for different
2969 # scripts for the language and different variants of the
2970 # language. We will include the lower-level header as a
2971 # tag in those cases.
2972 for listnode in sublists: 2972 ↛ 2973line 2972 didn't jump to line 2973 because the loop on line 2972 never started
2973 assert listnode.kind == NodeKind.LIST
2974 for node in listnode.children:
2975 if not isinstance(node, WikiNode):
2976 continue
2977 if node.kind == NodeKind.LIST_ITEM:
2978 parse_translation_item(node.children, lang=lang)
2980 def parse_translation_template(node: WikiNode) -> None:
2981 assert isinstance(node, WikiNode)
2983 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2984 nonlocal sense_parts
2985 nonlocal sense
2986 if is_panel_template(wxr, name):
2987 return ""
2988 if name == "see also":
2989 # XXX capture
2990 # XXX for example, "/" has top-level list containing
2991 # see also items. So also should parse those.
2992 return ""
2993 if name == "trans-see":
2994 # XXX capture
2995 return ""
2996 if name == "see translation subpage":
2997 sense_parts = []
2998 sense = None
2999 sub = ht.get(1, "")
3000 if sub:
3001 m = re.match(
3002 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
3003 )
3004 else:
3005 m = None
3006 etym = ""
3007 etym_numbered = ""
3008 pos = ""
3009 if m:
3010 etym_numbered = m.group(1)
3011 etym = m.group(2)
3012 pos = m.group(3)
3013 if not sub:
3014 wxr.wtp.debug(
3015 "no part-of-speech in "
3016 "{{see translation subpage|...}}, "
3017 "defaulting to just wxr.wtp.section "
3018 "(= language)",
3019 sortid="page/2468",
3020 )
3021 # seq sent to get_subpage_section without sub and pos
3022 seq = [
3023 language,
3024 TRANSLATIONS_TITLE,
3025 ]
3026 elif (
3027 m
3028 and etym.lower().strip() in ETYMOLOGY_TITLES
3029 and pos.lower() in POS_TITLES
3030 ):
3031 seq = [
3032 language,
3033 etym_numbered,
3034 pos,
3035 TRANSLATIONS_TITLE,
3036 ]
3037 elif sub.lower() in POS_TITLES:
3038 # seq with sub but not pos
3039 seq = [
3040 language,
3041 sub,
3042 TRANSLATIONS_TITLE,
3043 ]
3044 else:
3045 # seq with sub and pos
3046 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
3047 if pos.lower() not in POS_TITLES:
3048 wxr.wtp.debug(
3049 "unhandled see translation subpage: "
3050 "language={} sub={} "
3051 "wxr.wtp.subsection={}".format(
3052 language, sub, wxr.wtp.subsection
3053 ),
3054 sortid="page/2478",
3055 )
3056 seq = [language, sub, pos, TRANSLATIONS_TITLE]
3057 subnode = get_subpage_section(
3058 wxr.wtp.title or "MISSING_TITLE",
3059 TRANSLATIONS_TITLE,
3060 seq,
3061 )
3062 if subnode is not None and isinstance(subnode, WikiNode):
3063 parse_translations(data, subnode)
3064 else:
3065 # Failed to find the normal subpage section
3066 seq = [TRANSLATIONS_TITLE]
3067 subnode = get_subpage_section(
3068 wxr.wtp.title or "MISSING_TITLE",
3069 TRANSLATIONS_TITLE,
3070 seq,
3071 )
3072 if subnode is not None and isinstance(
3073 subnode, WikiNode
3074 ):
3075 parse_translations(data, subnode)
3076 return ""
3077 if name in (
3078 "c",
3079 "C",
3080 "categorize",
3081 "cat",
3082 "catlangname",
3083 "topics",
3084 "top",
3085 "qualifier",
3086 "cln",
3087 ):
3088 # These are expanded in the default way
3089 return None
3090 if name in ("trans-top",):
3091 # XXX capture id from trans-top? Capture sense here
3092 # instead of trying to parse it from expanded content?
3093 if ht.get(1):
3094 sense_parts = []
3095 sense = ht.get(1)
3096 else:
3097 sense_parts = []
3098 sense = None
3099 return None
3100 if name in (
3101 "trans-bottom",
3102 "trans-mid",
3103 "checktrans-mid",
3104 "checktrans-bottom",
3105 ):
3106 return None
3107 if name == "checktrans-top":
3108 sense_parts = []
3109 sense = None
3110 return ""
3111 if name == "trans-top-also":
3112 # XXX capture?
3113 sense_parts = []
3114 sense = None
3115 return ""
3116 wxr.wtp.error(
3117 "UNIMPLEMENTED parse_translation_template: {} {}".format(
3118 name, ht
3119 ),
3120 sortid="page/2517",
3121 )
3122 return ""
3124 wxr.wtp.expand(
3125 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
3126 )
3128 def parse_translation_recurse(xlatnode: WikiNode) -> None:
3129 nonlocal sense
3130 nonlocal sense_parts
3131 for node in xlatnode.children:
3132 # print(node)
3133 if isinstance(node, str):
3134 if sense: 3134 ↛ 3135line 3134 didn't jump to line 3135 because the condition on line 3134 was never true
3135 if not node.isspace():
3136 wxr.wtp.debug(
3137 "skipping string in the middle of "
3138 "translations: {}".format(node),
3139 sortid="page/2530",
3140 )
3141 continue
3142 # Add a part to the sense
3143 sense_parts.append(node)
3144 sense = None
3145 continue
3146 assert isinstance(node, WikiNode)
3147 kind = node.kind
3148 if kind == NodeKind.LIST:
3149 for item in node.children:
3150 if not isinstance(item, WikiNode): 3150 ↛ 3151line 3150 didn't jump to line 3151 because the condition on line 3150 was never true
3151 continue
3152 if item.kind != NodeKind.LIST_ITEM: 3152 ↛ 3153line 3152 didn't jump to line 3153 because the condition on line 3152 was never true
3153 continue
3154 if item.sarg == ":": 3154 ↛ 3155line 3154 didn't jump to line 3155 because the condition on line 3154 was never true
3155 continue
3156 parse_translation_item(item.children)
3157 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3157 ↛ 3161line 3157 didn't jump to line 3161 because the condition on line 3157 was never true
3158 # Silently skip list items that are just indented; these
3159 # are used for text between translations, such as indicating
3160 # translations that need to be checked.
3161 pass
3162 elif kind == NodeKind.TEMPLATE: 3162 ↛ 3163line 3162 didn't jump to line 3163 because the condition on line 3162 was never true
3163 parse_translation_template(node)
3164 elif kind in ( 3164 ↛ 3169line 3164 didn't jump to line 3169 because the condition on line 3164 was never true
3165 NodeKind.TABLE,
3166 NodeKind.TABLE_ROW,
3167 NodeKind.TABLE_CELL,
3168 ):
3169 parse_translation_recurse(node)
3170 elif kind == NodeKind.HTML: 3170 ↛ 3171line 3170 didn't jump to line 3171 because the condition on line 3170 was never true
3171 if node.attrs.get("class") == "NavFrame":
3172 # Reset ``sense_parts`` (and force recomputing
3173 # by clearing ``sense``) as each NavFrame specifies
3174 # its own sense. This helps eliminate garbage coming
3175 # from text at the beginning at the translations
3176 # section.
3177 sense_parts = []
3178 sense = None
3179 # for item in node.children:
3180 # if not isinstance(item, WikiNode):
3181 # continue
3182 # parse_translation_recurse(item)
3183 parse_translation_recurse(node)
3184 elif kind in LEVEL_KINDS: 3184 ↛ 3186line 3184 didn't jump to line 3186 because the condition on line 3184 was never true
3185 # Sub-levels will be recursed elsewhere
3186 pass
3187 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 3187 ↛ 3188line 3187 didn't jump to line 3188 because the condition on line 3187 was never true
3188 parse_translation_recurse(node)
3189 elif kind == NodeKind.PREFORMATTED: 3189 ↛ 3191line 3189 didn't jump to line 3191 because the condition on line 3189 was always true
3190 print("parse_translation_recurse: PREFORMATTED:", node)
3191 elif kind == NodeKind.LINK:
3192 arg0 = node.largs[0]
3193 # Kludge: I've seen occasional normal links to translation
3194 # subpages from main pages (e.g., language/English/Noun
3195 # in July 2021) instead of the normal
3196 # {{see translation subpage|...}} template. This should
3197 # handle them. Note: must be careful not to read other
3198 # links, particularly things like in "human being":
3199 # "a human being -- see [[man/translations]]" (group title)
3200 if (
3201 isinstance(arg0, (list, tuple))
3202 and arg0
3203 and isinstance(arg0[0], str)
3204 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3205 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3206 == wxr.wtp.title
3207 ):
3208 wxr.wtp.debug(
3209 "translations subpage link found on main "
3210 "page instead "
3211 "of normal {{see translation subpage|...}}",
3212 sortid="page/2595",
3213 )
3214 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3215 if sub.lower() in POS_TITLES:
3216 seq = [
3217 language,
3218 sub,
3219 TRANSLATIONS_TITLE,
3220 ]
3221 subnode = get_subpage_section(
3222 wxr.wtp.title,
3223 TRANSLATIONS_TITLE,
3224 seq,
3225 )
3226 if subnode is not None and isinstance(
3227 subnode, WikiNode
3228 ):
3229 parse_translations(data, subnode)
3230 else:
3231 wxr.wtp.error(
3232 "/translations link outside part-of-speech"
3233 )
3235 if (
3236 len(arg0) >= 1
3237 and isinstance(arg0[0], str)
3238 and not arg0[0].lower().startswith("category:")
3239 ):
3240 for x in node.largs[-1]:
3241 if isinstance(x, str):
3242 sense_parts.append(x)
3243 else:
3244 parse_translation_recurse(x)
3245 elif not sense:
3246 sense_parts.append(node)
3247 else:
3248 wxr.wtp.debug(
3249 "skipping text between translation items/senses: "
3250 "{}".format(node),
3251 sortid="page/2621",
3252 )
3254 # Main code of parse_translation(). We want ``sense`` to be assigned
3255 # regardless of recursion levels, and thus the code is structured
3256 # to define at this level and recurse in parse_translation_recurse().
3257 parse_translation_recurse(xlatnode)
    def parse_etymology(data: WordData, node: WikiNode) -> None:
        """Parses an etymology section.

        Stores the cleaned section text in ``data["etymology_text"]``,
        the templates captured during cleaning in
        ``data["etymology_templates"]``, and any zh-x/zh-q example
        templates in ``data["etymology_examples"]``.
        """
        assert isinstance(data, dict)
        assert isinstance(node, WikiNode)

        # Templates (name/args/expansion) captured while cleaning the section.
        templates: list[TemplateData] = []

        # Counter for preventing the capture of etymology templates
        # when we are inside templates that we want to ignore (i.e.,
        # not capture).
        ignore_count = 0

        def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            # Pre-expansion hook: suppress panel and zh-x/zh-q templates
            # (zh-x/zh-q are extracted separately at the bottom of this
            # function) and count entry into ignored templates.
            nonlocal ignore_count
            if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
                return ""
            if re.match(ignored_etymology_templates_re, name):
                ignore_count += 1
            return None

        # CONTINUE_HERE

        def etym_post_template_fn(
            name: str, ht: TemplateArgs, expansion: str
        ) -> None:
            # Post-expansion hook: record wikipedia templates, balance the
            # ignore counter incremented above, and capture templates that
            # were expanded at the top level (ignore_count == 0).
            nonlocal ignore_count
            if name in wikipedia_templates:
                parse_wikipedia_template(wxr, data, ht)
                return None
            if re.match(ignored_etymology_templates_re, name):
                ignore_count -= 1
                return None
            if ignore_count == 0:
                ht = clean_template_args(wxr, ht)
                expansion = clean_node(wxr, None, expansion)
                templates.append(
                    {"name": name, "args": ht, "expansion": expansion}
                )
            return None

        # Remove any subsections
        contents = list(
            x
            for x in node.children
            if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
        )
        # Convert to text, also capturing templates using post_template_fn
        text = clean_node(
            wxr,
            None,
            contents,
            template_fn=etym_template_fn,
            post_template_fn=etym_post_template_fn,
        ).strip(": \n")  # remove ":" indent wikitext before zh-x template
        # Save the collected information.
        if len(text) > 0:
            data["etymology_text"] = text
        if len(templates) > 0:
            # Some etymology templates, like Template:root do not generate
            # text, so they should be added here.  Elsewhere, we check
            # for Template:root and add some text to the expansion to please
            # the validation.
            data["etymology_templates"] = templates

        # Extract zh-x/zh-q example templates that occur before the first
        # lower-level heading; stop at the first LEVEL_KIND node.
        for child_node in node.find_child_recursively(
            LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
        ):
            if child_node.kind in LEVEL_KIND_FLAGS:
                break
            elif isinstance(
                child_node, TemplateNode
            ) and child_node.template_name in ["zh-x", "zh-q"]:
                if "etymology_examples" not in data:
                    data["etymology_examples"] = []
                data["etymology_examples"].extend(
                    extract_template_zh_x(
                        wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
                    )
                )
    def parse_descendants(
        data: WordData, node: WikiNode, is_proto_root_derived_section=False
    ) -> None:
        """Parses a Descendants section.  Also used on Derived terms and
        Extensions sections when we are dealing with a root of a reconstructed
        language (i.e. is_proto_root_derived_section == True), as they use the
        same structure.  In the latter case, The wiktionary convention is not to
        title the section as descendants since the immediate offspring of the
        roots are morphologically derived terms within the same proto-language.
        Still, since the rest of the section lists true descendants, we use the
        same function.  Entries in the descendants list that are technically
        derived terms will have a field "tags": ["derived"]."""
        assert isinstance(data, dict)
        assert isinstance(node, WikiNode)
        assert isinstance(is_proto_root_derived_section, bool)

        # Accumulates one DescendantData dict per processed list item.
        descendants = []

        # Most templates that are not in a LIST should be ignored as they only
        # add formatting, like "desc-top", "der-top3", etc.  Any template in
        # unignored_non_list_templates actually contains relevant descendant
        # info.  E.g. "CJKV" is often the only line at all in descendants
        # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would
        # be skipped if we didn't handle it specially as it is not part of a
        # LIST, and additionally is in panel_templates.  There are probably
        # more such templates that should be added to this...
        unignored_non_list_templates: list[str] = ["CJKV"]

        def process_list_item_children(
            sarg: str, children: list[Union[str, WikiNode]]
        ) -> None:
            """Cleans one list item's children into a DescendantData entry
            (depth, text, captured templates) appended to ``descendants``."""
            assert isinstance(sarg, str)
            assert isinstance(children, list)
            # The descendants section is a hierarchical bulleted listed. sarg
            # is usually some number of "*" characters indicating the level of
            # indentation of the line, e.g. "***" indicates the line will be
            # thrice-indented. A bare ";" is used to indicate a subtitle-like
            # line with no indentation. ":" at the end of one or more "*"s is
            # used to indicate that the bullet will not be displayed.
            item_data: DescendantData = {"depth": sarg.count("*")}
            templates: list[TemplateData] = []
            is_derived = False

            # Counter for preventing the capture of templates when we are
            # inside templates that we want to ignore (i.e., not capture).
            ignore_count = 0

            def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
                # Pre-expansion hook: drop panel templates (except the
                # specially handled ones) and count ignored-template nesting.
                nonlocal ignore_count
                if (
                    is_panel_template(wxr, name)
                    and name not in unignored_non_list_templates
                ):
                    return ""
                if re.match(ignored_descendants_templates_re, name):
                    ignore_count += 1
                return None

            def desc_post_template_fn(
                name: str, ht: TemplateArgs, expansion: str
            ) -> None:
                # Post-expansion hook: capture top-level templates and detect
                # same-proto-language link templates (-> "derived" tag).
                nonlocal ignore_count
                if name in wikipedia_templates:
                    parse_wikipedia_template(wxr, data, ht)
                    return None
                if re.match(ignored_descendants_templates_re, name):
                    ignore_count -= 1
                    return None
                if ignore_count == 0:
                    ht = clean_template_args(wxr, ht)
                    nonlocal is_derived
                    # If we're in a proto-root Derived terms or Extensions
                    # section, and the current list item has a link template
                    # to a term in the same proto-language, then we tag this
                    # descendant entry with "derived"
                    is_derived = (
                        is_proto_root_derived_section
                        and (name == "l" or name == "link")
                        and ("1" in ht and ht["1"] == lang_code)
                    )
                    expansion = clean_node(wxr, None, expansion)
                    templates.append(
                        {"name": name, "args": ht, "expansion": expansion}
                    )
                return None

            text = clean_node(
                wxr,
                None,
                children,
                template_fn=desc_template_fn,
                post_template_fn=desc_post_template_fn,
            )
            item_data["templates"] = templates
            item_data["text"] = text
            if is_derived:
                item_data["tags"] = ["derived"]
            descendants.append(item_data)

        def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]:
            # Yields (index, child) for WikiNode children only.
            for i, child in enumerate(node.children):
                if isinstance(child, WikiNode):
                    yield (i, child)

        def get_sublist_index(list_item: WikiNode) -> Optional[int]:
            # Index of the first LIST child of list_item, or None if none.
            for i, child in node_children(list_item):
                if child.kind == NodeKind.LIST:
                    return i
            return None

        def get_descendants(node: WikiNode) -> None:
            """Appends the data for every list item in every list in node
            to descendants."""
            for _, c in node_children(node):
                if (
                    c.kind == NodeKind.TEMPLATE
                    and c.largs
                    and len(c.largs[0]) == 1
                    and isinstance(c.largs[0][0], str)
                    and c.largs[0][0] in unignored_non_list_templates
                ):
                    # Some Descendants sections have no wikitext list. Rather,
                    # the list is entirely generated by a single template (see
                    # e.g. the use of {{CJKV}} in Chinese entries).
                    process_list_item_children("", [c])
                elif c.kind == NodeKind.HTML:
                    # The Descendants sections for many languages feature
                    # templates that generate html to add styling (e.g. using
                    # multiple columns) to the list, so that the actual
                    # wikitext list items are found within a <div>. We look
                    # within the children of the html node for the actual list
                    # items.
                    get_descendants(c)
                elif c.kind == NodeKind.LIST:
                    get_descendants(c)
                elif c.kind == NodeKind.LIST_ITEM:
                    # If a LIST_ITEM has subitems in a sublist, usually its
                    # last child is a LIST. However, sometimes after the LIST
                    # there is one or more trailing LIST_ITEMs, like "\n" or
                    # a reference template. If there is a sublist, we discard
                    # everything after it.
                    i = get_sublist_index(c)
                    if i is not None:
                        process_list_item_children(c.sarg, c.children[:i])
                        get_descendants(c.children[i])  # type: ignore[arg-type]
                    else:
                        process_list_item_children(c.sarg, c.children)

        # parse_descendants() actual work starts here
        get_descendants(node)

        # if e.g. on a PIE page, there may be both Derived terms and
        # Extensions sections, in which case this function will be called
        # multiple times, so we have to check if descendants exists first.
        if "descendants" in data:
            data["descendants"].extend(descendants)
        else:
            data["descendants"] = descendants
    def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
        """This recurses into a subtree in the parse tree for a page.

        Dispatches each section heading (pronunciation, etymology,
        descendants, translations, inflections, part-of-speech, linkage,
        compounds) to its parser, then recurses into the section's children.
        ``pos`` is the part-of-speech currently in effect (from an enclosing
        POS section), or None.
        """
        nonlocal etym_data
        nonlocal pos_data
        nonlocal inside_level_four

        redirect_list: list[str] = []  # for `zh-see` template

        def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            """This is called for otherwise unprocessed parts of the page.
            We still expand them so that e.g. Category links get captured."""
            if name in wikipedia_templates:
                data = select_data()
                parse_wikipedia_template(wxr, data, ht)
                return None
            if is_panel_template(wxr, name):
                return ""
            return None

        for node in treenode.children:
            # print(node)
            if not isinstance(node, WikiNode):
                # print("  X{}".format(repr(node)[:40]))
                continue

            if isinstance(node, TemplateNode):
                # Soft redirects (e.g. zh-see) collect target words into
                # redirect_list, handled after the loop.
                if process_soft_redirect_template(wxr, node, redirect_list):
                    continue
                elif node.template_name == "zh-forms":
                    process_zh_forms_templates(wxr, node, base_data)

            if node.kind not in LEVEL_KINDS:
                # XXX handle e.g. wikipedia links at the top of a language
                # XXX should at least capture "also" at top of page
                if node.kind in (
                    NodeKind.HLINE,
                    NodeKind.LIST,
                    NodeKind.LIST_ITEM,
                ):
                    continue
                # print("      UNEXPECTED: {}".format(node))
                # Clean the node to collect category links
                clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
                continue

            # ``t`` is the (lowercased) section title of this level node.
            t = clean_node(
                wxr, etym_data, node.sarg if node.sarg else node.largs
            )
            t = t.lower()
            # XXX these counts were never implemented fully, and even this
            # gets discarded: Search STATISTICS_IMPLEMENTATION
            wxr.config.section_counts[t] += 1
            # print("PROCESS_CHILDREN: T:", repr(t))
            if t in IGNORED_TITLES:
                pass
            elif t.startswith(PRONUNCIATION_TITLE):
                # Chinese Pronunciation section kludge; we demote these to
                # be level 4 instead of 3 so that they're part of a larger
                # etymology hierarchy; usually the data here is empty and
                # acts as an inbetween between POS and Etymology data
                inside_level_four = True
                if t.startswith(PRONUNCIATION_TITLE + " "):
                    # Pronunciation 1, etc, are used in Chinese Glyphs,
                    # and each of them may have senses under Definition
                    push_level_four_section()
                    wxr.wtp.start_subsection(None)
                if wxr.config.capture_pronunciation:
                    data = select_data()
                    parse_pronunciation(
                        wxr,
                        node,
                        data,
                        etym_data,
                        have_etym,
                        base_data,
                        lang_code,
                    )
            elif t.startswith(tuple(ETYMOLOGY_TITLES)):
                push_etym()
                wxr.wtp.start_subsection(None)
                if wxr.config.capture_etymologies:
                    # Trailing number in the title (e.g. "Etymology 2")
                    # becomes etymology_number.
                    m = re.search(r"\s(\d+)$", t)
                    if m:
                        etym_data["etymology_number"] = int(m.group(1))
                    parse_etymology(etym_data, node)
            elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
                data = select_data()
                parse_descendants(data, node)
            elif (
                t in PROTO_ROOT_DERIVED_TITLES
                and pos == "root"
                and is_reconstruction
                and wxr.config.capture_descendants
            ):
                data = select_data()
                parse_descendants(data, node, True)
            elif t == TRANSLATIONS_TITLE:
                data = select_data()
                parse_translations(data, node)
            elif t in INFLECTION_TITLES:
                parse_inflection(node, t, pos)
            else:
                # Strip a trailing section number ("Noun 2" -> "noun") before
                # looking the title up in the POS/linkage tables.
                lst = t.split()
                while len(lst) > 1 and lst[-1].isdigit():
                    lst = lst[:-1]
                t_no_number = " ".join(lst).lower()
                if t_no_number in POS_TITLES:
                    push_pos()
                    dt = POS_TITLES[t_no_number]  # type:ignore[literal-required]
                    pos = dt["pos"] or "MISSING_POS"
                    wxr.wtp.start_subsection(t)
                    if "debug" in dt:
                        wxr.wtp.debug(
                            "{} in section {}".format(dt["debug"], t),
                            sortid="page/2755",
                        )
                    if "warning" in dt:
                        wxr.wtp.warning(
                            "{} in section {}".format(dt["warning"], t),
                            sortid="page/2759",
                        )
                    if "error" in dt:
                        wxr.wtp.error(
                            "{} in section {}".format(dt["error"], t),
                            sortid="page/2763",
                        )
                    # Parse word senses for the part-of-speech
                    parse_part_of_speech(node, pos)
                    if "tags" in dt:
                        for pdata in pos_datas:
                            data_extend(pdata, "tags", dt["tags"])
                elif t_no_number in LINKAGE_TITLES:
                    rel = LINKAGE_TITLES[t_no_number]
                    data = select_data()
                    parse_linkage(data, rel, node)
                elif t_no_number == COMPOUNDS_TITLE:
                    data = select_data()
                    if wxr.config.capture_compounds:
                        parse_linkage(data, "derived", node)

            # XXX parse interesting templates also from other sections.  E.g.
            # {{Letter|...}} in ===See also===
            # Also <gallery>

            # Recurse to children of this node, processing subtitles therein
            stack.append(t)
            process_children(node, pos)
            stack.pop()

        # Attach collected soft-redirect targets: to the current POS data if
        # any exists, otherwise emit a standalone soft-redirect page entry.
        if len(redirect_list) > 0:
            if len(pos_data) > 0:
                pos_data["redirects"] = redirect_list
                if "pos" not in pos_data:
                    pos_data["pos"] = "soft-redirect"
            else:
                new_page_data = copy.deepcopy(base_data)
                new_page_data["redirects"] = redirect_list
                if "pos" not in new_page_data:
                    new_page_data["pos"] = "soft-redirect"
                new_page_data["senses"] = [{"tags": ["no-gloss"]}]
                page_datas.append(new_page_data)
3657 def extract_examples(
3658 others: list[WikiNode], sense_base: SenseData
3659 ) -> list[ExampleData]:
3660 """Parses through a list of definitions and quotes to find examples.
3661 Returns a list of example dicts to be added to sense data. Adds
3662 meta-data, mostly categories, into sense_base."""
3663 assert isinstance(others, list)
3664 examples: list[ExampleData] = []
3666 for sub in others:
3667 if not sub.sarg.endswith((":", "*")): 3667 ↛ 3668line 3667 didn't jump to line 3668 because the condition on line 3667 was never true
3668 continue
3669 for item in sub.children:
3670 if not isinstance(item, WikiNode): 3670 ↛ 3671line 3670 didn't jump to line 3671 because the condition on line 3670 was never true
3671 continue
3672 if item.kind != NodeKind.LIST_ITEM: 3672 ↛ 3673line 3672 didn't jump to line 3673 because the condition on line 3672 was never true
3673 continue
3674 usex_type = None
3675 example_template_args = []
3676 example_template_names = []
3677 taxons = set()
3679 # Bypass this function when parsing Chinese, Japanese and
3680 # quotation templates.
3681 new_example_lists = extract_example_list_item(
3682 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3683 )
3684 if len(new_example_lists) > 0: 3684 ↛ 3685line 3684 didn't jump to line 3685 because the condition on line 3684 was never true
3685 examples.extend(new_example_lists)
3686 continue
3688 def usex_template_fn(
3689 name: str, ht: TemplateArgs
3690 ) -> Optional[str]:
3691 nonlocal usex_type
3692 if is_panel_template(wxr, name):
3693 return ""
3694 if name in usex_templates:
3695 usex_type = "example"
3696 example_template_args.append(ht)
3697 example_template_names.append(name)
3698 elif name in quotation_templates:
3699 usex_type = "quotation"
3700 elif name in taxonomy_templates:
3701 taxons.update(ht.get(1, "").split())
3702 for prefix in template_linkages:
3703 if re.search(
3704 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3705 ):
3706 return ""
3707 return None
3709 # bookmark
3710 ruby: list[tuple[str, str]] = []
3711 contents = item.children
3712 if lang_code == "ja":
3713 # Capture ruby contents if this is a Japanese language
3714 # example.
3715 # print(contents)
3716 if ( 3716 ↛ 3721line 3716 didn't jump to line 3721
3717 contents
3718 and isinstance(contents, str)
3719 and re.match(r"\s*$", contents[0])
3720 ):
3721 contents = contents[1:]
3722 exp = wxr.wtp.parse(
3723 wxr.wtp.node_to_wikitext(contents),
3724 # post_template_fn=head_post_template_fn,
3725 expand_all=True,
3726 )
3727 rub, rest = extract_ruby(wxr, exp.children)
3728 if rub: 3728 ↛ 3732line 3728 didn't jump to line 3732 because the condition on line 3728 was always true
3729 for rtup in rub:
3730 ruby.append(rtup)
3731 contents = rest
3732 subtext = clean_node(
3733 wxr, sense_base, contents, template_fn=usex_template_fn
3734 )
3736 frozen_taxons = frozenset(taxons)
3737 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3739 # print(f"{subtext=}")
3740 subtext = re.sub(
3741 r"\s*\(please add an English "
3742 r"translation of this "
3743 r"(example|usage example|quote)\)",
3744 "",
3745 subtext,
3746 ).strip()
3747 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3748 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3749 # print("subtext:", repr(subtext))
3751 lines = subtext.splitlines()
3752 # print(lines)
3754 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3755 lines = list(
3756 x
3757 for x in lines
3758 if not re.match(
3759 r"(Synonyms: |Antonyms: |Hyponyms: |"
3760 r"Synonym: |Antonym: |Hyponym: |"
3761 r"Hypernyms: |Derived terms: |"
3762 r"Related terms: |"
3763 r"Hypernym: |Derived term: |"
3764 r"Coordinate terms:|"
3765 r"Related term: |"
3766 r"For more quotations using )",
3767 x,
3768 )
3769 )
3770 tr = ""
3771 ref = ""
3772 roman = ""
3773 # for line in lines:
3774 # print("LINE:", repr(line))
3775 # print(classify_desc(line))
3776 if len(lines) == 1 and lang_code != "en": 3776 ↛ 3777line 3776 didn't jump to line 3777 because the condition on line 3776 was never true
3777 parts = example_splitter_re.split(lines[0])
3778 if (
3779 len(parts) > 2
3780 and len(example_template_args) == 1
3781 and any(
3782 ("―" in s) or ("—" in s)
3783 for s in example_template_args[0].values()
3784 )
3785 ):
3786 if nparts := synch_splits_with_args(
3787 lines[0], example_template_args[0]
3788 ):
3789 parts = nparts
3790 if (
3791 len(example_template_args) == 1
3792 and "lit" in example_template_args[0]
3793 ):
3794 # ugly brute-force kludge in case there's a lit= arg
3795 literally = example_template_args[0].get("lit", "")
3796 if literally:
3797 literally = (
3798 " (literally, “"
3799 + clean_value(wxr, literally)
3800 + "”)"
3801 )
3802 else:
3803 literally = ""
3804 if (
3805 len(example_template_args) == 1
3806 and len(parts) == 2
3807 and len(example_template_args[0])
3808 - (
3809 # horrible kludge to ignore these arguments
3810 # when calculating how many there are
3811 sum(
3812 s in example_template_args[0]
3813 for s in (
3814 "lit", # generates text, but we handle it
3815 "inline",
3816 "noenum",
3817 "nocat",
3818 "sort",
3819 )
3820 )
3821 )
3822 == 3
3823 and clean_value(
3824 wxr, example_template_args[0].get(2, "")
3825 )
3826 == parts[0].strip()
3827 and clean_value(
3828 wxr,
3829 (
3830 example_template_args[0].get(3)
3831 or example_template_args[0].get("translation")
3832 or example_template_args[0].get("t", "")
3833 )
3834 + literally, # in case there's a lit= argument
3835 )
3836 == parts[1].strip()
3837 ):
3838 # {{exampletemplate|ex|Foo bar baz|English translation}}
3839 # is a pretty reliable 'heuristic', so we use it here
3840 # before the others. To be extra sure the template
3841 # doesn't do anything weird, we compare the arguments
3842 # and the output to each other.
3843 lines = [parts[0].strip()]
3844 tr = parts[1].strip()
3845 elif (
3846 len(parts) == 2
3847 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3848 ):
3849 # These other branches just do some simple heuristics w/
3850 # the expanded output of the template (if applicable).
3851 lines = [parts[0].strip()]
3852 tr = parts[1].strip()
3853 elif (
3854 len(parts) == 3
3855 and classify_desc2(parts[1])
3856 in ("romanization", "english")
3857 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3858 ):
3859 lines = [parts[0].strip()]
3860 roman = parts[1].strip()
3861 tr = parts[2].strip()
3862 else:
3863 parts = re.split(r"\s+-\s+", lines[0])
3864 if (
3865 len(parts) == 2
3866 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3867 ):
3868 lines = [parts[0].strip()]
3869 tr = parts[1].strip()
3870 elif len(lines) > 1:
3871 if any( 3871 ↛ 3874line 3871 didn't jump to line 3874 because the condition on line 3871 was never true
3872 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3873 ) and not (len(example_template_names) == 1):
3874 refs: list[str] = []
3875 for i in range(len(lines)):
3876 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]):
3877 break
3878 refs.append(lines[i].strip())
3879 if re.search(r"[]\d:)]\s*$", lines[i]):
3880 break
3881 ref = " ".join(refs)
3882 lines = lines[i + 1 :]
3883 if (
3884 lang_code != "en"
3885 and len(lines) >= 2
3886 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3887 ):
3888 i = len(lines) - 1
3889 while (
3890 i > 1
3891 and classify_desc2(lines[i - 1])
3892 in ENGLISH_TEXTS
3893 ):
3894 i -= 1
3895 tr = "\n".join(lines[i:])
3896 lines = lines[:i]
3897 if len(lines) >= 2:
3898 if classify_desc2(lines[-1]) == "romanization":
3899 roman = lines[-1].strip()
3900 lines = lines[:-1]
3902 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 3902 ↛ 3903line 3902 didn't jump to line 3903 because the condition on line 3902 was never true
3903 ref = lines[0]
3904 lines = lines[1:]
3905 elif lang_code != "en" and len(lines) == 2: 3905 ↛ 3906line 3905 didn't jump to line 3906 because the condition on line 3905 was never true
3906 cls1 = classify_desc2(lines[0])
3907 cls2 = classify_desc2(lines[1])
3908 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3909 tr = lines[1]
3910 lines = [lines[0]]
3911 elif cls1 in ENGLISH_TEXTS and cls2 != "english":
3912 tr = lines[0]
3913 lines = [lines[1]]
3914 elif (
3915 re.match(r"^[#*]*:+", lines[1])
3916 and classify_desc2(
3917 re.sub(r"^[#*:]+\s*", "", lines[1])
3918 )
3919 in ENGLISH_TEXTS
3920 ):
3921 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3922 lines = [lines[0]]
3923 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3924 # Both were classified as English, but
3925 # presumably one is not. Assume first is
3926 # non-English, as that seems more common.
3927 tr = lines[1]
3928 lines = [lines[0]]
3929 elif ( 3929 ↛ 3945line 3929 didn't jump to line 3945
3930 usex_type != "quotation"
3931 and lang_code != "en"
3932 and len(lines) == 3
3933 ):
3934 cls1 = classify_desc2(lines[0])
3935 cls2 = classify_desc2(lines[1])
3936 cls3 = classify_desc2(lines[2])
3937 if ( 3937 ↛ 3968line 3937 didn't jump to line 3968
3938 cls3 == "english"
3939 and cls2 in ("english", "romanization")
3940 and cls1 != "english"
3941 ):
3942 tr = lines[2].strip()
3943 roman = lines[1].strip()
3944 lines = [lines[0].strip()]
3945 elif (
3946 usex_type == "quotation"
3947 and lang_code != "en"
3948 and len(lines) > 2
3949 ):
3950 # for x in lines:
3951 # print(" LINE: {}: {}"
3952 # .format(classify_desc2(x), x))
3953 if re.match(r"^[#*]*:+\s*$", lines[1]):
3954 ref = lines[0]
3955 lines = lines[2:]
3956 cls1 = classify_desc2(lines[-1])
3957 if cls1 == "english":
3958 i = len(lines) - 1
3959 while (
3960 i > 1
3961 and classify_desc2(lines[i - 1])
3962 == ENGLISH_TEXTS
3963 ):
3964 i -= 1
3965 tr = "\n".join(lines[i:])
3966 lines = lines[:i]
3968 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3969 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3970 tr = re.sub(r"^[#*:]+\s*", "", tr)
3971 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3972 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3973 ref = re.sub(r"^[#*:]+\s*", "", ref)
3974 ref = re.sub(
3975 r", (volume |number |page )?“?"
3976 r"\(please specify ([^)]|\(s\))*\)”?|"
3977 ", text here$",
3978 "",
3979 ref,
3980 )
3981 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3982 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3983 subtext = "\n".join(x for x in lines if x)
3984 if not tr and lang_code != "en": 3984 ↛ 3985line 3984 didn't jump to line 3985 because the condition on line 3984 was never true
3985 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3986 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS:
3987 tr = m.group(2)
3988 subtext = subtext[: m.start()] + m.group(1)
3989 elif lines:
3990 parts = re.split(r"\s*[―—]+\s*", lines[0])
3991 if (
3992 len(parts) == 2
3993 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3994 ):
3995 subtext = parts[0].strip()
3996 tr = parts[1].strip()
3997 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3998 subtext = re.sub(
3999 r"(please add an English translation of "
4000 r"this (quote|usage example))",
4001 "",
4002 subtext,
4003 )
4004 subtext = re.sub(
4005 r"\s*→New International Version " "translation$",
4006 "",
4007 subtext,
4008 ) # e.g. pis/Tok Pisin (Bible)
4009 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
4010 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
4011 note = None
4012 m = re.match(r"^\(([^)]*)\):\s+", subtext)
4013 if ( 4013 ↛ 4021line 4013 didn't jump to line 4021
4014 m is not None
4015 and lang_code != "en"
4016 and (
4017 m.group(1).startswith("with ")
4018 or classify_desc2(m.group(1)) == "english"
4019 )
4020 ):
4021 note = m.group(1)
4022 subtext = subtext[m.end() :]
4023 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
4024 ref = re.sub(r",\s*→ISBN", "", ref)
4025 ref = ref.strip()
4026 if ref.endswith(":") or ref.endswith(","): 4026 ↛ 4027line 4026 didn't jump to line 4027 because the condition on line 4026 was never true
4027 ref = ref[:-1].strip()
4028 ref = re.sub(r"\s+,\s+", ", ", ref)
4029 ref = re.sub(r"\s+", " ", ref)
4030 if ref and not subtext: 4030 ↛ 4031line 4030 didn't jump to line 4031 because the condition on line 4030 was never true
4031 subtext = ref
4032 ref = ""
4033 if subtext: 4033 ↛ 3669line 4033 didn't jump to line 3669 because the condition on line 4033 was always true
4034 dt: ExampleData = {"text": subtext}
4035 if ref: 4035 ↛ 4036line 4035 didn't jump to line 4036 because the condition on line 4035 was never true
4036 dt["ref"] = ref
4037 if tr:
4038 dt["english"] = tr
4039 if usex_type: 4039 ↛ 4040line 4039 didn't jump to line 4040 because the condition on line 4039 was never true
4040 dt["type"] = usex_type
4041 if note: 4041 ↛ 4042line 4041 didn't jump to line 4042 because the condition on line 4041 was never true
4042 dt["note"] = note
4043 if roman:
4044 dt["roman"] = roman
4045 if ruby:
4046 dt["ruby"] = ruby
4047 examples.append(dt)
4049 return examples
4051 # Main code of parse_language()
4052 # Process the section
4053 stack.append(language)
4054 process_children(langnode, None)
4055 stack.pop()
4057 # Finalize word entires
4058 push_etym()
4059 ret = []
4060 for data in page_datas:
4061 merge_base(data, base_data)
4062 ret.append(data)
4064 # Copy all tags to word senses
4065 for data in ret:
4066 if "senses" not in data: 4066 ↛ 4067line 4066 didn't jump to line 4067 because the condition on line 4066 was never true
4067 continue
4068 # WordData should not have a 'tags' field, but if it does, it's
4069 # deleted and its contents removed and placed in each sense;
4070 # that's why the type ignores.
4071 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
4072 if "tags" in data: 4072 ↛ 4073line 4072 didn't jump to line 4073 because the condition on line 4072 was never true
4073 del data["tags"] # type: ignore[typeddict-item]
4074 for sense in data["senses"]:
4075 data_extend(sense, "tags", tags)
4077 return ret
def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Capture a {{wikipedia|...}} (or related) template into ``data``.

    Appends either ``pagename`` or ``langid:pagename`` to the "wikipedia"
    field.  When the template has no explicit page-name argument, the
    current page title is used, with a last-resort placeholder if even
    that is missing.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)
    langid = clean_node(wxr, data, ht.get("lang", ()))
    pagename = (
        clean_node(wxr, data, ht.get(1, ()))
        or wxr.wtp.title
        or "MISSING_PAGE_TITLE"
    )
    entry = langid + ":" + pagename if langid else pagename
    data_append(data, "wikipedia", entry)
def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Parses a template that occurs on the top-level in a page, before any
    language subtitles."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Top-level templates we recognize but currently just discard.
    # XXX several of these carry information worth capturing eventually:
    # "see also" (related words), "cardinalbox", "character info",
    # "commonscat" (link to Wikimedia Commons), and "wrongtitle" (should
    # replace the page title, e.g. for ⿰亻革家).
    discarded = (
        "reconstruction",
        "see also",
        "cardinalbox",
        "character info",
        "commonscat",
        "wrongtitle",
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        if name in discarded:
            return ""
        if name.lower() == "also":
            # XXX shows related words that might really have been the
            # intended word; capture them
            return ""
        if name == "wikidata":
            arg = clean_node(wxr, data, ht.get(1, ()))
            if arg.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", arg)
            return ""
        wxr.wtp.debug(
            "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
            sortid="page/2870",
        )
        return ""

    clean_node(wxr, None, [node], template_fn=top_template_fn)
def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Part-of-Speech -> Translation/Linkage. Also merge Etymology sections
    that are next to each other."""

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    # Known lowercase PoS names are in part_of_speech_map
    # Known lowercase linkage section names are in linkage_map

    npar = 4  # Number of capturing parentheses in the expression below
    chunks = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
    )

    out = [chunks[0]]
    prev_level = None
    level = None
    skip_level_title = False  # When combining etymology sections
    for i in range(1, len(chunks), npar + 1):
        left = chunks[i]
        right = chunks[i + npar - 1]
        body = chunks[i + npar]
        # remove Wikilinks in title
        title = re.sub(r"\]\]$", "", re.sub(r"^\[\[", "", chunks[i + 1]))
        prev_level = level
        level = len(left)
        if len(right) != level:
            wxr.wtp.debug(
                "subtitle has unbalanced levels: "
                "{!r} has {} on the left and {} on the right".format(
                    title, left, right
                ),
                sortid="page/2904",
            )
        lc = title.lower()
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    "subtitle has language name {} at level {}".format(
                        title, level
                    ),
                    sortid="page/2911",
                )
            level = 2
        elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
            if level > 3:
                wxr.wtp.debug(
                    "etymology section {} at level {}".format(title, level),
                    sortid="page/2917",
                )
            if prev_level == 3:
                # Two etymology (Glyph Origin + Etymology) sections
                # cheek-to-cheek: modify the title of the previous
                # ("Glyph Origin") section, in case we have a meaningful
                # title like "Etymology 1", and skip emitting a new title.
                skip_level_title = True
                out[-2] = "{}{}{}".format("=" * level, title, "=" * level)
            level = 3
        elif lc.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lc in POS_TITLES:
            level = 5
        elif lc == TRANSLATIONS_TITLE:
            level = 6
        elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
            level = 6
        elif lc in INFLECTION_TITLES:
            level = 6
        elif lc == DESCENDANTS_TITLE:
            level = 6
        elif title in PROTO_ROOT_DERIVED_TITLES:
            level = 6
        elif lc in IGNORED_TITLES:
            level = 6
        else:
            level = 6
        if skip_level_title:
            skip_level_title = False
        else:
            out.append("{}{}{}".format("=" * level, title, "=" * level))
        out.append(body)
        # print("=" * level, title)

    return "".join(out)
def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse one Wiktionary page and return a list of word-entry dicts.

    ``word`` is the page title and ``text`` its raw wikitext.  The page is
    normalized (stray include tags removed, heading levels fixed), parsed
    into a tree, and each top-level LEVEL2 node (one per language) is fed
    to parse_language().  Fields gathered from templates that appear
    before any language heading are merged into every resulting entry.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages. They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
    text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
    text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)

    # Fix up the subtitle hierarchy.  There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun.  Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    # from wikitextprocessor.parser import print_tree
    # print("PAGE PARSE:", print_tree(tree))

    # Fields collected from templates that occur before the first language
    # heading; these are later copied into every entry on the page.
    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for langnode in tree.children:
        if not isinstance(langnode, WikiNode):
            continue
        if langnode.kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, langnode, top_data)
            continue
        if langnode.kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if langnode.kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {langnode}", sortid="page/3014"
            )
            continue
        lang = clean_node(
            wxr, None, langnode.sarg if langnode.sarg else langnode.largs
        )
        lang_code = name_to_code(lang, "en")
        if lang_code == "":
            wxr.wtp.debug(
                f"unrecognized language name: {lang}", sortid="page/3019"
            )
        # Skip languages not selected for capture (empty capture set means
        # capture everything).
        if (
            wxr.config.capture_language_codes
            and lang_code not in wxr.config.capture_language_codes
        ):
            continue
        wxr.wtp.start_section(lang)

        # Collect all words from the page.
        # print(f"{langnode=}")
        datas = parse_language(wxr, langnode, lang, lang_code)

        # Propagate fields resulting from top-level templates to this
        # part-of-speech.
        for data in datas:
            if "lang" not in data:
                wxr.wtp.debug(
                    "internal error -- no lang in data: {}".format(data),
                    sortid="page/3034",
                )
                continue
            for k, v in top_data.items():
                assert isinstance(v, (list, tuple))
                data_extend(data, k, v)
            by_lang[data["lang"]].append(data)

    # XXX this code is clearly out of date.  There is no longer a
    # "conjugation" field.  FIX OR REMOVE.
    # Do some post-processing on the words.  For example, we may distribute
    # conjugation information to all the words.
    ret = []
    for lang, lang_datas in by_lang.items():
        ret.extend(lang_datas)

    for x in ret:
        # Record the original page title when the entry's headword differs
        # from it (e.g. "Unsupported titles/..." pages).
        if x["word"] != word:
            if word.startswith("Unsupported titles/"):
                wxr.wtp.debug(
                    f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
                    sortid="20231101/3578page.py",
                )
            else:
                wxr.wtp.debug(
                    f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'",
                    sortid="20231101/3582page.py",
                )
            x["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, x)  # type:ignore[arg-type]
    return ret
def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Split ``data["tags"]`` into validated and raw tags, recursively.

    Tags not found in ``valid_tags`` are moved into ``data["raw_tags"]``;
    the walk recurses into every list value whose elements are dicts.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return
    kept: list[str] = []
    rejected: list[str] = data.get("raw_tags", [])
    for key, value in data.items():
        if key == "tags":
            for tag in value:
                (kept if tag in valid_tags else rejected).append(tag)
        if isinstance(value, list) and len(value) > 0 and isinstance(
            value[0], dict
        ):
            for child in value:
                recursively_separate_raw_tags(wxr, child)
    if "tags" in data and not kept:
        del data["tags"]
    elif kept:
        data["tags"] = kept
    if rejected:
        data["raw_tags"] = rejected
def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect redirect target titles from soft-redirect templates.

    Returns ``True`` when ``template_node`` is a known soft-redirect
    template (zh-see, ja-see, ja-see-kango); any non-empty target page
    titles are appended to ``redirect_pages``.
    """
    tname = template_node.template_name
    if tname == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        target = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if target != "":
            redirect_pages.append(target)
        return True
    if tname in ("ja-see", "ja-see-kango"):
        # https://en.wiktionary.org/wiki/Template:ja-see
        # All positional parameters are redirect targets.
        for key, value in template_node.template_parameters.items():
            if isinstance(key, int):
                target = clean_node(wxr, None, value)
                if target != "":
                    redirect_pages.append(target)
        return True
    return False
def process_zh_forms_templates(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    base_data: WordData,
) -> None:
    """Extract word forms from a {{zh-forms}} template into ``base_data``.

    https://en.wiktionary.org/wiki/Template:zh-forms
    Parameters s/s2/... give Simplified forms, t2/t3/... Traditional
    forms, "alt" comma-separated alternative forms (optionally with
    pronunciation raw tags after "-"), and "lit" a literal meaning.
    """
    if "forms" not in base_data:
        base_data["forms"] = []
    forms = base_data["forms"]
    for p_name, p_value in template_node.template_parameters.items():
        if not isinstance(p_name, str):
            continue
        if re.fullmatch(r"s\d*", p_name):
            simp = clean_node(wxr, None, p_value)
            if len(simp) > 0:
                forms.append(
                    {"form": simp, "tags": ["Simplified Chinese"]}
                )
        elif re.fullmatch(r"t\d+", p_name):
            trad = clean_node(wxr, None, p_value)
            if len(trad) > 0:
                forms.append(
                    {"form": trad, "tags": ["Traditional Chinese"]}
                )
        elif p_name == "alt":
            for form_text in clean_node(wxr, None, p_value).split(","):
                pieces = form_text.split("-")
                alt_form: FormData = {"form": pieces[0]}
                if len(pieces) > 1:
                    # pronunciation data could be added after "-"
                    # see https://en.wiktionary.org/wiki/新婦
                    alt_form["raw_tags"] = pieces[1:]
                if len(alt_form["form"]) > 0:
                    forms.append(alt_form)
        elif p_name == "lit":
            lit = clean_node(wxr, None, p_value)
            if lit != "":
                base_data["literal_meaning"] = lit
    if len(base_data["forms"]) == 0:
        del base_data["forms"]