Coverage for src/wiktextract/extractor/en/page.py: 79% (1821 statements)
1# Code for parsing information from a single Wiktionary page.
2#
3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import copy
6import html
7import re
8from collections import defaultdict
9from functools import partial
10from typing import (
11 TYPE_CHECKING,
12 Any,
13 Iterable,
14 Literal,
15 Optional,
16 Set,
17 Union,
18 cast,
19)
21from mediawiki_langcodes import get_all_names, name_to_code
22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable
23from wikitextprocessor.parser import (
24 LEVEL_KIND_FLAGS,
25 GeneralNode,
26 HTMLNode,
27 LevelNode,
28 NodeKind,
29 TemplateNode,
30 WikiNode,
31)
33from ...clean import clean_template_args, clean_value
34from ...datautils import (
35 data_append,
36 data_extend,
37 ns_title_prefix_tuple,
38)
39from ...page import (
40 LEVEL_KINDS,
41 clean_node,
42 is_panel_template,
43 recursively_extract,
44)
45from ...tags import valid_tags
46from ...wxr_context import WiktextractContext
47from ...wxr_logging import logger
48from ..ruby import extract_ruby, parse_ruby
49from ..share import strip_nodes
50from .descendant import extract_descendant_section
51from .example import extract_example_list_item, extract_template_zh_x
52from .form_descriptions import (
53 classify_desc,
54 decode_tags,
55 distw,
56 parse_alt_or_inflection_of,
57 parse_sense_qualifier,
58 parse_word_head,
59)
60from .inflection import TableContext, parse_inflection_section
61from .info_templates import (
62 INFO_TEMPLATE_FUNCS,
63 parse_info_template_arguments,
64 parse_info_template_node,
65)
66from .linkages import (
67 extract_alt_form_section,
68 parse_linkage,
69)
70from .parts_of_speech import PARTS_OF_SPEECH
71from .section_titles import (
72 COMPOUNDS_TITLE,
73 DESCENDANTS_TITLE,
74 ETYMOLOGY_TITLES,
75 IGNORED_TITLES,
76 INFLECTION_TITLES,
77 LINKAGE_TITLES,
78 POS_TITLES,
79 PRONUNCIATION_TITLE,
80 PROTO_ROOT_DERIVED_TITLES,
81 TRANSLATIONS_TITLE,
82)
83from .translations import parse_translation_item_text
84from .type_utils import (
85 AttestationData,
86 ExampleData,
87 FormData,
88 LinkageData,
89 ReferenceData,
90 SenseData,
91 SoundData,
92 TemplateData,
93 WordData,
94)
95from .unsupported_titles import unsupported_title_map
97# When determining whether a string is 'english', classify_desc
98# might return 'taxonomic' which is English text 99% of the time.
99ENGLISH_TEXTS = ("english", "taxonomic")
101# Matches head tag
102HEAD_TAG_RE = re.compile(
103 r"^(head|Han char|arabic-noun|arabic-noun-form|"
104 r"hangul-symbol|syllable-hangul)$|"
105 + r"^(latin|"
106 + "|".join(lang_code for lang_code, *_ in get_all_names("en"))
107 + r")-("
108 + "|".join(
109 [
110 "abbr",
111 "adj",
112 "adjective",
113 "adjective form",
114 "adjective-form",
115 "adv",
116 "adverb",
117 "affix",
118 "animal command",
119 "art",
120 "article",
121 "aux",
122 "bound pronoun",
123 "bound-pronoun",
124 "Buyla",
125 "card num",
126 "card-num",
127 "cardinal",
128 "chunom",
129 "classifier",
130 "clitic",
131 "cls",
132 "cmene",
133 "cmavo",
134 "colloq-verb",
135 "colverbform",
136 "combining form",
137 "combining-form",
138 "comparative",
139 "con",
140 "concord",
141 "conj",
142 "conjunction",
143 "conjug",
144 "cont",
145 "contr",
146 "converb",
147 "daybox",
148 "decl",
149 "decl noun",
150 "def",
151 "dem",
152 "det",
153 "determ",
154 "Deva",
155 "ending",
156 "entry",
157 "form",
158 "fuhivla",
159 "gerund",
160 "gismu",
161 "hanja",
162 "hantu",
163 "hanzi",
164 "head",
165 "ideophone",
166 "idiom",
167 "inf",
168 "indef",
169 "infixed pronoun",
170 "infixed-pronoun",
171 "infl",
172 "inflection",
173 "initialism",
174 "int",
175 "interfix",
176 "interj",
177 "interjection",
178 "jyut",
179 "latin",
180 "letter",
181 "locative",
182 "lujvo",
183 "monthbox",
184 "mutverb",
185 "name",
186 "nisba",
187 "nom",
188 "noun",
189 "noun form",
190 "noun-form",
191 "noun plural",
192 "noun-plural",
193 "nounprefix",
194 "num",
195 "number",
196 "numeral",
197 "ord",
198 "ordinal",
199 "par",
200 "part",
201 "part form",
202 "part-form",
203 "participle",
204 "particle",
205 "past",
206 "past neg",
207 "past-neg",
208 "past participle",
209 "past-participle",
210 "perfect participle",
211 "perfect-participle",
212 "personal pronoun",
213 "personal-pronoun",
214 "pref",
215 "prefix",
216 "phrase",
217 "pinyin",
218 "plural noun",
219 "plural-noun",
220 "pos",
221 "poss-noun",
222 "post",
223 "postp",
224 "postposition",
225 "PP",
226 "pp",
227 "ppron",
228 "pred",
229 "predicative",
230 "prep",
231 "prep phrase",
232 "prep-phrase",
233 "preposition",
234 "present participle",
235 "present-participle",
236 "pron",
237 "prondem",
238 "pronindef",
239 "pronoun",
240 "prop",
241 "proper noun",
242 "proper-noun",
243 "proper noun form",
244 "proper-noun form",
245 "proper noun-form",
246 "proper-noun-form",
247 "prov",
248 "proverb",
249 "prpn",
250 "prpr",
251 "punctuation mark",
252 "punctuation-mark",
253 "regnoun",
254 "rel",
255 "rom",
256 "romanji",
257 "root",
258 "sign",
259 "suff",
260 "suffix",
261 "syllable",
262 "symbol",
263 "verb",
264 "verb form",
265 "verb-form",
266 "verbal noun",
267 "verbal-noun",
268 "verbnec",
269 "vform",
270 ]
271 )
272 + r")(-|/|\+|$)"
273)
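# Illustrative behaviour of HEAD_TAG_RE (assuming "fi" is among the codes
# produced by get_all_names("en")):
#   HEAD_TAG_RE.search("head")     -> matches (first alternative)
#   HEAD_TAG_RE.search("fi-noun")  -> matches (language code + part of speech)
#   HEAD_TAG_RE.search("tlb")      -> None (handled separately through
#                                    WORD_LEVEL_HEAD_TEMPLATES below)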
275# Head-templates causing problems (like newlines) that can be squashed into
276# an empty string in the template handler while saving their template
277# data for later.
278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"}
280FLOATING_TABLE_TEMPLATES: set[str] = {
281 # az-suffix-forms creates a style=floatright div that is otherwise
282 # deleted; if it is not pre-expanded, we can intercept the template,
283 # so we add this set into do_not_pre_expand and intercept the
284 # templates in parse_part_of_speech.
285 "az-suffix-forms",
286 "az-inf-p",
287 "kk-suffix-forms",
288 "ky-suffix-forms",
289 "tr-inf-p",
290 "tr-suffix-forms",
291 "tt-suffix-forms",
292 "uz-suffix-forms",
293}
294# These two sets should contain template names that should either always be
295# pre-expanded when *first* processing the tree, or not pre-expanded
296# so that the templates are left in place with their identifying
297# names intact for later filtering.
299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set()
300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES)
302# Additional templates to be expanded in the pre-expand phase
303ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
304 "multitrans",
305 "multitrans-nowiki",
306 "trans-top",
307 "trans-top-also",
308 "trans-bottom",
309 "checktrans-top",
310 "checktrans-bottom",
311 "col",
312 "col1",
313 "col2",
314 "col3",
315 "col4",
316 "col5",
317 "col1-u",
318 "col2-u",
319 "col3-u",
320 "col4-u",
321 "col5-u",
322 "check deprecated lang param usage",
323 "deprecated code",
324 "ru-verb-alt-ё",
325 "ru-noun-alt-ё",
326 "ru-adj-alt-ё",
327 "ru-proper noun-alt-ё",
328 "ru-pos-alt-ё",
329 "ru-alt-ё",
330 "inflection of",
331 "no deprecated lang param usage",
332 "transclude", # these produce sense entries (or other lists)
333 "tcl",
334}
336# Inverse linkage for those that have them
337linkage_inverses: dict[str, str] = {
338 # XXX this is not currently used, move to post-processing
339 "synonyms": "synonyms",
340 "hypernyms": "hyponyms",
341 "hyponyms": "hypernyms",
342 "holonyms": "meronyms",
343 "meronyms": "holonyms",
344 "derived": "derived_from",
345 "coordinate_terms": "coordinate_terms",
346 "troponyms": "hypernyms",
347 "antonyms": "antonyms",
348 "instances": "instance_of",
349 "related": "related",
350}
352# Templates that are used to form panels on pages and that
353# should be ignored in various positions
354PANEL_TEMPLATES: set[str] = {
355 "Character info",
356 "CJKV",
357 "French personal pronouns",
358 "French possessive adjectives",
359 "French possessive pronouns",
360 "Han etym",
361 "Japanese demonstratives",
362 "Latn-script",
363 "LDL",
364 "MW1913Abbr",
365 "Number-encoding",
366 "Nuttall",
367 "Spanish possessive adjectives",
368 "Spanish possessive pronouns",
369 "USRegionDisputed",
370 "Webster 1913",
371 "ase-rfr",
372 "attention",
373 "attn",
374 "beer",
375 "broken ref",
376 "ca-compass",
377 "character info",
378 "character info/var",
379 "checksense",
380 "compass-fi",
381 "copyvio suspected",
382 "delete",
383 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean
384 "etystub",
385 "examples",
386 "hu-corr",
387 "hu-suff-pron",
388 "interwiktionary",
389 "ja-kanjitab",
390 "ja-kt",
391 "ko-hanja-search",
392 "look",
393 "maintenance box",
394 "maintenance line",
395 "mediagenic terms",
396 "merge",
397 "missing template",
398 "morse links",
399 "move",
400 "multiple images",
401 "no inline",
402 "picdic",
403 "picdicimg",
404 "picdiclabel",
405 "polyominoes",
406 "predidential nomics",
407 "punctuation", # This actually gets pre-expanded
408 "reconstructed",
409 "request box",
410 "rf-sound example",
411 "rfaccents",
412 "rfap",
413 "rfaspect",
414 "rfc",
415 "rfc-auto",
416 "rfc-header",
417 "rfc-level",
418 "rfc-pron-n",
419 "rfc-sense",
420 "rfclarify",
421 "rfd",
422 "rfd-redundant",
423 "rfd-sense",
424 "rfdate",
425 "rfdatek",
426 "rfdef",
427 "rfe",
428 "rfe/dowork",
429 "rfex",
430 "rfexp",
431 "rfform",
432 "rfgender",
433 "rfi",
434 "rfinfl",
435 "rfm",
436 "rfm-sense",
437 "rfp",
438 "rfp-old",
439 "rfquote",
440 "rfquote-sense",
441 "rfquotek",
442 "rfref",
443 "rfscript",
444 "rft2",
445 "rftaxon",
446 "rftone",
447 "rftranslit",
448 "rfv",
449 "rfv-etym",
450 "rfv-pron",
451 "rfv-quote",
452 "rfv-sense",
453 "selfref",
454 "split",
455 "stroke order", # XXX consider capturing this?
456 "stub entry",
457 "t-needed",
458 "tbot entry",
459 "tea room",
460 "tea room sense",
461 # "ttbc", - XXX needed in at least on/Preposition/Translation page
462 "unblock",
463 "unsupportedpage",
464 "video frames",
465 "was wotd",
466 "wrongtitle",
467 "zh-forms",
468 "zh-hanzi-box",
469 "no entry",
470}
472# Template name prefixes used for language-specific panel templates (i.e.,
473# templates that create side boxes or notice boxes or that should generally
474# be ignored).
475PANEL_PREFIXES: set[str] = {
476 "list:compass points/",
477 "list:Gregorian calendar months/",
478 "RQ:",
479}
481# Templates used for wikipedia links.
482wikipedia_templates: set[str] = {
483 "wikipedia",
484 "slim-wikipedia",
485 "w",
486 "W",
487 "swp",
488 "wiki",
489 "Wikipedia",
490 "wtorw",
491}
492for x in PANEL_PREFIXES & wikipedia_templates:
493 print(
494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
495 x
496 )
497 )
499# Mapping from a template name (without language prefix) for the main word
500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
501# it could validly occur. This is used as just a sanity check to give
502# warnings about probably incorrect coding in Wiktionary.
503template_allowed_pos_map: dict[str, list[str]] = {
504 "abbr": ["abbrev"],
505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
506 "plural noun": ["noun", "name"],
507 "plural-noun": ["noun", "name"],
508 "proper noun": ["noun", "name"],
509 "proper-noun": ["name", "noun"],
510 "prop": ["name", "noun"],
511 "verb": ["verb", "phrase"],
512 "gerund": ["verb"],
513 "particle": ["adv", "particle"],
514 "adj": ["adj", "adj_noun"],
515 "pron": ["pron", "noun"],
516 "name": ["name", "noun"],
517 "adv": ["adv", "intj", "conj", "particle"],
518 "phrase": ["phrase", "prep_phrase"],
519 "noun phrase": ["phrase"],
520 "ordinal": ["num"],
521 "number": ["num"],
522 "pos": ["affix", "name", "num"],
523 "suffix": ["suffix", "affix"],
524 "character": ["character"],
525 "letter": ["character"],
526 "kanji": ["character"],
527 "cont": ["abbrev"],
528 "interj": ["intj"],
529 "con": ["conj"],
530 "part": ["particle"],
531 "prep": ["prep", "postp"],
532 "postp": ["postp"],
533 "misspelling": ["noun", "adj", "verb", "adv"],
534 "part-form": ["verb"],
535}
536for k, v in template_allowed_pos_map.items():
537 for x in v:
538 if x not in PARTS_OF_SPEECH:
539 print(
540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
541 "".format(x, k, v)
542 )
543 assert False
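# For example, the "noun" entry above says that a head template such as
# {{fi-noun}} is only expected under sections whose part of speech resolves
# to "noun", "abbrev", "pron", "name", "num" or "adj_noun"; anything else
# would merit a warning.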
546# Templates ignored during etymology extraction, i.e., these will not be listed
547# in the extracted etymology templates.
548ignored_etymology_templates: list[str] = [
549 "...",
550 "IPAchar",
551 "ipachar",
552 "ISBN",
553 "isValidPageName",
554 "redlink category",
555 "deprecated code",
556 "check deprecated lang param usage",
557 "para",
558 "p",
559 "cite",
560 "Cite news",
561 "Cite newsgroup",
562 "cite paper",
563 "cite MLLM 1976",
564 "cite journal",
565 "cite news/documentation",
566 "cite paper/documentation",
567 "cite video game",
568 "cite video game/documentation",
569 "cite newsgroup",
570 "cite newsgroup/documentation",
571 "cite web/documentation",
572 "cite news",
573 "Cite book",
574 "Cite-book",
575 "cite book",
576 "cite web",
577 "cite-usenet",
578 "cite-video/documentation",
579 "Cite-journal",
580 "rfe",
581 "catlangname",
582 "cln",
583 "langname-lite",
584 "no deprecated lang param usage",
585 "mention",
586 "m",
587 "m-self",
588 "link",
589 "l",
590 "ll",
591 "l-self",
592]
593# Regexp for matching ignored etymology template names. This adds certain
594# prefixes to the names listed above.
595ignored_etymology_templates_re = re.compile(
596 r"^((cite-|R:|RQ:).*|"
597 + r"|".join(re.escape(x) for x in ignored_etymology_templates)
598 + r")$"
599)
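# For example (illustrative): this regexp matches "cite-book",
# "R:Webster 1913" and "ISBN", but not "inh" or "der".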
601# Regexp for matching ignored descendants template names. Right now we just
602# copy the ignored etymology templates
603ignored_descendants_templates_re = ignored_etymology_templates_re
605# Set of template names that are used to define usage examples. If the usage
606# example contains one of these templates, then its type is set to
607# "example"
608usex_templates: set[str] = {
609 "afex",
610 "affixusex",
611 "co", # {{collocation}} acts like an example template, specifically for
612 # pairs of combinations of words that are more common than you'd
613 # expect to occur randomly; hlavní#Czech
614 "coi",
615 "collocation",
616 "el-example",
617 "el-x",
618 "example",
619 "examples",
620 "he-usex",
621 "he-x",
622 "hi-usex",
623 "hi-x",
624 "ja-usex-inline",
625 "ja-usex",
626 "ja-x",
627 "jbo-example",
628 "jbo-x",
629 "km-usex",
630 "km-x",
631 "ko-usex",
632 "ko-x",
633 "lo-usex",
634 "lo-x",
635 "ne-x",
636 "ne-usex",
637 "prefixusex",
638 "ryu-usex",
639 "ryu-x",
640 "shn-usex",
641 "shn-x",
642 "suffixusex",
643 "th-usex",
644 "th-x",
645 "ur-usex",
646 "ur-x",
647 "usex",
648 "usex-suffix",
649 "ux",
650 "uxi",
651}
653stop_head_at_these_templates: set[str] = {
654 "category",
655 "cat",
656 "topics",
657 "catlangname",
658 "c",
659 "C",
660 "top",
661 "cln",
662}
664# Set of template names that are used to define quotation examples. If the
665# usage example contains one of these templates, then its type is set to
666# "quotation".
667quotation_templates: set[str] = {
668 "collapse-quote",
669 "quote-av",
670 "quote-book",
671 "quote-GYLD",
672 "quote-hansard",
673 "quotei",
674 "quote-journal",
675 "quotelite",
676 "quote-mailing list",
677 "quote-meta",
678 "quote-newsgroup",
679 "quote-song",
680 "quote-text",
681 "quote",
682 "quote-us-patent",
683 "quote-video game",
684 "quote-web",
685 "quote-wikipedia",
686 "wikiquote",
687 "Wikiquote",
688 "Q",
689}
691taxonomy_templates = {
692 # argument 1 should be the taxonomic name, frex. "Lupus lupus"
693 "taxfmt",
694 "taxlink",
695 "taxlink2",
696 "taxlinknew",
697 "taxlook",
698}
700# Template names; this was extracted from template_linkage_mappings,
701# because the code using template_linkage_mappings was actually not used
702# (but not removed).
703template_linkages_to_ignore_in_examples: set[str] = {
704 "syn",
705 "synonyms",
706 "ant",
707 "antonyms",
708 "hyp",
709 "hyponyms",
710 "der",
711 "derived terms",
712 "coordinate terms",
713 "cot",
714 "rel",
715 "col",
716 "inline alt forms",
717 "alti",
718 "comeronyms",
719 "holonyms",
720 "holo",
721 "hypernyms",
722 "hyper",
723 "meronyms",
724 "mero",
725 "troponyms",
726 "perfectives",
727 "pf",
728 "imperfectives",
729 "impf",
730 "syndiff",
731 "synsee",
732 # not linkage nor example templates
733 "sense",
734 "s",
735 "color panel",
736 "colour panel",
737}
739# Maps template name used in a word sense to a linkage field that it adds.
740sense_linkage_templates: dict[str, str] = {
741 "syn": "synonyms",
742 "synonyms": "synonyms",
743 "synsee": "synonyms",
744 "syndiff": "synonyms",
745 "hyp": "hyponyms",
746 "hyponyms": "hyponyms",
747 "ant": "antonyms",
748 "antonyms": "antonyms",
749 "alti": "related",
750 "inline alt forms": "related",
751 "coordinate terms": "coordinate_terms",
752 "cot": "coordinate_terms",
753 "comeronyms": "related",
754 "holonyms": "holonyms",
755 "holo": "holonyms",
756 "hypernyms": "hypernyms",
757 "hyper": "hypernyms",
758 "meronyms": "meronyms",
759 "mero": "meronyms",
760 "troponyms": "troponyms",
761 "perfectives": "related",
762 "pf": "related",
763 "imperfectives": "related",
764 "impf": "related",
765 "parasynonyms": "synonyms",
766 "par": "synonyms",
767 "parasyn": "synonyms",
768 "nearsyn": "synonyms",
769 "near-syn": "synonyms",
770}
772sense_linkage_templates_tags: dict[str, list[str]] = {
773 "alti": ["alternative"],
774 "inline alt forms": ["alternative"],
775 "comeronyms": ["comeronym"],
776 "perfectives": ["perfective"],
777 "pf": ["perfective"],
778 "imperfectives": ["imperfective"],
779 "impf": ["imperfective"],
780}
783def decode_html_entities(v: Union[str, int]) -> str:
784 """Decodes HTML entities from a value, converting them to the respective
785 Unicode characters/strings."""
786 if isinstance(v, int):
787 # I changed this to return str(v) instead of v = str(v),
788 # but there might have been the intention to have more logic
789 # here. html.unescape would not do anything special with an integer,
790 # it needs html escape symbols (&xx;).
791 return str(v)
792 return html.unescape(v)
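# For example: decode_html_entities("R&amp;D") returns "R&D", and
# decode_html_entities(42) returns "42".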
795def parse_sense_linkage(
796 wxr: WiktextractContext,
797 data: SenseData,
798 name: str,
799 ht: TemplateArgs,
800 pos: str,
801) -> None:
802 """Parses a linkage (synonym, etc) specified in a word sense."""
803 assert isinstance(wxr, WiktextractContext)
804 assert isinstance(data, dict)
805 assert isinstance(name, str)
806 assert isinstance(ht, dict)
807 field = sense_linkage_templates[name]
808 field_tags = sense_linkage_templates_tags.get(name, [])
809 for i in range(2, 20):
810 if i not in ht:
811 break
812 w = clean_node(wxr, data, ht[i])
813 if "#" in w:
814 w = w[: w.index("#")]
815 if w in ["", "<"]: # used in "hypernyms" template
816 continue
817 is_thesaurus = False
818 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
819 if w.startswith(alias):
820 is_thesaurus = True
821 w = w[len(alias) :]
822 if w != wxr.wtp.title:
823 from ...thesaurus import search_thesaurus
825 lang_code = clean_node(wxr, None, ht.get(1, ""))
826 for t_data in search_thesaurus(
827 wxr.thesaurus_db_conn, # type: ignore
828 w,
829 lang_code,
830 pos,
831 "synonyms", # GH issue #1570
832 ):
833 l_data: LinkageData = {
834 "word": t_data.term,
835 "source": "Thesaurus:" + w,
836 }
837 if len(t_data.tags) > 0:
838 l_data["tags"] = t_data.tags
839 if len(t_data.raw_tags) > 0:
840 l_data["raw_tags"] = t_data.raw_tags
841 data_append(data, field, l_data)
842 break
843 if is_thesaurus:
844 continue
845 tags: list[str] = []
846 topics: list[str] = []
847 english: Optional[str] = None
848 # Try to find qualifiers for this synonym
849 q = ht.get("q{}".format(i - 1))
850 if q:
851 cls = classify_desc(q)
852 if cls == "tags":
853 tagsets1, topics1 = decode_tags(q)
854 for ts in tagsets1:
855 tags.extend(ts)
856 topics.extend(topics1)
857 elif cls == "english":
858 if english:
859 english += "; " + q
860 else:
861 english = q
862 # Try to find English translation for this synonym
863 t = ht.get("t{}".format(i - 1))
864 if t:
865 if english:
866 english += "; " + t
867 else:
868 english = t
870 # See if the linkage contains a parenthesized alt
871 alt = None
872 m = re.search(r"\(([^)]+)\)$", w)
873 if m:
874 w = w[: m.start()].strip()
875 alt = m.group(1)
877 dt = {"word": w}
878 if field_tags:
879 data_extend(dt, "tags", field_tags)
880 if tags:
881 data_extend(dt, "tags", tags)
882 if topics:
883 data_extend(dt, "topics", topics)
884 if english:
885 dt["english"] = english # DEPRECATED for "translation"
886 dt["translation"] = english
887 if alt:
888 dt["alt"] = alt
889 data_append(data, field, dt)
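# Illustrative call (hypothetical arguments): for a sense line containing
# {{syn|en|automobile|motorcar|q1=formal}}, the template handler would call
# parse_sense_linkage(wxr, data, "syn",
#     {1: "en", 2: "automobile", 3: "motorcar", "q1": "formal"}, "noun"),
# appending entries like {"word": "automobile", "tags": ["formal"]} and
# {"word": "motorcar"} to data["synonyms"].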
892EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
893example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
894captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")")
897def synch_splits_with_args(
898 line: str, targs: TemplateArgs
899) -> Optional[list[str]]:
900 """If it looks like there's something weird with how a line of example
901 text has been split, this function will do the splitting after counting
902 occurrences of the splitting regex inside the two main template arguments
903 containing the string data for the original language example and the
904 English translations.
905 """
906 # Previously, we split without capturing groups, but here we want to
907 # keep the original splitting hyphen regex intact.
908 fparts = captured_splitters_re.split(line)
909 new_parts = []
910 # ["First", " – ", "second", " – ", "third..."] from OL argument
911 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, ""))))
912 new_parts.append("".join(fparts[:first]))
913 # Translation argument
914 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
915 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above.
916 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg)))
917 new_parts.append("".join(fparts[first + 1 : second]))
919 if all(new_parts): # no empty strings from the above spaghetti
920 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens
921 return new_parts
922 else:
923 return None
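# Illustrative example (hypothetical values): if the cleaned line is
# "ichi ― ni ― one ― two" while the template's original-language argument
# (targs[2]) was "ichi ― ni" and its translation argument was "one ― two",
# a naive split would give four pieces; this function instead returns
# ["ichi ― ni", "one ― two"].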
926QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*"
927QUALIFIERS_RE = re.compile(QUALIFIERS)
928# (...): ... or (...(...)...): ...
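# For example: QUALIFIERS_RE.match("(transitive, informal): to snatch")
# captures "transitive, informal"; one level of nested parentheses is
# allowed, so "(lit. (word for word)): ..." captures "lit. (word for word)".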
931def parse_language(
932 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str
933) -> list[WordData]:
934 """Iterates over the text of the page, returning words (parts-of-speech)
935 defined on the page one at a time. (Individual word senses for the
936 same part-of-speech are typically encoded in the same entry.)"""
937 # imported here to avoid circular import
938 from .pronunciation import parse_pronunciation
940 assert isinstance(wxr, WiktextractContext)
941 assert isinstance(langnode, WikiNode)
942 assert isinstance(language, str)
943 assert isinstance(lang_code, str)
944 # print("parse_language", language)
946 is_reconstruction = False
947 word: str = wxr.wtp.title # type: ignore[assignment]
948 unsupported_prefix = "Unsupported titles/"
949 if word.startswith(unsupported_prefix):
950 w = word[len(unsupported_prefix) :]
951 if w in unsupported_title_map:
952 word = unsupported_title_map[w]
953 else:
954 wxr.wtp.error(
955 "Unimplemented unsupported title: {}".format(word),
956 sortid="page/870",
957 )
958 word = w
959 elif word.startswith("Reconstruction:"):
960 word = word[word.find("/") + 1 :]
961 is_reconstruction = True
963 base_data: WordData = {
964 "word": word,
965 "lang": language,
966 "lang_code": lang_code,
967 }
968 if is_reconstruction:
969 data_append(base_data, "tags", "reconstruction")
970 sense_data: SenseData = {}
971 pos_data: WordData = {} # For a current part-of-speech
972 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between
973 etym_data: WordData = {} # For one etymology
974 sense_datas: list[SenseData] = []
975 sense_ordinal = 0 # The recursive sense parsing messes up the ordering
976 # Never reset, do not use as data
977 level_four_datas: list[WordData] = []
978 etym_datas: list[WordData] = []
979 page_datas: list[WordData] = []
980 have_etym = False
981 inside_level_four = False # This is for checking if the etymology section
982 # or article has a Pronunciation section, for Chinese mostly; because
983 # Chinese articles can have three level three sections (two etymology
984 # sections and pronunciation sections) one after another, we need a kludge
985 # to better keep track of whether we're in a normal "etym" or inside a
986 # "level four" (which is what we've turned the level three Pron sections
987 # into in fix_subtitle_hierarchy()); all other sections are demoted by
988 # a step.
989 stack: list[str] = [] # names of items on the "stack"
991 def merge_base(data: WordData, base: WordData) -> None:
992 for k, v in base.items():
993 # Copy the value to ensure that we don't share lists or
994 # dicts between structures (even nested ones).
995 v = copy.deepcopy(v)
996 if k not in data:
997 # The list was copied above, so this will not create shared ref
998 data[k] = v # type: ignore[literal-required]
999 continue
1000 if data[k] == v: # type: ignore[literal-required]
1001 continue
1002 if (
1003 isinstance(data[k], (list, tuple)) # type: ignore[literal-required]
1004 or isinstance(
1005 v,
1006 (list, tuple), # Should this be "and"?
1007 )
1008 ):
1009 data[k] = list(data[k]) + list(v) # type: ignore
1010 elif data[k] != v: # type: ignore[literal-required]
1011 wxr.wtp.warning(
1012 "conflicting values for {} in merge_base: "
1013 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required]
1014 sortid="page/904",
1015 )
1017 def complementary_pop(pron: SoundData, key: str) -> SoundData:
1018 """Remove unnecessary keys from dict values
1019 in a list comprehension..."""
1020 if key in pron:
1021 pron.pop(key) # type: ignore
1022 return pron
1024 # If the result has sounds, eliminate sounds that have a prefix that
1025 # does not match "word" or one of "forms"
1026 if "sounds" in data and "word" in data:
1027 accepted = [data["word"]]
1028 accepted.extend(f["form"] for f in data.get("forms", dict()))
1029 data["sounds"] = list(
1030 s
1031 for s in data["sounds"]
1032 if "form" not in s or s["form"] in accepted
1033 )
1034 # If the result has sounds, eliminate sounds that have a pos that
1035 # does not match "pos"
1036 if "sounds" in data and "pos" in data:
1037 data["sounds"] = list(
1038 complementary_pop(s, "pos")
1039 for s in data["sounds"]
1040 # "pos" is not a field of SoundData, correctly, so we're
1041 # removing it here. It's a kludge on a kludge on a kludge.
1042 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item]
1043 )
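    # Illustrative merge (hypothetical values): merging base
    # {"lang": "Finnish", "tags": ["reconstruction"]} into
    # {"lang": "Finnish", "tags": ["no-gloss"]} leaves "lang" untouched and
    # concatenates the lists into ["no-gloss", "reconstruction"]; conflicting
    # scalar values are left as they are in `data` and only logged as a
    # warning.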
1045 def push_sense(sorting_ordinal: int | None = None) -> bool:
1046 """Starts collecting data for a new word sense. This returns True
1047 if a sense was added."""
1048 nonlocal sense_data
1049 if sorting_ordinal is None:
1050 sorting_ordinal = sense_ordinal
1051 tags = sense_data.get("tags", ())
1052 if (
1053 not sense_data.get("glosses")
1054 and "translation-hub" not in tags
1055 and "no-gloss" not in tags
1056 ):
1057 return False
1059 if (
1060 (
1061 "participle" in sense_data.get("tags", ())
1062 or "infinitive" in sense_data.get("tags", ())
1063 )
1064 and "alt_of" not in sense_data
1065 and "form_of" not in sense_data
1066 and "etymology_text" in etym_data
1067 and etym_data["etymology_text"] != ""
1068 ):
1069 etym = etym_data["etymology_text"]
1070 etym = etym.split(". ")[0]
1071 ret = parse_alt_or_inflection_of(wxr, etym, set())
1072 if ret is not None:
1073 tags, lst = ret
1074 assert isinstance(lst, (list, tuple))
1075 if "form-of" in tags:
1076 data_extend(sense_data, "form_of", lst)
1077 data_extend(sense_data, "tags", tags)
1078 elif "alt-of" in tags:
1079 data_extend(sense_data, "alt_of", lst)
1080 data_extend(sense_data, "tags", tags)
1082 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(
1083 "tags", ()
1084 ):
1085 data_append(sense_data, "tags", "no-gloss")
1087 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore
1088 sense_datas.append(sense_data)
1089 sense_data = {}
1090 return True
1092 def push_pos(sorting_ordinal: int | None = None) -> None:
1093 """Starts collecting data for a new part-of-speech."""
1094 nonlocal pos_data
1095 nonlocal sense_datas
1096 push_sense(sorting_ordinal)
1097 if wxr.wtp.subsection:
1098 data: WordData = {"senses": sense_datas}
1099 merge_base(data, pos_data)
1100 level_four_datas.append(data)
1101 pos_data = {}
1102 sense_datas = []
1103 wxr.wtp.start_subsection(None)
1105 def push_level_four_section(clear_sound_data: bool) -> None:
1106 """Starts collecting data for a new level four section, which
1107 is usually virtual and empty, unless the article has Chinese
1108 'Pronunciation' sections that are etymology-section-like but
1109 under etymology, and at the same level in the source. We modify
1110 the source to demote Pronunciation sections like that to level
1111 4, and other sections one step lower."""
1112 nonlocal level_four_data
1113 nonlocal level_four_datas
1114 nonlocal etym_datas
1115 push_pos()
1116 # print(f"======\n{etym_data=}")
1117 # print(f"======\n{etym_datas=}")
1118 # print(f"======\n{level_four_data=}")
1119 # print(f"======\n{level_four_datas=}")
1120 for data in level_four_datas:
1121 merge_base(data, level_four_data)
1122 etym_datas.append(data)
1123 for data in etym_datas:
1124 merge_base(data, etym_data)
1125 page_datas.append(data)
1126 if clear_sound_data:
1127 level_four_data = {}
1128 level_four_datas = []
1129 etym_datas = []
1131 def push_etym() -> None:
1132 """Starts collecting data for a new etymology."""
1133 nonlocal etym_data
1134 nonlocal etym_datas
1135 nonlocal have_etym
1136 nonlocal inside_level_four
1137 have_etym = True
1138 push_level_four_section(False)
1139 inside_level_four = False
1140 # the etymology section could be under a pronunciation section
1141 etym_data = (
1142 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {}
1143 )
1145 def select_data() -> WordData:
1146 """Selects where to store data (pos or etym) based on whether we
1147 are inside a pos (part-of-speech)."""
1148 # print(f"{wxr.wtp.subsection=}")
1149 # print(f"{stack=}")
1150 if wxr.wtp.subsection is not None:
1151 return pos_data
1152 if inside_level_four:
1153 return level_four_data
1154 if stack[-1] == language:
1155 return base_data
1156 return etym_data
1158 term_label_templates: list[TemplateData] = []
1160 def head_post_template_fn(
1161 name: str, ht: TemplateArgs, expansion: str
1162 ) -> Optional[str]:
1163 """Handles special templates in the head section of a word. Head
1164 section is the text after part-of-speech subtitle and before word
1165 sense list. Typically it generates the bold line for the word, but
1166 may also contain other useful information that often ends up in
1167 side boxes. We want to capture some of that additional information."""
1168 # print("HEAD_POST_TEMPLATE_FN", name, ht)
1169 if is_panel_template(wxr, name):
1170 # Completely ignore these templates (not even recorded in
1171 # head_templates)
1172 return ""
1173 if name == "head":
1174 # XXX are these also captured in forms? Should this special case
1175 # be removed?
1176 t = ht.get(2, "")
1177 if t == "pinyin":
1178 data_append(pos_data, "tags", "Pinyin")
1179 elif t == "romanization":
1180 data_append(pos_data, "tags", "romanization")
1181 if (
1182 HEAD_TAG_RE.search(name) is not None
1183 or name in WORD_LEVEL_HEAD_TEMPLATES
1184 ):
1185 args_ht = clean_template_args(wxr, ht)
1186 cleaned_expansion = clean_node(wxr, None, expansion)
1187 dt: TemplateData = {
1188 "name": name,
1189 "args": args_ht,
1190 "expansion": cleaned_expansion,
1191 }
1192 data_append(pos_data, "head_templates", dt)
1193 if name in WORD_LEVEL_HEAD_TEMPLATES:
1194 term_label_templates.append(dt)
1195 # Squash these, their tags are applied to the whole word,
1196 # and some cause problems like "term-label"
1197 return ""
1199 # The following are both captured in head_templates and parsed
1200 # separately
1202 if name in wikipedia_templates:
1203 # Note: various places expect to have content from wikipedia
1204 # templates, so cannot convert this to empty
1205 parse_wikipedia_template(wxr, pos_data, ht)
1206 return None
1208 if name == "number box":
1209 # XXX extract numeric value?
1210 return ""
1211 if name == "enum":
1212 # XXX extract?
1213 return ""
1214 if name == "cardinalbox":
1215 # XXX extract similar to enum?
1216 # XXX this can also occur in top-level under language
1217 return ""
1218 if name == "Han simplified forms":
1219 # XXX extract?
1220 return ""
1221 # if name == "ja-kanji forms":
1222 # # XXX extract?
1223 # return ""
1224 # if name == "vi-readings":
1225 # # XXX extract?
1226 # return ""
1227 # if name == "ja-kanji":
1228 # # XXX extract?
1229 # return ""
1230 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":
1231 # XXX extract?
1232 return ""
1234 return None
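    # Illustrative behaviour (hypothetical input): for {{tlb|en|slang}} this
    # returns "" so the label does not leak into the head text, while the
    # template is recorded in pos_data["head_templates"] and in
    # term_label_templates for later decoding into sense tags.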
1236 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
1237 """Parses the subsection for a part-of-speech under a language on
1238 a page."""
1239 assert isinstance(posnode, WikiNode)
1240 assert isinstance(pos, str)
1241 # print("parse_part_of_speech", pos)
1242 pos_data["pos"] = pos
1243 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists
1244 lists: list[list[WikiNode]] = [[]] # list of lists
1245 first_para = True
1246 first_head_tmplt = True
1247 collecting_head = True
1248 start_of_paragraph = True
1250 # XXX extract templates from posnode with recursively_extract
1251 # that break stuff, like ja-kanji or az-suffix-form.
1252 # Do the extraction with a list of template names, combined from
1253 # different lists, then separate out them into different lists
1254 # that are handled at different points of the POS section.
1255 # First, extract az-suffix-form, put it in `inflection`,
1256 # and parse `inflection`'s content when appropriate later.
1257 # The contents of az-suffix-form (and ja-kanji) that generate
1258 # divs with "floatright" in their style gets deleted by
1259 # clean_value, so templates that slip through from here won't
1260 # break anything.
1261 # XXX bookmark
1262 # print("===================")
1263 # print(posnode.children)
1265 floaters, poschildren = recursively_extract(
1266 posnode.children,
1267 lambda x: (
1268 isinstance(x, WikiNode)
1269 and (
1270 (
1271 isinstance(x, TemplateNode)
1272 and x.template_name in FLOATING_TABLE_TEMPLATES
1273 )
1274 or (
1275 x.kind == NodeKind.LINK
1276 # Need to check for stringiness because some links are
1277 # broken; for example, if a template is missing an
1278 # argument, a link might look like `[[{{{1}}}...]]`
1279 and len(x.largs) > 0
1280 and len(x.largs[0]) > 0
1281 and isinstance(x.largs[0][0], str)
1282 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr]
1283 )
1284 )
1285 ),
1286 )
1287 tempnode = WikiNode(NodeKind.LEVEL6, 0)
1288 tempnode.largs = [["Inflection"]]
1289 tempnode.children = floaters
1290 parse_inflection(tempnode, "Floating Div", pos)
1291 # print(poschildren)
1292 # XXX new above
1294 if not poschildren:
1295 if not floaters:
1296 wxr.wtp.debug(
1297 "PoS section without contents",
1298 sortid="en/page/1051/20230612",
1299 )
1300 else:
1301 wxr.wtp.debug(
1302 "PoS section without contents except for a floating table",
1303 sortid="en/page/1056/20230612",
1304 )
1305 return
1307 for node in poschildren:
1308 if isinstance(node, str):
1309 for m in re.finditer(r"\n+|[^\n]+", node):
1310 p = m.group(0)
1311 if p.startswith("\n\n") and pre:
1312 first_para = False
1313 start_of_paragraph = True
1314 break
1315 if p and collecting_head:
1316 pre[-1].append(p)
1317 continue
1318 assert isinstance(node, WikiNode)
1319 kind = node.kind
1320 if kind == NodeKind.LIST:
1321 lists[-1].append(node)
1322 collecting_head = False
1323 start_of_paragraph = True
1324 continue
1325 elif kind in LEVEL_KINDS:
1326 # Stop parsing section if encountering any kind of
1327 # level header (like ===Noun=== or ====Further Reading====).
1328 # At a quick glance, this should be the default behavior,
1329 # but if some kinds of source articles have sub-sub-sections
1330 # that should be parsed XXX it should be handled by changing
1331 # this break.
1332 break
1333 elif collecting_head and kind == NodeKind.LINK:
1334 # We might collect relevant links as they are often pictures
1335 # relating to the word
1336 if len(node.largs[0]) >= 1 and isinstance(
1337 node.largs[0][0], str
1338 ):
1339 if node.largs[0][0].startswith(
1340 ns_title_prefix_tuple(wxr, "Category")
1341 ):
1342 # [[Category:...]]
1343 # We're at the end of the file, probably, so stop
1344 # here. Otherwise the head will get garbage.
1345 break
1346 if node.largs[0][0].startswith(
1347 ns_title_prefix_tuple(wxr, "File")
1348 ):
1349 # Skips file links
1350 continue
1351 start_of_paragraph = False
1352 pre[-1].extend(node.largs[-1])
1353 elif kind == NodeKind.HTML:
1354 if node.sarg == "br":
1355 if pre[-1]:
1356 pre.append([]) # Switch to next head
1357 lists.append([]) # Lists parallels pre
1358 collecting_head = True
1359 start_of_paragraph = True
1360 elif collecting_head and node.sarg not in (
1361 "gallery",
1362 "ref",
1363 "cite",
1364 "caption",
1365 ):
1366 start_of_paragraph = False
1367 pre[-1].append(node)
1368 else:
1369 start_of_paragraph = False
1370 elif isinstance(node, TemplateNode):
1371 # XXX Insert code here that disambiguates between
1372 # templates that generate word heads and templates
1373 # that don't.
1374 # There's head_tag_re that seems like a regex meant
1375 # to identify head templates. Too bad it's None.
1377 # ignore {{category}}, {{cat}}... etc.
1378 if node.template_name in stop_head_at_these_templates:
1379 # we've reached a template that should be at the end,
1380 continue
1382 # skip these templates; panel_templates is already used
1383 # to skip certain templates else, but it also applies to
1384 # head parsing quite well.
1385 # node.largs[0][0] should always be str, but can't type-check
1386 # that.
1387 if is_panel_template(wxr, node.template_name):
1388 continue
1389 # skip these templates
1390 # if node.largs[0][0] in skip_these_templates_in_head:
1391 # first_head_tmplt = False # no first_head_tmplt at all
1392 # start_of_paragraph = False
1393 # continue
1395 if first_head_tmplt and pre[-1]:
1396 first_head_tmplt = False
1397 start_of_paragraph = False
1398 pre[-1].append(node)
1399 elif pre[-1] and start_of_paragraph:
1400 pre.append([]) # Switch to the next head
1401 lists.append([]) # lists parallel pre
1402 collecting_head = True
1403 start_of_paragraph = False
1404 pre[-1].append(node)
1405 else:
1406 pre[-1].append(node)
1407 elif first_para:
1408 start_of_paragraph = False
1409 if collecting_head:
1410 pre[-1].append(node)
1411 # XXX use template_fn in clean_node to check that the head macro
1412 # is compatible with the current part-of-speech and generate warning
1413 # if not. Use template_allowed_pos_map.
1415 # Clean up empty pairs, and fix messes with extra newlines that
1416 # separate templates that are followed by lists wiktextract issue #314
1418 cleaned_pre: list[list[Union[str, WikiNode]]] = []
1419 cleaned_lists: list[list[WikiNode]] = []
1420 pairless_pre_index = None
1422 for pre1, ls in zip(pre, lists):
1423 if pre1 and not ls:
1424 pairless_pre_index = len(cleaned_pre)
1425 if not pre1 and not ls:
1426 # skip [] + []
1427 continue
1428 if not ls and all(
1429 (isinstance(x, str) and not x.strip()) for x in pre1
1430 ):
1431 # skip ["\n", " "] + []
1432 continue
1433 if ls and not pre1:
1434 if pairless_pre_index is not None:
1435 cleaned_lists[pairless_pre_index] = ls
1436 pairless_pre_index = None
1437 continue
1438 cleaned_pre.append(pre1)
1439 cleaned_lists.append(ls)
1441 pre = cleaned_pre
1442 lists = cleaned_lists
1444 there_are_many_heads = len(pre) > 1
1445 header_tags: list[str] = []
1446 header_topics: list[str] = []
1447 previous_head_had_list = False
1449 if not any(g for g in lists):
1450 process_gloss_without_list(
1451 poschildren, pos, pos_data, header_tags, header_topics
1452 )
1453 else:
1454 for i, (pre1, ls) in enumerate(zip(pre, lists)):
1455 # if len(ls) == 0:
1456 # # don't have gloss list
1457 # # XXX add code here to filter out 'garbage', like text
1458 # # that isn't a head template or head.
1459 # continue
1461 if all(not sl for sl in lists[i:]):
1462 if i == 0:
1463 if isinstance(node, str):
1464 wxr.wtp.debug(
1465 "first head without list of senses,"
1466 "string: '{}[...]', {}/{}".format(
1467 node[:20], word, language
1468 ),
1469 sortid="page/1689/20221215",
1470 )
1471 if isinstance(node, WikiNode):
1472 if node.largs and node.largs[0][0] in [
1473 "Han char",
1474 ]:
1475 # just ignore these templates
1476 pass
1477 else:
1478 wxr.wtp.debug(
1479 "first head without "
1480 "list of senses, "
1481 "template node "
1482 "{}, {}/{}".format(
1483 node.largs, word, language
1484 ),
1485 sortid="page/1694/20221215",
1486 )
1487 else:
1488 wxr.wtp.debug(
1489 "first head without list of senses, "
1490 "{}/{}".format(word, language),
1491 sortid="page/1700/20221215",
1492 )
1493 # no break here so that the first head always
1494 # gets processed.
1495 else:
1496 if isinstance(node, str):
1497 wxr.wtp.debug(
1498 "later head without list of senses,"
1499 "string: '{}[...]', {}/{}".format(
1500 node[:20], word, language
1501 ),
1502 sortid="page/1708/20221215",
1503 )
1504 if isinstance(node, WikiNode):
1505 wxr.wtp.debug(
1506 "later head without list of senses,"
1507 "template node "
1508 "{}, {}/{}".format(
1509 node.sarg if node.sarg else node.largs,
1510 word,
1511 language,
1512 ),
1513 sortid="page/1713/20221215",
1514 )
1515 else:
1516 wxr.wtp.debug(
1517 "later head without list of senses, "
1518 "{}/{}".format(word, language),
1519 sortid="page/1719/20221215",
1520 )
1521 break
1522 head_group = i + 1 if there_are_many_heads else None
1523 # print("parse_part_of_speech: {}: {}: pre={}"
1524 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1))
1526 if previous_head_had_list:
1527 # We use a boolean flag here because we want to be able to
1528 # let the header_tags data pass through after the loop
1529 # is over without accidentally emptying it, if there are
1530 # no pos_datas and we need a dummy data.
1531 header_tags.clear()
1532 header_topics.clear()
1534 process_gloss_header(
1535 pre1, pos, head_group, pos_data, header_tags, header_topics
1536 )
1537 for ln in ls:
1538 # Parse each list associated with this head.
1539 for node in ln.children:
1540 # Parse nodes in l.children recursively.
1541 # The recursion function uses push_sense() to
1542 # add stuff into sense_datas, and returns True or
1543 # False if something is added, which bubbles upward.
1544 # If the bubble is "True", then higher levels of
1545 # the recursion will not push_sense(), because
1546 # the data is already pushed into a sub-gloss
1547 # downstream, unless the higher level has examples
1548 # that need to be put somewhere.
1549 common_data: SenseData = {
1550 "tags": list(header_tags),
1551 "topics": list(header_topics),
1552 }
1553 if head_group:
1554 common_data["head_nr"] = head_group
1555 parse_sense_node(node, common_data, pos) # type: ignore[arg-type]
1557 if len(ls) > 0:
1558 previous_head_had_list = True
1559 else:
1560 previous_head_had_list = False
1562 # If there are no senses extracted, add a dummy sense. We want to
1563 # keep tags extracted from the head for the dummy sense.
1564 push_sense() # Make sure unfinished data pushed, and start clean sense
1565 if len(sense_datas) == 0:
1566 data_extend(sense_data, "tags", header_tags)
1567 data_extend(sense_data, "topics", header_topics)
1568 data_append(sense_data, "tags", "no-gloss")
1569 push_sense()
1571 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) # type: ignore
1573 for sd in sense_datas:
1574 if "__temp_sense_sorting_ordinal" in sd:
1575 del sd["__temp_sense_sorting_ordinal"] # type: ignore
1577 def process_gloss_header(
1578 header_nodes: list[Union[WikiNode, str]],
1579 pos_type: str,
1580 header_group: Optional[int],
1581 pos_data: WordData,
1582 header_tags: list[str],
1583 header_topics: list[str],
1584 ) -> None:
1585 ruby = []
1586 links: list[str] = []
1588 # process template parse nodes here
1589 new_nodes = []
1590 info_template_data = []
1591 for node in header_nodes:
1592 # print(f"{node=}")
1593 info_data, info_out = parse_info_template_node(wxr, node, "head")
1594 if info_data or info_out:
1595 if info_data:
1596 info_template_data.append(info_data)
1597 if info_out: # including just the original node
1598 new_nodes.append(info_out)
1599 else:
1600 new_nodes.append(node)
1601 header_nodes = new_nodes
1603 if info_template_data:
1604 if "info_templates" not in pos_data:
1605 pos_data["info_templates"] = info_template_data
1606 else:
1607 pos_data["info_templates"].extend(info_template_data)
1609 if not word.isalnum():
1610 # `-` is kosher, add more of these if needed.
1611 if word.replace("-", "").isalnum():
1612 pass
1613 else:
1614 # if the word contains non-letter or -number characters, it
1615 # might have something that messes with split-at-semi-comma; we
1616 # collect links so that we can skip splitting them.
1617 exp = wxr.wtp.parse(
1618 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1619 )
1620 link_nodes, _ = recursively_extract(
1621 exp.children,
1622 lambda x: isinstance(x, WikiNode)
1623 and x.kind == NodeKind.LINK,
1624 )
1625 for ln in link_nodes:
1626 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr]
1627 if not ltext.isalnum():
1628 links.append(ltext)
1629 if word not in links:
1630 links.append(word)
1632 if lang_code == "ja":
1633 exp = wxr.wtp.parse(
1634 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
1635 )
1636 rub, _ = recursively_extract(
1637 exp.children,
1638 lambda x: isinstance(x, WikiNode)
1639 and x.kind == NodeKind.HTML
1640 and x.sarg == "ruby",
1641 )
1642 if rub is not None:
1643 for r in rub:
1644 if TYPE_CHECKING:
1645 # we know the lambda above in recursively_extract
1646 # returns only WikiNodes in rub
1647 assert isinstance(r, WikiNode)
1648 rt = parse_ruby(wxr, r)
1649 if rt is not None:
1650 ruby.append(rt)
1651 elif lang_code == "vi":
1652 # Handle vi-readings templates that have a weird structure for
1653 # Chu Nom Vietnamese character heads
1654 # https://en.wiktionary.org/wiki/Template:vi-readings
1655 new_header_nodes = []
1656 related_readings: list[LinkageData] = []
1657 for node in header_nodes:
1658 if (
1659 isinstance(node, TemplateNode)
1660 and node.template_name == "vi-readings"
1661 ):
1662 # print(node.template_parameters)
1663 for parameter, tag in (
1664 ("hanviet", "han-viet-reading"),
1665 ("nom", "nom-reading"),
1666 # we ignore the fanqie parameter "phienthiet"
1667 ):
1668 arg = node.template_parameters.get(parameter)
1669 if arg is not None:
1670 text = clean_node(wxr, None, arg)
1671 for w in text.split(","):
1672 # ignore - separated references
1673 if "-" in w:
1674 w = w[: w.index("-")]
1675 w = w.strip()
1676 related_readings.append(
1677 LinkageData(word=w, tags=[tag])
1678 )
1679 continue
1681 # Skip the vi-reading template for the rest of the head parsing
1682 new_header_nodes.append(node)
1683 if len(related_readings) > 0:
1684 data_extend(pos_data, "related", related_readings)
1685 header_nodes = new_header_nodes
1687 header_text = clean_node(
1688 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn
1689 )
1691 if not header_text.strip():
1692 return
1694 term_label_tags: list[str] = []
1695 term_label_topics: list[str] = []
1696 if len(term_label_templates) > 0:
1697 # parse term label templates; if there are other similar kinds
1698 # of templates in headers that you want to squash and apply as
1699 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
1700 for templ_data in term_label_templates:
1701 # print(templ_data)
1702 expan = templ_data.get("expansion", "").strip("().,; ")
1703 if not expan:
1704 continue
1705 tlb_tagsets, tlb_topics = decode_tags(expan)
1706 for tlb_tags in tlb_tagsets:
1707 if len(tlb_tags) > 0 and not any(
1708 t.startswith("error-") for t in tlb_tags
1709 ):
1710 term_label_tags.extend(tlb_tags)
1711 term_label_topics.extend(tlb_topics)
1712 # print(f"{tlb_tagsets=}, {tlb_topicsets=}")
1714 header_text = re.sub(r"\s+", " ", header_text)
1715 # print(f"{header_text=}")
1716 parse_word_head(
1717 wxr,
1718 pos_type,
1719 header_text,
1720 pos_data,
1721 is_reconstruction,
1722 header_group,
1723 ruby=ruby,
1724 links=links,
1725 )
1726 if "tags" in pos_data:
1727 # pos_data can get "tags" data from some source; type-checkers
1728 # doesn't like it, so let's ignore it.
1729 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item]
1730 del pos_data["tags"] # type: ignore[typeddict-item]
1731 if len(term_label_tags) > 0:
1732 header_tags.extend(term_label_tags)
1733 if len(term_label_topics) > 0:
1734 header_topics.extend(term_label_topics)
1736 def process_gloss_without_list(
1737 nodes: list[Union[WikiNode, str]],
1738 pos_type: str,
1739 pos_data: WordData,
1740 header_tags: list[str],
1741 header_topics: list[str],
1742 ) -> None:
1743 # gloss text might not be inside a list
1744 header_nodes: list[Union[str, WikiNode]] = []
1745 gloss_nodes: list[Union[str, WikiNode]] = []
1746 for node in strip_nodes(nodes):
1747 if isinstance(node, WikiNode):
1748 if isinstance(node, TemplateNode):
1749 if node.template_name in (
1750 "zh-see",
1751 "ja-see",
1752 "ja-see-kango",
1753 ):
1754 continue # soft redirect
1755 elif (
1756 node.template_name == "head"
1757 or node.template_name.startswith(f"{lang_code}-")
1758 ):
1759 header_nodes.append(node)
1760 continue
1761 elif node.kind in LEVEL_KINDS: # following nodes are not gloss
1762 break
1763 gloss_nodes.append(node)
1765 if len(header_nodes) > 0:
1766 process_gloss_header(
1767 header_nodes,
1768 pos_type,
1769 None,
1770 pos_data,
1771 header_tags,
1772 header_topics,
1773 )
1774 if len(gloss_nodes) > 0:
1775 process_gloss_contents(
1776 gloss_nodes,
1777 pos_type,
1778 {"tags": list(header_tags), "topics": list(header_topics)},
1779 )
1781 def parse_sense_node(
1782 node: Union[str, WikiNode], # never receives str
1783 sense_base: SenseData,
1784 pos: str,
1785 ) -> bool:
1786 """Recursively (depth first) parse LIST_ITEM nodes for sense data.
1787 Uses push_sense() to attempt adding data to pos_data in the scope
1788 of parse_language() when it reaches deep in the recursion. push_sense()
1789 returns True if it succeeds, and that is bubbled up the stack; if
1790 a sense was added downstream, the higher levels (whose shared data
1791 was already added by a subsense) do not push_sense(), unless it
1792 has examples that need to be put somewhere.
1793 """
1794 assert isinstance(sense_base, dict) # Added to every sense deeper in
1796 nonlocal sense_ordinal
1797 my_ordinal = sense_ordinal # copies, not a reference
1798 sense_ordinal += 1 # only use for sorting
1800 if not isinstance(node, WikiNode):
1801 # This doesn't seem to ever happen in practice.
1802 wxr.wtp.debug(
1803 "{}: parse_sense_node called with"
1804 "something that isn't a WikiNode".format(pos),
1805 sortid="page/1287/20230119",
1806 )
1807 return False
1809 if node.kind != NodeKind.LIST_ITEM:
1810 wxr.wtp.debug(
1811 "{}: non-list-item inside list".format(pos), sortid="page/1678"
1812 )
1813 return False
1815 if node.sarg == ":":
1816 # Skip example entries at the highest level, ones without
1817 # a sense ("...#") above them.
1818 # If node.sarg is exactly and only ":", then it's at
1819 # the highest level; lower levels would have more
1820 # "indentation", like "#:" or "##:"
1821 return False
1823 # If a recursion call succeeds in push_sense(), bubble it up with
1824 # `added`.
1825 # added |= push_sense() or added |= parse_sense_node(...) to OR.
1826 added = False
1828 gloss_template_args: set[str] = set()
1830 # For LISTs and LIST_ITEMS, their argument is something like
1831 # "##" or "##:", and using that we can rudimentarily determine
1832 # list 'depth' if need be, and also what kind of list or
1833 # entry it is; # is for normal glosses, : for examples (indent)
1834 # and * is used for quotations on wiktionary.
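# Illustrative sketch (not in the original source): for wikitext like
#   # first gloss
#   ## nested subgloss
#   #: usage example
#   #* quotation
# the LIST/LIST_ITEM .sarg values are "#", "##", "#:" and "#*"
# respectively, so both the depth and the kind of entry can be read
# straight off the string.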
1835 current_depth = node.sarg
1837 children = node.children
1839 # subentries, (presumably) a list
1840 # of subglosses below this. The list's
1841 # argument ends with #, and its depth should
1842 # be bigger than the parent node's.
1843 subentries = [
1844 x
1845 for x in children
1846 if isinstance(x, WikiNode)
1847 and x.kind == NodeKind.LIST
1848 and x.sarg == current_depth + "#"
1849 ]
1851 # sublists of examples and quotations. .sarg
1852 # does not end with "#".
1853 others = [
1854 x
1855 for x in children
1856 if isinstance(x, WikiNode)
1857 and x.kind == NodeKind.LIST
1858 and x.sarg != current_depth + "#"
1859 ]
1861 # the actual contents of this particular node.
1862 # can be a gloss (or a template that expands into
1863 # many glosses which we can't easily pre-expand)
1864 # or could be an "outer gloss" with more specific
1865 # subglosses, or could be a qualifier for the subglosses.
1866 contents = [
1867 x
1868 for x in children
1869 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
1870 ]
1871 # If this entry has sublists of entries, we should combine
1872 # gloss information from both the "outer" and sublist content.
1873 # Sometimes the outer gloss
1874 # is more of a non-gloss or a set of tags, sometimes it is a coarse sense
1875 # and the inner glosses are more specific. The outer one
1876 # does not seem to have qualifiers.
1878 # If we have one sublist with one element, treat it
1879 # specially as it may be a Wiktionary error; raise
1880 # that nested element to the same level.
1881 # XXX If need be, this block can be easily removed in
1882 # the current recursive logic, and the result is one sense entry
1883 # with both glosses in the glosses list, as you would
1884 # expect. If the higher entry has examples, there will
1885 # be a higher entry with some duplicated data.
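# Hypothetical illustration of the case handled below (not from the
# original source): wikitext such as
#   # (obsolete)
#   ## The single real definition.
# yields one subentry with a single child; that child is raised to the
# same level as the outer item and parsed there with the shared
# sense_base.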
1886 if len(subentries) == 1:
1887 slc = subentries[0].children
1888 if len(slc) == 1:
1889 # copy current node and modify it so it doesn't
1890 # loop infinitely.
1891 cropped_node = copy.copy(node)
1892 cropped_node.children = [
1893 x
1894 for x in children
1895 if not (
1896 isinstance(x, WikiNode)
1897 and x.kind == NodeKind.LIST
1898 and x.sarg == current_depth + "#"
1899 )
1900 ]
1901 added |= parse_sense_node(cropped_node, sense_base, pos)
1902 nonlocal sense_data # kludge: without resetting sense_data
1903 # here, raw_glosses data gets duplicated;
1904 # if the top-level (cropped_node)
1905 # does not push_sense() properly or
1906 # parse_sense_node() returns early,
1907 # sense_data is not reset. This happens
1908 # for example when you have a no-gloss
1909 # string like "(intransitive)":
1910 # no gloss, push_sense() returns early
1911 # and sense_data has duplicate data with
1912 # sense_base
1913 sense_data = {}
1914 added |= parse_sense_node(slc[0], sense_base, pos)
1915 return added
1917 return process_gloss_contents(
1918 contents,
1919 pos,
1920 sense_base,
1921 subentries,
1922 others,
1923 gloss_template_args,
1924 added,
1925 my_ordinal,
1926 )
1928 def process_gloss_contents(
1929 contents: list[Union[str, WikiNode]],
1930 pos: str,
1931 sense_base: SenseData,
1932 subentries: list[WikiNode] = [],
1933 others: list[WikiNode] = [],
1934 gloss_template_args: Set[str] = set(),
1935 added: bool = False,
1936 sorting_ordinal: int | None = None,
1937 ) -> bool:
1938 def sense_template_fn(
1939 name: str, ht: TemplateArgs, is_gloss: bool = False
1940 ) -> Optional[str]:
1941 # print(f"sense_template_fn: {name}, {ht}")
1942 if name in wikipedia_templates:
1943 # parse_wikipedia_template(wxr, pos_data, ht)
1944 return None
1945 if is_panel_template(wxr, name):
1946 return ""
1947 if name in INFO_TEMPLATE_FUNCS:
1948 info_data, info_exp = parse_info_template_arguments(
1949 wxr, name, ht, "sense"
1950 )
1951 if info_data or info_exp: 1951 ↛ 1957line 1951 didn't jump to line 1957 because the condition on line 1951 was always true
1952 if info_data: 1952 ↛ 1954line 1952 didn't jump to line 1954 because the condition on line 1952 was always true
1953 data_append(sense_base, "info_templates", info_data)
1954 if info_exp and isinstance(info_exp, str): 1954 ↛ 1956line 1954 didn't jump to line 1956 because the condition on line 1954 was always true
1955 return info_exp
1956 return ""
1957 if name in ("defdate",):
1958 date = clean_node(wxr, None, ht.get(1, ()))
1959 if part_two := ht.get(2): 1959 ↛ 1961line 1959 didn't jump to line 1961 because the condition on line 1959 was never true
1960 # Unicode en dash (U+2013), not '-'
1961 date += "–" + clean_node(wxr, None, part_two)
1962 refs: dict[str, ReferenceData] = {}
1963 # ref, refn, ref2, ref2n, ref3, ref3n
1964 # ref1 not valid
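# Assumed argument shape (illustration only, not from the original
# source): a call like {{defdate|from 15th c.|ref=Smith 2010|refn=fn1}}
# would give date == "from 15th c." and
# refs == {"": {"text": "Smith 2010", "refn": "fn1"}}, i.e. one
# AttestationData entry carrying a single reference.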
1965 for k, v in sorted(
1966 (k, v) for k, v in ht.items() if isinstance(k, str)
1967 ):
1968 if m := re.match(r"ref(\d?)(n?)", k): 1968 ↛ 1965line 1968 didn't jump to line 1965 because the condition on line 1968 was always true
1969 ref_v = clean_node(wxr, None, v)
1970 if m.group(1) not in refs: # empty string or digit
1971 refs[m.group(1)] = ReferenceData()
1972 if m.group(2):
1973 refs[m.group(1)]["refn"] = ref_v
1974 else:
1975 refs[m.group(1)]["text"] = ref_v
1976 data_append(
1977 sense_base,
1978 "attestations",
1979 AttestationData(date=date, references=list(refs.values())),
1980 )
1981 return ""
1982 if name == "senseid":
1983 langid = clean_node(wxr, None, ht.get(1, ()))
1984 arg = clean_node(wxr, sense_base, ht.get(2, ()))
1985 if re.match(r"Q\d+$", arg):
1986 data_append(sense_base, "wikidata", arg)
1987 data_append(sense_base, "senseid", langid + ":" + arg)
1988 if name in sense_linkage_templates:
1989 # print(f"SENSE_TEMPLATE_FN: {name}")
1990 parse_sense_linkage(wxr, sense_base, name, ht, pos)
1991 return ""
1992 if name == "†" or name == "zh-obsolete":
1993 data_append(sense_base, "tags", "obsolete")
1994 return ""
1995 if name in {
1996 "ux",
1997 "uxi",
1998 "usex",
1999 "afex",
2000 "prefixusex",
2001 "ko-usex",
2002 "ko-x",
2003 "hi-x",
2004 "ja-usex-inline",
2005 "ja-x",
2006 "quotei",
2007 "he-x",
2008 "hi-x",
2009 "km-x",
2010 "ne-x",
2011 "shn-x",
2012 "th-x",
2013 "ur-x",
2014 }:
2015 # Usage examples are captured separately below. We don't
2016 # want to expand them into glosses even when unusual coding
2017 # is used in the entry.
2018 # These templates may slip through inside another item, but
2019 # currently we're separating out example entries (..#:)
2020 # well enough that there seems to be very little contamination.
2021 if is_gloss:
2022 wxr.wtp.wiki_notice(
2023 "Example template is used for gloss text",
2024 sortid="extractor.en.page.sense_template_fn/1415",
2025 )
2026 else:
2027 return ""
2028 if name == "w": 2028 ↛ 2029line 2028 didn't jump to line 2029 because the condition on line 2028 was never true
2029 if ht.get(2) == "Wp":
2030 return ""
2031 for v in ht.values():
2032 v = v.strip()
2033 if v and "<" not in v:
2034 gloss_template_args.add(v)
2035 return None
2037 def extract_link_texts(item: GeneralNode) -> None:
2038 """Recursively extracts link texts from the gloss source. This
2039 information is used to select whether to remove final "." from
2040 form_of/alt_of (e.g., ihm/Hunsrik)."""
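# Rough behaviour sketch (assumed, not from the original source):
#   string "see [[man/translations|man]]"  -> adds "man"
#   a LINK node for [[ihm]]                -> adds "ihm"
# These collected strings later let parse_alt_or_inflection_of decide
# whether a trailing "." belongs to the link text.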
2041 if isinstance(item, (list, tuple)):
2042 for x in item:
2043 extract_link_texts(x)
2044 return
2045 if isinstance(item, str):
2046 # There seem to be HTML sections that may further contain
2047 # unparsed links.
2048 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the loop on line 2048 never started
2049 print("ITER:", m.group(0))
2050 v = m.group(1).split("|")[-1].strip()
2051 if v:
2052 gloss_template_args.add(v)
2053 return
2054 if not isinstance(item, WikiNode): 2054 ↛ 2055line 2054 didn't jump to line 2055 because the condition on line 2054 was never true
2055 return
2056 if item.kind == NodeKind.LINK:
2057 v = item.largs[-1]
2058 if ( 2058 ↛ 2064line 2058 didn't jump to line 2064 because the condition on line 2058 was always true
2059 isinstance(v, list)
2060 and len(v) == 1
2061 and isinstance(v[0], str)
2062 ):
2063 gloss_template_args.add(v[0].strip())
2064 for x in item.children:
2065 extract_link_texts(x)
2067 extract_link_texts(contents)
2069 # get the raw text of non-list contents of this node, and other stuff
2070 # like tag and category data added to sense_base
2071 # cast = no-op type-setter for the type-checker
2072 partial_template_fn = cast(
2073 TemplateFnCallable,
2074 partial(sense_template_fn, is_gloss=True),
2075 )
2076 rawgloss = clean_node(
2077 wxr,
2078 sense_base,
2079 contents,
2080 template_fn=partial_template_fn,
2081 collect_links=True,
2082 )
2084 if not rawgloss: 2084 ↛ 2085line 2084 didn't jump to line 2085 because the condition on line 2084 was never true
2085 return False
2087 # remove manually typed ordered list text at the start ("1. ")
2088 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()
2090 # get stuff like synonyms and categories from "others",
2091 # maybe examples and quotations
2092 clean_node(wxr, sense_base, others, template_fn=sense_template_fn)
2094 # The gloss could contain templates that produce more list items.
2095 # This happens commonly with, e.g., {{inflection of|...}}. Split
2096 # to parts. However, e.g. Interlingua generates multiple glosses
2097 # in HTML directly without Wikitext markup, so we must also split
2098 # by just newlines.
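# Illustrative case (hypothetical arguments, not from the original
# source): a gloss consisting only of
# {{inflection of|la|amare||2|s|pres|actv|indc}} may expand to several
# lines, some starting with "#"; splitting on newlines (and re-parsing
# below when "#" lines are present) turns the expansion into separate
# subglosses/subentries.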
2099 subglosses = rawgloss.splitlines()
2101 if len(subglosses) == 0: 2101 ↛ 2102line 2101 didn't jump to line 2102 because the condition on line 2101 was never true
2102 return False
2104 if any(s.startswith("#") for s in subglosses):
2105 subtree = wxr.wtp.parse(rawgloss)
2106 # from wikitextprocessor.parser import print_tree
2107 # print("SUBTREE GENERATED BY TEMPLATE:")
2108 # print_tree(subtree)
2109 new_subentries = [
2110 x
2111 for x in subtree.children
2112 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2113 ]
2115 new_others = [
2116 x
2117 for x in subtree.children
2118 if isinstance(x, WikiNode)
2119 and x.kind == NodeKind.LIST
2120 and not x.sarg.endswith("#")
2121 ]
2123 new_contents = [
2124 clean_node(wxr, [], x)
2125 for x in subtree.children
2126 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2127 ]
2129 subentries = subentries or new_subentries
2130 others = others or new_others
2131 subglosses = new_contents
2132 rawgloss = "".join(subglosses)
2133 # Generate no gloss for translation hub pages, but add the
2134 # "translation-hub" tag for them
2135 if rawgloss == "(This entry is a translation hub.)": 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true
2136 data_append(sense_data, "tags", "translation-hub")
2137 return push_sense(sorting_ordinal)
2139 # Remove certain substrings specific to outer glosses
2140 strip_ends = [", particularly:"]
2141 for x in strip_ends:
2142 if rawgloss.endswith(x):
2143 rawgloss = rawgloss[: -len(x)].strip()
2144 break
2146 # A single gloss, or possibly an outer gloss.
2147 # Check if the possible outer gloss starts with
2148 # parenthesized tags/topics
2150 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
2151 data_append(sense_base, "raw_glosses", subglosses[0].strip())
2152 m = QUALIFIERS_RE.match(rawgloss)
2153 # (...): ... or (...(...)...): ...
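# Hypothetical example (QUALIFIERS_RE itself is defined elsewhere in
# this module): for a gloss like "(transitive, informal): to tinker",
# the parenthesized qualifier part is handed to parse_sense_qualifier()
# and stripped from the remaining gloss text.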
2154 if m:
2155 q = m.group(1)
2156 rawgloss = rawgloss[m.end() :].strip()
2157 parse_sense_qualifier(wxr, q, sense_base)
2158 if rawgloss == "A pejorative:": 2158 ↛ 2159line 2158 didn't jump to line 2159 because the condition on line 2158 was never true
2159 data_append(sense_base, "tags", "pejorative")
2160 rawgloss = ""
2161 elif rawgloss == "Short forms.": 2161 ↛ 2162line 2161 didn't jump to line 2162 because the condition on line 2161 was never true
2162 data_append(sense_base, "tags", "abbreviation")
2163 rawgloss = ""
2164 elif rawgloss == "Technical or specialized senses.": 2164 ↛ 2165line 2164 didn't jump to line 2165 because the condition on line 2164 was never true
2165 rawgloss = ""
2166 elif rawgloss.startswith("inflection of "):
2167 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
2168 if parsed is not None: 2168 ↛ 2177line 2168 didn't jump to line 2177 because the condition on line 2168 was always true
2169 tags, origins = parsed
2170 if origins is not None: 2170 ↛ 2172line 2170 didn't jump to line 2172 because the condition on line 2170 was always true
2171 data_extend(sense_base, "form_of", origins)
2172 if tags is not None: 2172 ↛ 2175line 2172 didn't jump to line 2175 because the condition on line 2172 was always true
2173 data_extend(sense_base, "tags", tags)
2174 else:
2175 data_append(sense_base, "tags", "form-of")
2176 else:
2177 data_append(sense_base, "tags", "form-of")
2178 if rawgloss: 2178 ↛ 2209line 2178 didn't jump to line 2209 because the condition on line 2178 was always true
2179 # Code duplicating a lot of clean-up operations from later in
2180 # this block. We want to clean up the "supergloss" as much as
2181 # possible, in almost the same way as a normal gloss.
2182 supergloss = rawgloss
2184 if supergloss.startswith("; "): 2184 ↛ 2185line 2184 didn't jump to line 2185 because the condition on line 2184 was never true
2185 supergloss = supergloss[1:].strip()
2187 if supergloss.startswith(("^†", "†")):
2188 data_append(sense_base, "tags", "obsolete")
2189 supergloss = supergloss[2:].strip()
2190 elif supergloss.startswith("^‡"): 2190 ↛ 2191line 2190 didn't jump to line 2191 because the condition on line 2190 was never true
2191 data_extend(sense_base, "tags", ["obsolete", "historical"])
2192 supergloss = supergloss[2:].strip()
2194 # remove [14th century...] style brackets at the end
2195 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)
2197 if supergloss.startswith((",", ":")):
2198 supergloss = supergloss[1:]
2199 supergloss = supergloss.strip()
2200 if supergloss.startswith("N. of "): 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true
2201 supergloss = "Name of " + supergloss[6:]
2202 supergloss = supergloss[2:]
2203 data_append(sense_base, "glosses", supergloss)
2204 if supergloss in ("A person:",):
2205 data_append(sense_base, "tags", "g-person")
2207 # The main recursive call (except for the exceptions at the
2208 # start of this function).
2209 for sublist in subentries:
2210 if not ( 2210 ↛ 2213line 2210 didn't jump to line 2213 because the condition on line 2210 was never true
2211 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
2212 ):
2213 wxr.wtp.debug(
2214 f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
2215 f"with items that are not LISTs",
2216 sortid="page/1511/20230119",
2217 )
2218 continue
2219 for item in sublist.children:
2220 if not ( 2220 ↛ 2224line 2220 didn't jump to line 2224 because the condition on line 2220 was never true
2221 isinstance(item, WikiNode)
2222 and item.kind == NodeKind.LIST_ITEM
2223 ):
2224 continue
2225 # copy sense_base to prevent cross-contamination between
2226 # subglosses, and between subglosses and superglosses
2227 sense_base2 = copy.deepcopy(sense_base)
2228 if parse_sense_node(item, sense_base2, pos): 2228 ↛ 2219line 2228 didn't jump to line 2219 because the condition on line 2228 was always true
2229 added = True
2231 # Capture examples.
2232 # This is called after the recursive calls above so that
2233 # sense_base is not contaminated with meta-data from
2234 # example entries for *this* gloss.
2235 examples = []
2236 if wxr.config.capture_examples: 2236 ↛ 2240line 2236 didn't jump to line 2240 because the condition on line 2236 was always true
2237 examples = extract_examples(others, sense_base)
2239 # push_sense() succeeded somewhere down-river, so skip this level
2240 if added:
2241 if examples:
2242 # this higher-up gloss has examples that we do not want to skip
2243 wxr.wtp.debug(
2244 "'{}[...]' gloss has examples we want to keep, "
2245 "but there are subglosses.".format(repr(rawgloss[:30])),
2246 sortid="page/1498/20230118",
2247 )
2248 else:
2249 return True
2251 # Some entries, e.g., "iacebam", have weird sentences in quotes
2252 # after the gloss, but these sentences don't seem to be intended
2253 # as glosses. Skip them.
2254 indexed_subglosses = list(
2255 (i, gl)
2256 for i, gl in enumerate(subglosses)
2257 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
2258 )
2260 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2260 ↛ 2261line 2260 didn't jump to line 2261 because the condition on line 2260 was never true
2261 gl = indexed_subglosses[0][1].strip()
2262 if gl.endswith(":"):
2263 gl = gl[:-1].strip()
2264 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
2265 if parsed is not None:
2266 infl_tags, infl_dts = parsed
2267 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
2268 # Interpret others as a particular form under
2269 # "inflection of"
2270 data_extend(sense_base, "tags", infl_tags)
2271 data_extend(sense_base, "form_of", infl_dts)
2272 indexed_subglosses = indexed_subglosses[1:]
2273 elif not infl_dts:
2274 data_extend(sense_base, "tags", infl_tags)
2275 indexed_subglosses = indexed_subglosses[1:]
2277 # Create senses for remaining subglosses
2278 for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
2279 gloss = gloss.strip()
2280 if not gloss and len(indexed_subglosses) > 1: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true
2281 continue
2282 # Push a new sense (if the last one is not empty)
2283 if push_sense(sorting_ordinal): 2283 ↛ 2284line 2283 didn't jump to line 2284 because the condition on line 2283 was never true
2284 added = True
2285 # if gloss not in sense_data.get("raw_glosses", ()):
2286 # data_append(sense_data, "raw_glosses", gloss)
2287 if i == 0 and examples:
2288 # In a multi-line gloss, associate examples
2289 # with only one of them.
2290 # XXX or you could use gloss_i == len(indexed_subglosses)
2291 # to associate examples with the *last* one.
2292 data_extend(sense_data, "examples", examples)
2293 if gloss.startswith("; ") and gloss_i > 0: 2293 ↛ 2294line 2293 didn't jump to line 2294 because the condition on line 2293 was never true
2294 gloss = gloss[1:].strip()
2295 # If the gloss starts with †, mark as obsolete
2296 if gloss.startswith("^†"): 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true
2297 data_append(sense_data, "tags", "obsolete")
2298 gloss = gloss[2:].strip()
2299 elif gloss.startswith("^‡"): 2299 ↛ 2300line 2299 didn't jump to line 2300 because the condition on line 2299 was never true
2300 data_extend(sense_data, "tags", ["obsolete", "historical"])
2301 gloss = gloss[2:].strip()
2302 # Copy data for all senses to this sense
2303 for k, v in sense_base.items():
2304 if isinstance(v, (list, tuple)):
2305 if k != "tags":
2306 # Tags handled below (countable/uncountable special)
2307 data_extend(sense_data, k, v)
2308 else:
2309 assert k not in ("tags", "categories", "topics")
2310 sense_data[k] = v # type:ignore[literal-required]
2311 # Parse the gloss for this particular sense
2312 m = QUALIFIERS_RE.match(gloss)
2313 # (...): ... or (...(...)...): ...
2314 if m:
2315 parse_sense_qualifier(wxr, m.group(1), sense_data)
2316 gloss = gloss[m.end() :].strip()
2318 # Remove common suffix "[from 14th c.]" and similar
2319 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)
2321 # Check to make sure we don't have unhandled list items in gloss
2322 ofs = max(gloss.find("#"), gloss.find("* "))
2323 if ofs > 10 and "(#)" not in gloss:
2324 wxr.wtp.debug(
2325 "gloss may contain unhandled list items: {}".format(gloss),
2326 sortid="page/1412",
2327 )
2328 elif "\n" in gloss: 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true
2329 wxr.wtp.debug(
2330 "gloss contains newline: {}".format(gloss),
2331 sortid="page/1416",
2332 )
2334 # Kludge, some glosses have a comma after initial qualifiers in
2335 # parentheses
2336 if gloss.startswith((",", ":")):
2337 gloss = gloss[1:]
2338 gloss = gloss.strip()
2339 if gloss.endswith(":"):
2340 gloss = gloss[:-1].strip()
2341 if gloss.startswith("N. of "): 2341 ↛ 2342line 2341 didn't jump to line 2342 because the condition on line 2341 was never true
2342 gloss = "Name of " + gloss[6:]
2343 if gloss.startswith("†"): 2343 ↛ 2344line 2343 didn't jump to line 2344 because the condition on line 2343 was never true
2344 data_append(sense_data, "tags", "obsolete")
2345 gloss = gloss[1:]
2346 elif gloss.startswith("^†"): 2346 ↛ 2347line 2346 didn't jump to line 2347 because the condition on line 2346 was never true
2347 data_append(sense_data, "tags", "obsolete")
2348 gloss = gloss[2:]
2350 # Copy tags from sense_base if any. This will not copy
2351 # countable/uncountable if either was specified in the sense,
2352 # as sometimes both are specified in word head but only one
2353 # in individual senses.
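# Example of the intended outcome (not from the original source): if
# the word head contributed tags ["countable", "uncountable", "slang"]
# and this sense already carries "countable", then only "slang" is
# copied here and the head's countable/uncountable pair is left off
# this sense.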
2354 countability_tags = []
2355 base_tags = sense_base.get("tags", ())
2356 sense_tags = sense_data.get("tags", ())
2357 for tag in base_tags:
2358 if tag in ("countable", "uncountable"):
2359 if tag not in countability_tags: 2359 ↛ 2361line 2359 didn't jump to line 2361 because the condition on line 2359 was always true
2360 countability_tags.append(tag)
2361 continue
2362 if tag not in sense_tags:
2363 data_append(sense_data, "tags", tag)
2364 if countability_tags:
2365 if ( 2365 ↛ 2374line 2365 didn't jump to line 2374 because the condition on line 2365 was always true
2366 "countable" not in sense_tags
2367 and "uncountable" not in sense_tags
2368 ):
2369 data_extend(sense_data, "tags", countability_tags)
2371 # If outer gloss specifies a form-of ("inflection of", see
2372 # aquamarine/German), try to parse the inner glosses as
2373 # tags for an inflected form.
2374 if "form-of" in sense_base.get("tags", ()):
2375 parsed = parse_alt_or_inflection_of(
2376 wxr, gloss, gloss_template_args
2377 )
2378 if parsed is not None: 2378 ↛ 2384line 2378 didn't jump to line 2384 because the condition on line 2378 was always true
2379 infl_tags, infl_dts = parsed
2380 if not infl_dts and infl_tags: 2380 ↛ 2384line 2380 didn't jump to line 2384 because the condition on line 2380 was always true
2381 # Interpret as a particular form under "inflection of"
2382 data_extend(sense_data, "tags", infl_tags)
2384 if not gloss: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true
2385 data_append(sense_data, "tags", "empty-gloss")
2386 elif gloss != "-" and gloss not in sense_data.get("glosses", []):
2387 if ( 2387 ↛ 2398line 2387 didn't jump to line 2398 because the condition on line 2387 was always true
2388 gloss_i == 0
2389 and len(sense_data.get("glosses", tuple())) >= 1
2390 ):
2391 # If we added a "high-level gloss" from rawgloss, but this
2392 # is that same gloss_i, add this instead of the raw_gloss
2393 # from before if they're different: the rawgloss was not
2394 # cleaned exactly the same as this later gloss
2395 sense_data["glosses"][-1] = gloss
2396 else:
2397 # Add the gloss for the sense.
2398 data_append(sense_data, "glosses", gloss)
2400 # Kludge: there are cases (e.g., etc./Swedish) where there are
2401 # two abbreviations in the same sense, both generated by the
2402 # {{abbreviation of|...}} template. Handle these with some magic.
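# Illustrative (not from the original source): a combined gloss such as
# "Abbreviation of et cetera. Abbreviation of etcetera." is split at
# each "Abbreviation of " so that each piece goes separately through
# parse_alt_or_inflection_of() below.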
2403 position = 0
2404 split_glosses = []
2405 for m in re.finditer(r"Abbreviation of ", gloss):
2406 if m.start() != position: 2406 ↛ 2405line 2406 didn't jump to line 2405 because the condition on line 2406 was always true
2407 split_glosses.append(gloss[position : m.start()])
2408 position = m.start()
2409 split_glosses.append(gloss[position:])
2410 for gloss in split_glosses:
2411 # Check if this gloss describes an alt-of or inflection-of
2412 if (
2413 lang_code != "en"
2414 and " " not in gloss
2415 and distw([word], gloss) < 0.3
2416 ):
2417 # Don't try to parse gloss if it is one word
2418 # that is close to the word itself for non-English words
2419 # (probable translations of a tag/form name)
2420 continue
2421 parsed = parse_alt_or_inflection_of(
2422 wxr, gloss, gloss_template_args
2423 )
2424 if parsed is None:
2425 continue
2426 tags, dts = parsed
2427 if not dts and tags:
2428 data_extend(sense_data, "tags", tags)
2429 continue
2430 for dt in dts: # type:ignore[union-attr]
2431 ftags = list(tag for tag in tags if tag != "form-of")
2432 if "alt-of" in tags:
2433 data_extend(sense_data, "tags", ftags)
2434 data_append(sense_data, "alt_of", dt)
2435 elif "compound-of" in tags: 2435 ↛ 2436line 2435 didn't jump to line 2436 because the condition on line 2435 was never true
2436 data_extend(sense_data, "tags", ftags)
2437 data_append(sense_data, "compound_of", dt)
2438 elif "synonym-of" in tags: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true
2439 data_extend(dt, "tags", ftags)
2440 data_append(sense_data, "synonyms", dt)
2441 elif tags and dt.get("word", "").startswith("of "): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true
2442 dt["word"] = dt["word"][3:]
2443 data_append(sense_data, "tags", "form-of")
2444 data_extend(sense_data, "tags", ftags)
2445 data_append(sense_data, "form_of", dt)
2446 elif "form-of" in tags: 2446 ↛ 2430line 2446 didn't jump to line 2430 because the condition on line 2446 was always true
2447 data_extend(sense_data, "tags", tags)
2448 data_append(sense_data, "form_of", dt)
2450 if len(sense_data) == 0:
2451 if len(sense_base.get("tags", [])) == 0: 2451 ↛ 2453line 2451 didn't jump to line 2453 because the condition on line 2451 was always true
2452 del sense_base["tags"]
2453 sense_data.update(sense_base)
2454 if push_sense(sorting_ordinal): 2454 ↛ 2458line 2454 didn't jump to line 2458 because the condition on line 2454 was always true
2455 # push_sense succeeded in adding a sense to pos_data
2456 added = True
2457 # print("PARSE_SENSE DONE:", pos_datas[-1])
2458 return added
2460 def parse_inflection(
2461 node: WikiNode, section: str, pos: Optional[str]
2462 ) -> None:
2463 """Parses inflection data (declension, conjugation) from the given
2464 page. This retrieves the actual inflection template
2465 parameters, which are very useful for applications that need
2466 to learn the inflection classes and generate inflected
2467 forms."""
2468 assert isinstance(node, WikiNode)
2469 assert isinstance(section, str)
2470 assert pos is None or isinstance(pos, str)
2471 # print("parse_inflection:", node)
2473 if pos is None: 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true
2474 wxr.wtp.debug(
2475 "inflection table outside part-of-speech", sortid="page/1812"
2476 )
2477 return
2479 def inflection_template_fn(
2480 name: str, ht: TemplateArgs
2481 ) -> Optional[str]:
2482 # print("decl_conj_template_fn", name, ht)
2483 if is_panel_template(wxr, name): 2483 ↛ 2484line 2483 didn't jump to line 2484 because the condition on line 2483 was never true
2484 return ""
2485 if name in ("is-u-mutation",): 2485 ↛ 2488line 2485 didn't jump to line 2488 because the condition on line 2485 was never true
2486 # These are not to be captured as an exception to the
2487 # generic code below
2488 return None
2489 m = re.search(
2490 r"-(conj|decl|ndecl|adecl|infl|conjugation|"
2491 r"declension|inflection|mut|mutation)($|-)",
2492 name,
2493 )
2494 if m:
2495 args_ht = clean_template_args(wxr, ht)
2496 dt = {"name": name, "args": args_ht}
2497 data_append(pos_data, "inflection_templates", dt)
2499 return None
2501 # Convert the subtree back to Wikitext, then expand all and parse,
2502 # capturing templates in the process
2503 text = wxr.wtp.node_to_wikitext(node.children)
2505 # Split text into separate sections for each top-level template
2506 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
2507 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
2508 # The (?:...) creates a non-capturing regex group; if it was capturing,
2509 # like the group around it, it would create elements in brace_matches,
2510 # including None if it doesn't match.
2511 # 20250114: Added {| and |} into the regex because tables were being
2512 # cut into pieces by this code. Issue #973, introduction of two-part
2513 # book-end templates similar to trans-top and trans-bottom.
2514 template_sections = []
2515 template_nesting = 0 # depth of SINGLE BRACES { { nesting } }
2516 # Because there is the possibility of triple curly braces
2517 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
2518 # count nesting depth using pairs of two brackets, but
2519 # instead use singular braces ("{ }").
2520 # Because template delimiters should be balanced, regardless
2521 # of whether {{ or {{{ is used, and because we only care
2522 # about the outer-most delimiters (the highest level template)
2523 # we can just count the single braces when those single
2524 # braces are part of a group.
2525 table_nesting = 0
2526 # However, a stray table ({| ... |}) should always
2527 # be its own section, and should prevent templates from cutting it
2528 # into sections.
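# Illustrative split (hypothetical template names, not from the
# original source): for text like
#   "{{fi-decl-valo|kivi}}\n{{fi-conj-sanoa|sano}}"
# brace_matches would be roughly
#   ["", "{{", "fi-decl-valo|kivi", "}}", "\n", "{{", ...]
# and a new template_sections entry starts at each "{{" seen while
# template_nesting and table_nesting are both zero.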
2530 # print(f"Parse inflection: {text=}")
2531 # print(f"Brace matches: {repr('///'.join(brace_matches))}")
2532 if len(brace_matches) > 1:
2533 tsection: list[str] = []
2534 after_templates = False # kludge to keep any text
2535 # before first template
2536 # with the first template;
2537 # otherwise, text
2538 # goes with preceding template
2539 for m in brace_matches:
2540 if m.startswith("\n; ") and after_templates: 2540 ↛ 2541line 2540 didn't jump to line 2541 because the condition on line 2540 was never true
2541 after_templates = False
2542 template_sections.append(tsection)
2543 tsection = []
2544 tsection.append(m)
2545 elif m.startswith("{{") or m.endswith("{|"):
2546 if (
2547 template_nesting == 0
2548 and after_templates
2549 and table_nesting == 0
2550 ):
2551 template_sections.append(tsection)
2552 tsection = []
2553 # start new section
2554 after_templates = True
2555 if m.startswith("{{"):
2556 template_nesting += 1
2557 else:
2558 # m.endswith("{|")
2559 table_nesting += 1
2560 tsection.append(m)
2561 elif m.startswith("}}") or m.endswith("|}"):
2562 if m.startswith("}}"):
2563 template_nesting -= 1
2564 if template_nesting < 0: 2564 ↛ 2565line 2564 didn't jump to line 2565 because the condition on line 2564 was never true
2565 wxr.wtp.error(
2566 "Negatively nested braces, "
2567 "couldn't split inflection templates, "
2568 "{}/{} section {}".format(
2569 word, language, section
2570 ),
2571 sortid="page/1871",
2572 )
2573 template_sections = [] # use whole text
2574 break
2575 else:
2576 table_nesting -= 1
2577 if table_nesting < 0: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true
2578 wxr.wtp.error(
2579 "Negatively nested table braces, "
2580 "couldn't split inflection section, "
2581 "{}/{} section {}".format(
2582 word, language, section
2583 ),
2584 sortid="page/20250114",
2585 )
2586 template_sections = [] # use whole text
2587 break
2588 tsection.append(m)
2589 else:
2590 tsection.append(m)
2591 if tsection: # dangling tsection 2591 ↛ 2599line 2591 didn't jump to line 2599 because the condition on line 2591 was always true
2592 template_sections.append(tsection)
2593 # Why do it this way around? The parser has a preference
2594 # to associate bits outside of tables with the preceding
2595 # table (`after`-variable), so a new tsection begins
2596 # at {{ and everything before it belongs to the previous
2597 # template.
2599 texts = []
2600 if not template_sections:
2601 texts = [text]
2602 else:
2603 for tsection in template_sections:
2604 texts.append("".join(tsection))
2605 if template_nesting != 0: 2605 ↛ 2606line 2605 didn't jump to line 2606 because the condition on line 2605 was never true
2606 wxr.wtp.error(
2607 "Template nesting error: "
2608 "template_nesting = {} "
2609 "couldn't split inflection templates, "
2610 "{}/{} section {}".format(
2611 template_nesting, word, language, section
2612 ),
2613 sortid="page/1896",
2614 )
2615 texts = [text]
2616 for text in texts:
2617 tree = wxr.wtp.parse(
2618 text, expand_all=True, template_fn=inflection_template_fn
2619 )
2621 if not text.strip():
2622 continue
2624 # Parse inflection tables from the section. The data is stored
2625 # under "forms".
2626 if wxr.config.capture_inflections: 2626 ↛ 2616line 2626 didn't jump to line 2616 because the condition on line 2626 was always true
2627 tablecontext = None
2628 m = re.search(r"{{([^}{|]+)\|?", text)
2629 if m:
2630 template_name = m.group(1)
2631 tablecontext = TableContext(template_name)
2633 parse_inflection_section(
2634 wxr,
2635 pos_data,
2636 word,
2637 language,
2638 pos,
2639 section,
2640 tree,
2641 tablecontext=tablecontext,
2642 )
2644 def get_subpage_section(
2645 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
2646 ) -> Optional[Union[WikiNode, str]]:
2647 """Loads a subpage of the given page, and finds the section
2648 for the given language, part-of-speech, and section title. This
2649 is used for finding translations and other sections on subpages."""
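# Hypothetical call shape (illustration only, not from the original
# source): for the page "perro" this could be invoked as
#   get_subpage_section("perro", "translations",
#       [["Spanish", "Etymology 1", "Noun", "Translations"]])
# which loads "perro/translations" and descends through its heading
# levels, matching each title in the seq in order (case-insensitively).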
2650 assert isinstance(language, str)
2651 assert isinstance(title, str)
2652 assert isinstance(subtitle, str)
2653 assert isinstance(seqs, (list, tuple))
2654 for seq in seqs:
2655 for x in seq:
2656 assert isinstance(x, str)
2657 subpage_title = word + "/" + subtitle
2658 subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
2659 if subpage_content is None:
2660 wxr.wtp.error(
2661 "/translations not found despite "
2662 "{{see translation subpage|...}}",
2663 sortid="page/1934",
2664 )
2665 return None
2667 def recurse(
2668 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
2669 ) -> Optional[Union[str, WikiNode]]:
2670 # print(f"seq: {seq}")
2671 if not seq:
2672 return node
2673 if not isinstance(node, WikiNode):
2674 return None
2675 # print(f"node.kind: {node.kind}")
2676 if node.kind in LEVEL_KINDS:
2677 t = clean_node(wxr, None, node.largs[0])
2678 # print(f"t: {t} == seq[0]: {seq[0]}?")
2679 if t.lower() == seq[0].lower():
2680 seq = seq[1:]
2681 if not seq:
2682 return node
2683 for n in node.children:
2684 ret = recurse(n, seq)
2685 if ret is not None:
2686 return ret
2687 return None
2689 tree = wxr.wtp.parse(
2690 subpage_content,
2691 pre_expand=True,
2692 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
2693 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
2694 )
2695 assert tree.kind == NodeKind.ROOT
2696 for seq in seqs:
2697 ret = recurse(tree, seq)
2698 if ret is None:
2699 wxr.wtp.debug(
2700 "Failed to find subpage section {}/{} seq {}".format(
2701 title, subtitle, seq
2702 ),
2703 sortid="page/1963",
2704 )
2705 return ret
2707 def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
2708 """Parses translations for a word. This may also pull in translations
2709 from separate translation subpages."""
2710 assert isinstance(data, dict)
2711 assert isinstance(xlatnode, WikiNode)
2712 # print("===== PARSE_TRANSLATIONS {} {} {}"
2713 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection))
2714 # print("parse_translations xlatnode={}".format(xlatnode))
2715 if not wxr.config.capture_translations: 2715 ↛ 2716line 2715 didn't jump to line 2716 because the condition on line 2715 was never true
2716 return
2717 sense_parts: list[Union[WikiNode, str]] = []
2718 sense: Optional[str] = None
2720 def parse_translation_item(
2721 contents: list[Union[WikiNode, str]], lang: Optional[str] = None
2722 ) -> None:
2723 nonlocal sense
2724 assert isinstance(contents, list)
2725 assert lang is None or isinstance(lang, str)
2726 # print("PARSE_TRANSLATION_ITEM:", contents)
2728 langcode: Optional[str] = None
2729 if sense is None:
2730 sense = clean_node(wxr, data, sense_parts).strip()
2731 # print("sense <- clean_node: ", sense)
2732 idx = sense.find("See also translations at")
2733 if idx > 0: 2733 ↛ 2734line 2733 didn't jump to line 2734 because the condition on line 2733 was never true
2734 wxr.wtp.debug(
2735 "Skipping translation see also: {}".format(sense),
2736 sortid="page/2361",
2737 )
2738 sense = sense[:idx].strip()
2739 if sense.endswith(":"): 2739 ↛ 2740line 2739 didn't jump to line 2740 because the condition on line 2739 was never true
2740 sense = sense[:-1].strip()
2741 if sense.endswith("—"): 2741 ↛ 2742line 2741 didn't jump to line 2742 because the condition on line 2741 was never true
2742 sense = sense[:-1].strip()
2743 translations_from_template: list[str] = []
2745 def translation_item_template_fn(
2746 name: str, ht: TemplateArgs
2747 ) -> Optional[str]:
2748 nonlocal langcode
2749 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht)
2750 if is_panel_template(wxr, name):
2751 return ""
2752 if name in ("t+check", "t-check", "t-needed"):
2753 # We ignore these templates. They seem to have outright
2754 # garbage in some entries, and widely varying formatting in
2755 # others. These should be transitory and unreliable
2756 # anyway.
2757 return "__IGNORE__"
2758 if name in ("t", "t+", "t-simple", "tt", "tt+"):
2759 code = ht.get(1)
2760 if code: 2760 ↛ 2770line 2760 didn't jump to line 2770 because the condition on line 2760 was always true
2761 if langcode and code != langcode:
2762 wxr.wtp.debug(
2763 "inconsistent language codes {} vs "
2764 "{} in translation item: {!r} {}".format(
2765 langcode, code, name, ht
2766 ),
2767 sortid="page/2386",
2768 )
2769 langcode = code
2770 tr = ht.get(2)
2771 if tr:
2772 tr = clean_node(wxr, None, [tr])
2773 translations_from_template.append(tr)
2774 return None
2775 if name == "t-egy":
2776 langcode = "egy"
2777 return None
2778 if name == "ttbc":
2779 code = ht.get(1)
2780 if code: 2780 ↛ 2782line 2780 didn't jump to line 2782 because the condition on line 2780 was always true
2781 langcode = code
2782 return None
2783 if name == "trans-see": 2783 ↛ 2784line 2783 didn't jump to line 2784 because the condition on line 2783 was never true
2784 wxr.wtp.error(
2785 "UNIMPLEMENTED trans-see template", sortid="page/2405"
2786 )
2787 return ""
2788 if name.endswith("-top"): 2788 ↛ 2789line 2788 didn't jump to line 2789 because the condition on line 2788 was never true
2789 return ""
2790 if name.endswith("-bottom"): 2790 ↛ 2791line 2790 didn't jump to line 2791 because the condition on line 2790 was never true
2791 return ""
2792 if name.endswith("-mid"): 2792 ↛ 2793line 2792 didn't jump to line 2793 because the condition on line 2792 was never true
2793 return ""
2794 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}"
2795 # .format(name),
2796 # sortid="page/2414")
2797 return None
2799 sublists = list(
2800 x
2801 for x in contents
2802 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
2803 )
2804 contents = list(
2805 x
2806 for x in contents
2807 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
2808 )
2810 item = clean_node(
2811 wxr, data, contents, template_fn=translation_item_template_fn
2812 )
2813 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense))
2815 # Parse the translation item.
2816 if item: 2816 ↛ exitline 2816 didn't return from function 'parse_translation_item' because the condition on line 2816 was always true
2817 lang = parse_translation_item_text(
2818 wxr,
2819 word,
2820 data,
2821 item,
2822 sense,
2823 lang,
2824 langcode,
2825 translations_from_template,
2826 is_reconstruction,
2827 )
2829 # Handle sublists. They are frequently used for different
2830 # scripts for the language and different variants of the
2831 # language. We will include the lower-level header as a
2832 # tag in those cases.
2833 for listnode in sublists:
2834 assert listnode.kind == NodeKind.LIST
2835 for node in listnode.children:
2836 if not isinstance(node, WikiNode): 2836 ↛ 2837line 2836 didn't jump to line 2837 because the condition on line 2836 was never true
2837 continue
2838 if node.kind == NodeKind.LIST_ITEM: 2838 ↛ 2835line 2838 didn't jump to line 2835 because the condition on line 2838 was always true
2839 parse_translation_item(node.children, lang=lang)
2841 def parse_translation_template(node: WikiNode) -> None:
2842 assert isinstance(node, WikiNode)
2844 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
2845 nonlocal sense_parts
2846 nonlocal sense
2847 if is_panel_template(wxr, name):
2848 return ""
2849 if name == "see also":
2850 # XXX capture
2851 # XXX for example, "/" has top-level list containing
2852 # see also items. So we should also parse those.
2853 return ""
2854 if name == "trans-see":
2855 # XXX capture
2856 return ""
2857 if name == "see translation subpage": 2857 ↛ 2858line 2857 didn't jump to line 2858 because the condition on line 2857 was never true
2858 sense_parts = []
2859 sense = None
2860 sub = ht.get(1, "")
2861 if sub:
2862 m = re.match(
2863 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
2864 )
2865 else:
2866 m = None
2867 etym = ""
2868 etym_numbered = ""
2869 pos = ""
2870 if m:
2871 etym_numbered = m.group(1)
2872 etym = m.group(2)
2873 pos = m.group(3)
2874 if not sub:
2875 wxr.wtp.debug(
2876 "no part-of-speech in "
2877 "{{see translation subpage|...}}, "
2878 "defaulting to just wxr.wtp.section "
2879 "(= language)",
2880 sortid="page/2468",
2881 )
2882 # seq sent to get_subpage_section without sub and pos
2883 seq = [
2884 language,
2885 TRANSLATIONS_TITLE,
2886 ]
2887 elif (
2888 m
2889 and etym.lower().strip() in ETYMOLOGY_TITLES
2890 and pos.lower() in POS_TITLES
2891 ):
2892 seq = [
2893 language,
2894 etym_numbered,
2895 pos,
2896 TRANSLATIONS_TITLE,
2897 ]
2898 elif sub.lower() in POS_TITLES:
2899 # seq with sub but not pos
2900 seq = [
2901 language,
2902 sub,
2903 TRANSLATIONS_TITLE,
2904 ]
2905 else:
2906 # seq with sub and pos
2907 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2908 if pos.lower() not in POS_TITLES:
2909 wxr.wtp.debug(
2910 "unhandled see translation subpage: "
2911 "language={} sub={} "
2912 "wxr.wtp.subsection={}".format(
2913 language, sub, wxr.wtp.subsection
2914 ),
2915 sortid="page/2478",
2916 )
2917 seq = [language, sub, pos, TRANSLATIONS_TITLE]
2918 subnode = get_subpage_section(
2919 wxr.wtp.title or "MISSING_TITLE",
2920 TRANSLATIONS_TITLE,
2921 [seq],
2922 )
2923 if subnode is None or not isinstance(subnode, WikiNode):
2924 # Failed to find the normal subpage section
2925 # seq with sub and pos
2926 pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
2927 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}")
2928 seqs: list[list[str] | tuple[str, ...]] = [
2929 [TRANSLATIONS_TITLE],
2930 [language, pos],
2931 ]
2932 subnode = get_subpage_section(
2933 wxr.wtp.title or "MISSING_TITLE",
2934 TRANSLATIONS_TITLE,
2935 seqs,
2936 )
2937 if subnode is not None and isinstance(subnode, WikiNode):
2938 parse_translations(data, subnode)
2939 return ""
2940 if name in (
2941 "c",
2942 "C",
2943 "categorize",
2944 "cat",
2945 "catlangname",
2946 "topics",
2947 "top",
2948 "qualifier",
2949 "cln",
2950 ):
2951 # These are expanded in the default way
2952 return None
2953 if name in (
2954 "trans-top",
2955 "trans-top-see",
2956 ):
2957 # XXX capture id from trans-top? Capture sense here
2958 # instead of trying to parse it from expanded content?
2959 if ht.get(1):
2960 sense_parts = []
2961 sense = ht.get(1)
2962 else:
2963 sense_parts = []
2964 sense = None
2965 return None
2966 if name in (
2967 "trans-bottom",
2968 "trans-mid",
2969 "checktrans-mid",
2970 "checktrans-bottom",
2971 ):
2972 return None
2973 if name == "checktrans-top":
2974 sense_parts = []
2975 sense = None
2976 return ""
2977 if name == "trans-top-also":
2978 # XXX capture?
2979 sense_parts = []
2980 sense = None
2981 return ""
2982 wxr.wtp.error(
2983 "UNIMPLEMENTED parse_translation_template: {} {}".format(
2984 name, ht
2985 ),
2986 sortid="page/2517",
2987 )
2988 return ""
2990 wxr.wtp.expand(
2991 wxr.wtp.node_to_wikitext(node), template_fn=template_fn
2992 )
2994 def parse_translation_recurse(xlatnode: WikiNode) -> None:
2995 nonlocal sense
2996 nonlocal sense_parts
2997 for node in xlatnode.children:
2998 # print(node)
2999 if isinstance(node, str):
3000 if sense:
3001 if not node.isspace():
3002 wxr.wtp.debug(
3003 "skipping string in the middle of "
3004 "translations: {}".format(node),
3005 sortid="page/2530",
3006 )
3007 continue
3008 # Add a part to the sense
3009 sense_parts.append(node)
3010 sense = None
3011 continue
3012 assert isinstance(node, WikiNode)
3013 kind = node.kind
3014 if kind == NodeKind.LIST:
3015 for item in node.children:
3016 if not isinstance(item, WikiNode): 3016 ↛ 3017line 3016 didn't jump to line 3017 because the condition on line 3016 was never true
3017 continue
3018 if item.kind != NodeKind.LIST_ITEM: 3018 ↛ 3019line 3018 didn't jump to line 3019 because the condition on line 3018 was never true
3019 continue
3020 if item.sarg == ":": 3020 ↛ 3021line 3020 didn't jump to line 3021 because the condition on line 3020 was never true
3021 continue
3022 parse_translation_item(item.children)
3023 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3023 ↛ 3027line 3023 didn't jump to line 3027 because the condition on line 3023 was never true
3024 # Silently skip list items that are just indented; these
3025 # are used for text between translations, such as indicating
3026 # translations that need to be checked.
3027 pass
3028 elif kind == NodeKind.TEMPLATE:
3029 parse_translation_template(node)
3030 elif kind in ( 3030 ↛ 3035line 3030 didn't jump to line 3035 because the condition on line 3030 was never true
3031 NodeKind.TABLE,
3032 NodeKind.TABLE_ROW,
3033 NodeKind.TABLE_CELL,
3034 ):
3035 parse_translation_recurse(node)
3036 elif kind == NodeKind.HTML:
3037 if node.attrs.get("class") == "NavFrame": 3037 ↛ 3043line 3037 didn't jump to line 3043 because the condition on line 3037 was never true
3038 # Reset ``sense_parts`` (and force recomputing
3039 # by clearing ``sense``) as each NavFrame specifies
3040 # its own sense. This helps eliminate garbage coming
3041 # from text at the beginning at the translations
3042 # section.
3043 sense_parts = []
3044 sense = None
3045 # for item in node.children:
3046 # if not isinstance(item, WikiNode):
3047 # continue
3048 # parse_translation_recurse(item)
3049 parse_translation_recurse(node)
3050 elif kind in LEVEL_KINDS: 3050 ↛ 3052line 3050 didn't jump to line 3052 because the condition on line 3050 was never true
3051 # Sub-levels will be recursed elsewhere
3052 pass
3053 elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
3054 parse_translation_recurse(node)
3055 elif kind == NodeKind.PREFORMATTED: 3055 ↛ 3056line 3055 didn't jump to line 3056 because the condition on line 3055 was never true
3056 print("parse_translation_recurse: PREFORMATTED:", node)
3057 elif kind == NodeKind.LINK: 3057 ↛ 3111line 3057 didn't jump to line 3111 because the condition on line 3057 was always true
3058 arg0 = node.largs[0]
3059 # Kludge: I've seen occasional normal links to translation
3060 # subpages from main pages (e.g., language/English/Noun
3061 # in July 2021) instead of the normal
3062 # {{see translation subpage|...}} template. This should
3063 # handle them. Note: must be careful not to read other
3064 # links, particularly things like in "human being":
3065 # "a human being -- see [[man/translations]]" (group title)
3066 if ( 3066 ↛ 3074line 3066 didn't jump to line 3074 because the condition on line 3066 was never true
3067 isinstance(arg0, (list, tuple))
3068 and arg0
3069 and isinstance(arg0[0], str)
3070 and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
3071 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
3072 == wxr.wtp.title
3073 ):
3074 wxr.wtp.debug(
3075 "translations subpage link found on main "
3076 "page instead "
3077 "of normal {{see translation subpage|...}}",
3078 sortid="page/2595",
3079 )
3080 sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
3081 if sub.lower() in POS_TITLES:
3082 seq = [
3083 language,
3084 sub,
3085 TRANSLATIONS_TITLE,
3086 ]
3087 subnode = get_subpage_section(
3088 wxr.wtp.title,
3089 TRANSLATIONS_TITLE,
3090 [seq],
3091 )
3092 if subnode is not None and isinstance(
3093 subnode, WikiNode
3094 ):
3095 parse_translations(data, subnode)
3096 else:
3097 wxr.wtp.error(
3098 "/translations link outside part-of-speech"
3099 )
3101 if (
3102 len(arg0) >= 1
3103 and isinstance(arg0[0], str)
3104 and not arg0[0].lower().startswith("category:")
3105 ):
3106 for x in node.largs[-1]:
3107 if isinstance(x, str): 3107 ↛ 3110line 3107 didn't jump to line 3110 because the condition on line 3107 was always true
3108 sense_parts.append(x)
3109 else:
3110 parse_translation_recurse(x)
3111 elif not sense:
3112 sense_parts.append(node)
3113 else:
3114 wxr.wtp.debug(
3115 "skipping text between translation items/senses: "
3116 "{}".format(node),
3117 sortid="page/2621",
3118 )
3120 # Main code of parse_translations(). We want ``sense`` to be assigned
3121 # regardless of recursion levels, and thus the code is structured
3122 # to define at this level and recurse in parse_translation_recurse().
3123 parse_translation_recurse(xlatnode)
3125 def parse_etymology(data: WordData, node: LevelNode) -> None:
3126 """Parses an etymology section."""
3127 assert isinstance(data, dict)
3128 assert isinstance(node, WikiNode)
3130 templates: list[TemplateData] = []
3132 # Counter for preventing the capture of etymology templates
3133 # when we are inside templates that we want to ignore (i.e.,
3134 # not capture).
3135 ignore_count = 0
3137 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3138 nonlocal ignore_count
3139 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
3140 return ""
3141 if re.match(ignored_etymology_templates_re, name):
3142 ignore_count += 1
3143 return None
3145 # CONTINUE_HERE
3147 def etym_post_template_fn(
3148 name: str, ht: TemplateArgs, expansion: str
3149 ) -> None:
3150 nonlocal ignore_count
3151 if name in wikipedia_templates:
3152 parse_wikipedia_template(wxr, data, ht)
3153 return None
3154 if re.match(ignored_etymology_templates_re, name):
3155 ignore_count -= 1
3156 return None
3157 if ignore_count == 0: 3157 ↛ 3163line 3157 didn't jump to line 3163 because the condition on line 3157 was always true
3158 ht = clean_template_args(wxr, ht)
3159 expansion = clean_node(wxr, None, expansion)
3160 templates.append(
3161 {"name": name, "args": ht, "expansion": expansion}
3162 )
3163 return None
3165 # Remove any subsections
3166 contents = list(
3167 x
3168 for x in node.children
3169 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS
3170 )
3171 # Convert to text, also capturing templates using post_template_fn
3172 text = clean_node(
3173 wxr,
3174 None,
3175 contents,
3176 template_fn=etym_template_fn,
3177 post_template_fn=etym_post_template_fn,
3178 ).strip(": \n") # remove ":" indent wikitext before zh-x template
3179 # Save the collected information.
3180 if len(text) > 0:
3181 data["etymology_text"] = text
3182 if len(templates) > 0:
3183 # Some etymology templates, like Template:root, do not generate
3184 # text, so they should be added here. Elsewhere, we check
3185 # for Template:root and add some text to the expansion to please
3186 # the validation.
3187 data["etymology_templates"] = templates
3189 for child_node in node.find_child_recursively( 3189 ↛ exitline 3189 didn't return from function 'parse_etymology' because the loop on line 3189 didn't complete
3190 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
3191 ):
3192 if child_node.kind in LEVEL_KIND_FLAGS:
3193 break
3194 elif isinstance( 3194 ↛ 3197line 3194 didn't jump to line 3197 because the condition on line 3194 was never true
3195 child_node, TemplateNode
3196 ) and child_node.template_name in ["zh-x", "zh-q"]:
3197 if "etymology_examples" not in data:
3198 data["etymology_examples"] = []
3199 data["etymology_examples"].extend(
3200 extract_template_zh_x(
3201 wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
3202 )
3203 )
3205 def process_children(treenode: WikiNode, pos: Optional[str]) -> None:
3206 """This recurses into a subtree in the parse tree for a page."""
3207 nonlocal etym_data
3208 nonlocal pos_data
3209 nonlocal inside_level_four
3211 redirect_list: list[str] = [] # for `zh-see` template
3213 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3214 """This is called for otherwise unprocessed parts of the page.
3215 We still expand them so that e.g. Category links get captured."""
3216 if name in wikipedia_templates:
3217 data = select_data()
3218 parse_wikipedia_template(wxr, data, ht)
3219 return None
3220 if is_panel_template(wxr, name):
3221 return ""
3222 return None
3224 for node in treenode.children:
3225 if not isinstance(node, WikiNode):
3226 # print(" X{}".format(repr(node)[:40]))
3227 continue
3228 if isinstance(node, TemplateNode):
3229 if process_soft_redirect_template(wxr, node, redirect_list):
3230 continue
3231 elif node.template_name == "zh-forms":
3232 extract_zh_forms_template(wxr, node, select_data())
3233 elif (
3234 node.template_name.endswith("-kanjitab")
3235 or node.template_name == "ja-kt"
3236 ):
3237 extract_ja_kanjitab_template(wxr, node, select_data())
3239 if not isinstance(node, LevelNode):
3240 # XXX handle e.g. wikipedia links at the top of a language
3241 # XXX should at least capture "also" at top of page
3242 if node.kind in (
3243 NodeKind.HLINE,
3244 NodeKind.LIST,
3245 NodeKind.LIST_ITEM,
3246 ):
3247 continue
3248 # print(" UNEXPECTED: {}".format(node))
3249 # Clean the node to collect category links
3250 clean_node(wxr, etym_data, node, template_fn=skip_template_fn)
3251 continue
3252 t = clean_node(
3253 wxr, etym_data, node.sarg if node.sarg else node.largs
3254 )
3255 t = t.lower()
3256 # XXX these counts were never implemented fully, and even this
3257 # gets discarded: Search STATISTICS_IMPLEMENTATION
3258 wxr.config.section_counts[t] += 1
3259 # print("PROCESS_CHILDREN: T:", repr(t))
3260 if t in IGNORED_TITLES:
3261 pass
3262 elif t.startswith(PRONUNCIATION_TITLE):
3263 # Chinese Pronunciation section kludge; we demote these to
3264 # be level 4 instead of 3 so that they're part of a larger
3265 # etymology hierarchy; usually the data here is empty and
3266 # acts as an intermediary between POS and Etymology data
3267 if lang_code in ("zh",):
3268 inside_level_four = True
3269 if t.startswith(PRONUNCIATION_TITLE + " "):
3270 # Pronunciation 1, etc, are used in Chinese Glyphs,
3271 # and each of them may have senses under Definition
3272 push_level_four_section(True)
3273 wxr.wtp.start_subsection(None)
3274 if wxr.config.capture_pronunciation: 3274 ↛ 3382line 3274 didn't jump to line 3382 because the condition on line 3274 was always true
3275 data = select_data()
3276 parse_pronunciation(
3277 wxr,
3278 node,
3279 data,
3280 etym_data,
3281 have_etym,
3282 base_data,
3283 lang_code,
3284 )
3285 elif t.startswith(tuple(ETYMOLOGY_TITLES)):
3286 push_etym()
3287 wxr.wtp.start_subsection(None)
3288 if wxr.config.capture_etymologies: 3288 ↛ 3382line 3288 didn't jump to line 3382 because the condition on line 3288 was always true
3289 m = re.search(r"\s(\d+)$", t)
3290 if m:
3291 etym_data["etymology_number"] = int(m.group(1))
3292 parse_etymology(etym_data, node)
3293 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants:
3294 data = select_data()
3295 extract_descendant_section(wxr, data, node, False)
3296 elif (
3297 t in PROTO_ROOT_DERIVED_TITLES
3298 and pos == "root"
3299 and is_reconstruction
3300 and wxr.config.capture_descendants
3301 ):
3302 data = select_data()
3303 extract_descendant_section(wxr, data, node, True)
3304 elif t == TRANSLATIONS_TITLE:
3305 data = select_data()
3306 parse_translations(data, node)
3307 elif t in INFLECTION_TITLES:
3308 parse_inflection(node, t, pos)
3309 elif t == "alternative forms":
3310 extract_alt_form_section(wxr, select_data(), node)
3311 else:
3312 lst = t.split()
3313 while len(lst) > 1 and lst[-1].isdigit(): 3313 ↛ 3314line 3313 didn't jump to line 3314 because the condition on line 3313 was never true
3314 lst = lst[:-1]
3315 t_no_number = " ".join(lst).lower()
3316 if t_no_number in POS_TITLES:
3317 push_pos()
3318 dt = POS_TITLES[t_no_number] # type:ignore[literal-required]
3319 pos = dt["pos"] or "MISSING_POS"
3320 wxr.wtp.start_subsection(t)
3321 if "debug" in dt:
3322 wxr.wtp.debug(
3323 "{} in section {}".format(dt["debug"], t),
3324 sortid="page/2755",
3325 )
3326 if "warning" in dt: 3326 ↛ 3327line 3326 didn't jump to line 3327 because the condition on line 3326 was never true
3327 wxr.wtp.wiki_notice(
3328 "{} in section {}".format(dt["warning"], t),
3329 sortid="page/2759",
3330 )
3331 if "error" in dt: 3331 ↛ 3332line 3331 didn't jump to line 3332 because the condition on line 3331 was never true
3332 wxr.wtp.error(
3333 "{} in section {}".format(dt["error"], t),
3334 sortid="page/2763",
3335 )
3336 if "note" in dt: 3336 ↛ 3337line 3336 didn't jump to line 3337 because the condition on line 3336 was never true
3337 wxr.wtp.note(
3338 "{} in section {}".format(dt["note"], t),
3339 sortid="page/20251017a",
3340 )
3341 if "wiki_notice" in dt: 3341 ↛ 3342line 3341 didn't jump to line 3342 because the condition on line 3341 was never true
3342 wxr.wtp.wiki_notice(
3343                            "{} in section {}".format(dt["wiki_notice"], t),
3344 sortid="page/20251017b",
3345 )
3346 # Parse word senses for the part-of-speech
3347 parse_part_of_speech(node, pos)
3348 if "tags" in dt:
3349 for pdata in sense_datas:
3350 data_extend(pdata, "tags", dt["tags"])
3351 elif t_no_number in LINKAGE_TITLES:
3352 # print(f"LINKAGE_TITLES NODE {node=}")
3353 rel = LINKAGE_TITLES[t_no_number]
3354 data = select_data()
3355 parse_linkage(
3356 wxr,
3357 data,
3358 rel,
3359 node,
3360 word,
3361 sense_datas,
3362 is_reconstruction,
3363 )
3364 elif t_no_number == COMPOUNDS_TITLE:
3365 data = select_data()
3366 if wxr.config.capture_compounds: 3366 ↛ 3382line 3366 didn't jump to line 3382 because the condition on line 3366 was always true
3367 parse_linkage(
3368 wxr,
3369 data,
3370 "derived",
3371 node,
3372 word,
3373 sense_datas,
3374 is_reconstruction,
3375 )
3377 # XXX parse interesting templates also from other sections. E.g.,
3378 # {{Letter|...}} in ===See also===
3379 # Also <gallery>
3381 # Recurse to children of this node, processing subtitles therein
3382 stack.append(t)
3383 process_children(node, pos)
3384 stack.pop()
3386 if len(redirect_list) > 0:
3387 if len(pos_data) > 0:
3388 pos_data["redirects"] = redirect_list
3389 if "pos" not in pos_data: 3389 ↛ 3390line 3389 didn't jump to line 3390 because the condition on line 3389 was never true
3390 pos_data["pos"] = "soft-redirect"
3391 else:
3392 new_page_data = copy.deepcopy(base_data)
3393 new_page_data["redirects"] = redirect_list
3394 if "pos" not in new_page_data: 3394 ↛ 3396line 3394 didn't jump to line 3396 because the condition on line 3394 was always true
3395 new_page_data["pos"] = "soft-redirect"
3396 new_page_data["senses"] = [{"tags": ["no-gloss"]}]
3397 page_datas.append(new_page_data)
3399 def extract_examples(
3400 others: list[WikiNode], sense_base: SenseData
3401 ) -> list[ExampleData]:
3402 """Parses through a list of definitions and quotes to find examples.
3403 Returns a list of example dicts to be added to sense data. Adds
3404 meta-data, mostly categories, into sense_base."""
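        # Rough sketch of the intended mapping (hypothetical input, assuming
        # {{ux}} is listed in usex_templates): a list item such as
        #   #: {{ux|fi|Tämä on esimerkki.|This is an example.}}
        # should come out as an ExampleData dict roughly like
        #   {"text": "Tämä on esimerkki.", "english": "This is an example.",
        #    "translation": "This is an example.", "type": "example"},
        # with the exact fields depending on the heuristics below.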
3405 assert isinstance(others, list)
3406 examples: list[ExampleData] = []
3408 for sub in others:
3409 if not sub.sarg.endswith((":", "*")): 3409 ↛ 3410line 3409 didn't jump to line 3410 because the condition on line 3409 was never true
3410 continue
3411 for item in sub.children:
3412 if not isinstance(item, WikiNode): 3412 ↛ 3413line 3412 didn't jump to line 3413 because the condition on line 3412 was never true
3413 continue
3414 if item.kind != NodeKind.LIST_ITEM: 3414 ↛ 3415line 3414 didn't jump to line 3415 because the condition on line 3414 was never true
3415 continue
3416 usex_type = None
3417 example_template_args = []
3418 example_template_names = []
3419 taxons = set()
3421                # Chinese, Japanese and quotation templates are handled by
3422                # extract_example_list_item below, bypassing the rest of
                # this function.
3423 new_example_lists = extract_example_list_item(
3424 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[])
3425 )
3426 if len(new_example_lists) > 0:
3427 examples.extend(new_example_lists)
3428 continue
3430 def usex_template_fn(
3431 name: str, ht: TemplateArgs
3432 ) -> Optional[str]:
3433 nonlocal usex_type
3434 if is_panel_template(wxr, name):
3435 return ""
3436 if name in usex_templates:
3437 usex_type = "example"
3438 example_template_args.append(ht)
3439 example_template_names.append(name)
3440 elif name in quotation_templates:
3441 usex_type = "quotation"
3442 elif name in taxonomy_templates: 3442 ↛ 3443line 3442 didn't jump to line 3443 because the condition on line 3442 was never true
3443 taxons.update(ht.get(1, "").split())
3444 for prefix in template_linkages_to_ignore_in_examples:
3445 if re.search(
3446 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name
3447 ):
3448 return ""
3449 return None
3451 # bookmark
3452 ruby: list[tuple[str, str]] = []
3453 contents = item.children
3454 if lang_code == "ja":
3455 # Capture ruby contents if this is a Japanese language
3456 # example.
3457 # print(contents)
3458 if ( 3458 ↛ 3463line 3458 didn't jump to line 3463 because the condition on line 3458 was never true
3459 contents
3460                        and isinstance(contents[0], str)
3461 and re.match(r"\s*$", contents[0])
3462 ):
3463 contents = contents[1:]
3464 exp = wxr.wtp.parse(
3465 wxr.wtp.node_to_wikitext(contents),
3466 # post_template_fn=head_post_template_fn,
3467 expand_all=True,
3468 )
3469 rub, rest = extract_ruby(wxr, exp.children)
3470 if rub:
3471 for rtup in rub:
3472 ruby.append(rtup)
3473 contents = rest
3474 subtext = clean_node(
3475 wxr, sense_base, contents, template_fn=usex_template_fn
3476 )
3478 frozen_taxons = frozenset(taxons)
3479 classify_desc2 = partial(classify_desc, accepted=frozen_taxons)
3481 # print(f"{subtext=}")
3482 subtext = re.sub(
3483 r"\s*\(please add an English "
3484 r"translation of this "
3485 r"(example|usage example|quote)\)",
3486 "",
3487 subtext,
3488 ).strip()
3489 subtext = re.sub(r"\^\([^)]*\)", "", subtext)
3490 subtext = re.sub(r"\s*[―—]+$", "", subtext)
3491 # print("subtext:", repr(subtext))
3493 lines = subtext.splitlines()
3494 # print(lines)
3496 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines)
3497 lines = list(
3498 x
3499 for x in lines
3500 if not re.match(
3501 r"(Synonyms: |Antonyms: |Hyponyms: |"
3502 r"Synonym: |Antonym: |Hyponym: |"
3503 r"Hypernyms: |Derived terms: |"
3504 r"Related terms: |"
3505 r"Hypernym: |Derived term: |"
3506 r"Coordinate terms:|"
3507 r"Related term: |"
3508 r"For more quotations using )",
3509 x,
3510 )
3511 )
3512 tr = ""
3513 ref = ""
3514 roman = ""
3515 # for line in lines:
3516 # print("LINE:", repr(line))
3517 # print(classify_desc(line))
3518 if len(lines) == 1 and lang_code != "en":
3519 parts = example_splitter_re.split(lines[0])
3520 if ( 3520 ↛ 3528line 3520 didn't jump to line 3528 because the condition on line 3520 was never true
3521 len(parts) > 2
3522 and len(example_template_args) == 1
3523 and any(
3524 ("―" in s) or ("—" in s)
3525 for s in example_template_args[0].values()
3526 )
3527 ):
3528 if nparts := synch_splits_with_args(
3529 lines[0], example_template_args[0]
3530 ):
3531 parts = nparts
3532 if ( 3532 ↛ 3537line 3532 didn't jump to line 3537 because the condition on line 3532 was never true
3533 len(example_template_args) == 1
3534 and "lit" in example_template_args[0]
3535 ):
3536 # ugly brute-force kludge in case there's a lit= arg
3537 literally = example_template_args[0].get("lit", "")
3538 if literally:
3539 literally = (
3540 " (literally, “"
3541 + clean_value(wxr, literally)
3542 + "”)"
3543 )
3544 else:
3545 literally = ""
3546 if ( 3546 ↛ 3585line 3546 didn't jump to line 3585 because the condition on line 3546 was never true
3547 len(example_template_args) == 1
3548 and len(parts) == 2
3549 and len(example_template_args[0])
3550 - (
3551 # horrible kludge to ignore these arguments
3552 # when calculating how many there are
3553 sum(
3554 s in example_template_args[0]
3555 for s in (
3556 "lit", # generates text, but we handle it
3557 "inline",
3558 "noenum",
3559 "nocat",
3560 "sort",
3561 )
3562 )
3563 )
3564 == 3
3565 and clean_value(
3566 wxr, example_template_args[0].get(2, "")
3567 )
3568 == parts[0].strip()
3569 and clean_value(
3570 wxr,
3571 (
3572 example_template_args[0].get(3)
3573 or example_template_args[0].get("translation")
3574 or example_template_args[0].get("t", "")
3575 )
3576 + literally, # in case there's a lit= argument
3577 )
3578 == parts[1].strip()
3579 ):
3580 # {{exampletemplate|ex|Foo bar baz|English translation}}
3581 # is a pretty reliable 'heuristic', so we use it here
3582 # before the others. To be extra sure the template
3583 # doesn't do anything weird, we compare the arguments
3584 # and the output to each other.
3585 lines = [parts[0].strip()]
3586 tr = parts[1].strip()
3587 elif (
3588 len(parts) == 2
3589 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3590 ):
3591 # These other branches just do some simple heuristics w/
3592 # the expanded output of the template (if applicable).
3593 lines = [parts[0].strip()]
3594 tr = parts[1].strip()
3595 elif ( 3595 ↛ 3601line 3595 didn't jump to line 3601 because the condition on line 3595 was never true
3596 len(parts) == 3
3597 and classify_desc2(parts[1])
3598 in ("romanization", "english")
3599 and classify_desc2(parts[2]) in ENGLISH_TEXTS
3600 ):
3601 lines = [parts[0].strip()]
3602 roman = parts[1].strip()
3603 tr = parts[2].strip()
3604 else:
3605 parts = re.split(r"\s+-\s+", lines[0])
3606 if ( 3606 ↛ 3610line 3606 didn't jump to line 3610 because the condition on line 3606 was never true
3607 len(parts) == 2
3608 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3609 ):
3610 lines = [parts[0].strip()]
3611 tr = parts[1].strip()
3612 elif len(lines) > 1:
3613 if any(
3614 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1]
3615 ) and not (len(example_template_names) == 1):
3616 refs: list[str] = []
3617 for i in range(len(lines)): 3617 ↛ 3623line 3617 didn't jump to line 3623 because the loop on line 3617 didn't complete
3618 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3618 ↛ 3619line 3618 didn't jump to line 3619 because the condition on line 3618 was never true
3619 break
3620 refs.append(lines[i].strip())
3621 if re.search(r"[]\d:)]\s*$", lines[i]):
3622 break
3623 ref = " ".join(refs)
3624 lines = lines[i + 1 :]
3625 if (
3626 lang_code != "en"
3627 and len(lines) >= 2
3628 and classify_desc2(lines[-1]) in ENGLISH_TEXTS
3629 ):
3630 i = len(lines) - 1
3631 while ( 3631 ↛ 3636line 3631 didn't jump to line 3636 because the condition on line 3631 was never true
3632 i > 1
3633 and classify_desc2(lines[i - 1])
3634 in ENGLISH_TEXTS
3635 ):
3636 i -= 1
3637 tr = "\n".join(lines[i:])
3638 lines = lines[:i]
3639 if len(lines) >= 2:
3640 if classify_desc2(lines[-1]) == "romanization":
3641 roman = lines[-1].strip()
3642 lines = lines[:-1]
3644 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]):
3645 ref = lines[0]
3646 lines = lines[1:]
3647 elif lang_code != "en" and len(lines) == 2:
3648 cls1 = classify_desc2(lines[0])
3649 cls2 = classify_desc2(lines[1])
3650 if cls2 in ENGLISH_TEXTS and cls1 != "english":
3651 tr = lines[1]
3652 lines = [lines[0]]
3653 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3653 ↛ 3654line 3653 didn't jump to line 3654 because the condition on line 3653 was never true
3654 tr = lines[0]
3655 lines = [lines[1]]
3656 elif ( 3656 ↛ 3663line 3656 didn't jump to line 3663 because the condition on line 3656 was never true
3657 re.match(r"^[#*]*:+", lines[1])
3658 and classify_desc2(
3659 re.sub(r"^[#*:]+\s*", "", lines[1])
3660 )
3661 in ENGLISH_TEXTS
3662 ):
3663 tr = re.sub(r"^[#*:]+\s*", "", lines[1])
3664 lines = [lines[0]]
3665 elif cls1 == "english" and cls2 in ENGLISH_TEXTS:
3666 # Both were classified as English, but
3667 # presumably one is not. Assume first is
3668 # non-English, as that seems more common.
3669 tr = lines[1]
3670 lines = [lines[0]]
3671 elif (
3672 usex_type != "quotation"
3673 and lang_code != "en"
3674 and len(lines) == 3
3675 ):
3676 cls1 = classify_desc2(lines[0])
3677 cls2 = classify_desc2(lines[1])
3678 cls3 = classify_desc2(lines[2])
3679 if (
3680 cls3 == "english"
3681 and cls2 in ("english", "romanization")
3682 and cls1 != "english"
3683 ):
3684 tr = lines[2].strip()
3685 roman = lines[1].strip()
3686 lines = [lines[0].strip()]
3687 elif ( 3687 ↛ 3695line 3687 didn't jump to line 3695 because the condition on line 3687 was never true
3688 usex_type == "quotation"
3689 and lang_code != "en"
3690 and len(lines) > 2
3691 ):
3692 # for x in lines:
3693 # print(" LINE: {}: {}"
3694 # .format(classify_desc2(x), x))
3695 if re.match(r"^[#*]*:+\s*$", lines[1]):
3696 ref = lines[0]
3697 lines = lines[2:]
3698 cls1 = classify_desc2(lines[-1])
3699 if cls1 == "english":
3700 i = len(lines) - 1
3701 while (
3702 i > 1
3703 and classify_desc2(lines[i - 1])
3704                            in ENGLISH_TEXTS
3705 ):
3706 i -= 1
3707 tr = "\n".join(lines[i:])
3708 lines = lines[:i]
3710 roman = re.sub(r"[ \t\r]+", " ", roman).strip()
3711 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman)
3712 tr = re.sub(r"^[#*:]+\s*", "", tr)
3713 tr = re.sub(r"[ \t\r]+", " ", tr).strip()
3714 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr)
3715 ref = re.sub(r"^[#*:]+\s*", "", ref)
3716 ref = re.sub(
3717 r", (volume |number |page )?“?"
3718 r"\(please specify ([^)]|\(s\))*\)”?|"
3719 ", text here$",
3720 "",
3721 ref,
3722 )
3723 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref)
3724 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines)
3725 subtext = "\n".join(x for x in lines if x)
3726 if not tr and lang_code != "en":
3727 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext)
3728 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3728 ↛ 3729line 3728 didn't jump to line 3729 because the condition on line 3728 was never true
3729 tr = m.group(2)
3730 subtext = subtext[: m.start()] + m.group(1)
3731 elif lines:
3732 parts = re.split(r"\s*[―—]+\s*", lines[0])
3733 if ( 3733 ↛ 3737line 3733 didn't jump to line 3737 because the condition on line 3733 was never true
3734 len(parts) == 2
3735 and classify_desc2(parts[1]) in ENGLISH_TEXTS
3736 ):
3737 subtext = parts[0].strip()
3738 tr = parts[1].strip()
3739 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext)
3740 subtext = re.sub(
3741 r"(please add an English translation of "
3742 r"this (quote|usage example))",
3743 "",
3744 subtext,
3745 )
3746 subtext = re.sub(
3747 r"\s*→New International Version " "translation$",
3748 "",
3749 subtext,
3750 ) # e.g. pis/Tok Pisin (Bible)
3751 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip()
3752 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext)
3753 note = None
3754 m = re.match(r"^\(([^)]*)\):\s+", subtext)
3755 if ( 3755 ↛ 3763line 3755 didn't jump to line 3763 because the condition on line 3755 was never true
3756 m is not None
3757 and lang_code != "en"
3758 and (
3759 m.group(1).startswith("with ")
3760 or classify_desc2(m.group(1)) == "english"
3761 )
3762 ):
3763 note = m.group(1)
3764 subtext = subtext[m.end() :]
3765 ref = re.sub(r"\s*\(→ISBN\)", "", ref)
3766 ref = re.sub(r",\s*→ISBN", "", ref)
3767 ref = ref.strip()
3768 if ref.endswith(":") or ref.endswith(","):
3769 ref = ref[:-1].strip()
3770 ref = re.sub(r"\s+,\s+", ", ", ref)
3771 ref = re.sub(r"\s+", " ", ref)
3772 if ref and not subtext: 3772 ↛ 3773line 3772 didn't jump to line 3773 because the condition on line 3772 was never true
3773 subtext = ref
3774 ref = ""
3775 if subtext:
3776 dt: ExampleData = {"text": subtext}
3777 if ref:
3778 dt["ref"] = ref
3779 if tr:
3780 dt["english"] = tr # DEPRECATED for "translation"
3781 dt["translation"] = tr
3782 if usex_type:
3783 dt["type"] = usex_type
3784 if note: 3784 ↛ 3785line 3784 didn't jump to line 3785 because the condition on line 3784 was never true
3785 dt["note"] = note
3786 if roman:
3787 dt["roman"] = roman
3788 if ruby:
3789 dt["ruby"] = ruby
3790 examples.append(dt)
3792 return examples
3794 # Main code of parse_language()
3795 # Process the section
3796 stack.append(language)
3797 process_children(langnode, None)
3798 stack.pop()
3800    # Finalize word entries
3801 push_etym()
3802 ret = []
3803 for data in page_datas:
3804 merge_base(data, base_data)
3805 ret.append(data)
3807 # Copy all tags to word senses
3808 for data in ret:
3809 if "senses" not in data: 3809 ↛ 3810line 3809 didn't jump to line 3810 because the condition on line 3809 was never true
3810 continue
3811        # WordData should not have a 'tags' field, but if it does, it is
3812        # deleted and its contents are moved into each sense; that's why
3813        # the type: ignore comments below.
3814 tags: Iterable = data.get("tags", ()) # type: ignore[assignment]
3815 if "tags" in data:
3816 del data["tags"] # type: ignore[typeddict-item]
3817 for sense in data["senses"]:
3818 data_extend(sense, "tags", tags)
3820 return ret
3823def parse_wikipedia_template(
3824 wxr: WiktextractContext, data: WordData, ht: TemplateArgs
3825) -> None:
3826 """Helper function for parsing {{wikipedia|...}} and related templates."""
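    # For example (hypothetical arguments), {{wikipedia|lang=fr|Paris}} on the
    # page "Paris" appends "fr:Paris" to data["wikipedia"], while a bare
    # {{wikipedia}} appends just the page title.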
3827 assert isinstance(wxr, WiktextractContext)
3828 assert isinstance(data, dict)
3829 assert isinstance(ht, dict)
3830 langid = clean_node(wxr, data, ht.get("lang", ()))
3831 pagename = (
3832 clean_node(wxr, data, ht.get(1, ()))
3833 or wxr.wtp.title
3834 or "MISSING_PAGE_TITLE"
3835 )
3836 if langid:
3837 data_append(data, "wikipedia", langid + ":" + pagename)
3838 else:
3839 data_append(data, "wikipedia", pagename)
3842def parse_top_template(
3843 wxr: WiktextractContext, node: WikiNode, data: WordData
3844) -> None:
3845 """Parses a template that occurs on the top-level in a page, before any
3846 language subtitles."""
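    # Typical top-of-page templates seen here are {{also|...}}, {{wikipedia}},
    # {{character info}} and {{wikidata|...}}; e.g. a hypothetical
    # {{wikidata|Q64}} would append "Q64" to data["wikidata"], while most of
    # the others are currently swallowed (expanded to "").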
3847 assert isinstance(wxr, WiktextractContext)
3848 assert isinstance(node, WikiNode)
3849 assert isinstance(data, dict)
3851 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
3852 if name in wikipedia_templates:
3853 parse_wikipedia_template(wxr, data, ht)
3854 return None
3855 if is_panel_template(wxr, name):
3856 return ""
3857 if name in ("reconstruction",): 3857 ↛ 3858line 3857 didn't jump to line 3858 because the condition on line 3857 was never true
3858 return ""
3859 if name.lower() == "also" or name.lower().startswith("also/"):
3860 # XXX shows related words that might really have been the intended
3861 # word, capture them
3862 return ""
3863 if name == "see also": 3863 ↛ 3865line 3863 didn't jump to line 3865 because the condition on line 3863 was never true
3864 # XXX capture
3865 return ""
3866 if name == "cardinalbox": 3866 ↛ 3868line 3866 didn't jump to line 3868 because the condition on line 3866 was never true
3867 # XXX capture
3868 return ""
3869 if name == "character info": 3869 ↛ 3871line 3869 didn't jump to line 3871 because the condition on line 3869 was never true
3870 # XXX capture
3871 return ""
3872 if name == "commonscat": 3872 ↛ 3874line 3872 didn't jump to line 3874 because the condition on line 3872 was never true
3873 # XXX capture link to Wikimedia commons
3874 return ""
3875 if name == "wrongtitle": 3875 ↛ 3878line 3875 didn't jump to line 3878 because the condition on line 3875 was never true
3876 # XXX this should be captured to replace page title with the
3877 # correct title. E.g. ⿰亻革家
3878 return ""
3879 if name == "wikidata": 3879 ↛ 3880line 3879 didn't jump to line 3880 because the condition on line 3879 was never true
3880 arg = clean_node(wxr, data, ht.get(1, ()))
3881 if arg.startswith("Q") or arg.startswith("Lexeme:L"):
3882 data_append(data, "wikidata", arg)
3883 return ""
3884 wxr.wtp.debug(
3885 "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
3886 sortid="page/2870",
3887 )
3888 return ""
3890 clean_node(wxr, None, [node], template_fn=top_template_fn)
3893def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
3894    """Fix the subtitle hierarchy to be strictly Language -> Etymology ->
3895    Pronunciation -> Part-of-Speech -> Translation/Linkage. Also merge
3896    Etymology sections that are next to each other."""
3898 # Wiktextract issue #620, Chinese Glyph Origin before an etymology
3899    # section gets overwritten. In this case, let's just combine the two.
3901 # In Chinese entries, Pronunciation can be preceded on the
3902 # same level 3 by its Etymology *and* Glyph Origin sections:
3903 # ===Glyph Origin===
3904 # ===Etymology===
3905 # ===Pronunciation===
3906 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
3907 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
3908 # are now level 6
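    # Sketch of the remapping on a hypothetical Chinese entry (assuming
    # "Glyph origin" is listed in ETYMOLOGY_TITLES):
    #   ==Chinese==            stays level 2
    #   ===Glyph origin===     retitled ===Etymology=== and merged with the
    #   ===Etymology===        following Etymology section (heading dropped)
    #   ===Pronunciation===    becomes ====Pronunciation====       (level 4)
    #   ===Noun===             becomes =====Noun=====              (level 5)
    #   ====Translations====   becomes ======Translations======    (level 6)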
3910    # Known lowercase PoS names are in POS_TITLES
3911    # Known lowercase linkage section names are in LINKAGE_TITLES
3913 old = re.split(
3914 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
3915 )
3917 parts = []
3918 npar = 4 # Number of parentheses in above expression
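    # E.g. re.split over "intro\n===Noun===\nbody" yields roughly
    #   ["intro\n", "===", "Noun", "n", "===", "\nbody"],
    # so old[i] is the left "=" run, old[i + 1] the raw title, old[i + 3] the
    # right "=" run, and old[i + npar] the section body following the heading
    # (old[i + 2] is only the pattern's inner helper group).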
3919 parts.append(old[0])
3920 prev_level = None
3921 level = None
3922 skip_level_title = False # When combining etymology sections
3923 for i in range(1, len(old), npar + 1):
3924 left = old[i]
3925 right = old[i + npar - 1]
3926 # remove Wikilinks in title
3927 title = re.sub(r"^\[\[", "", old[i + 1])
3928 title = re.sub(r"\]\]$", "", title)
3929 prev_level = level
3930 level = len(left)
3931 part = old[i + npar]
3932 if level != len(right): 3932 ↛ 3933line 3932 didn't jump to line 3933 because the condition on line 3932 was never true
3933 wxr.wtp.debug(
3934 "subtitle has unbalanced levels: "
3935 "{!r} has {} on the left and {} on the right".format(
3936 title, left, right
3937 ),
3938 sortid="page/2904",
3939 )
3940 lc = title.lower()
3941 if name_to_code(title, "en") != "":
3942 if level > 2: 3942 ↛ 3943line 3942 didn't jump to line 3943 because the condition on line 3942 was never true
3943 wxr.wtp.debug(
3944 "subtitle has language name {} at level {}".format(
3945 title, level
3946 ),
3947 sortid="page/2911",
3948 )
3949 level = 2
3950 elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
3951 if level > 3: 3951 ↛ 3952line 3951 didn't jump to line 3952 because the condition on line 3951 was never true
3952 wxr.wtp.debug(
3953 "etymology section {} at level {}".format(title, level),
3954 sortid="page/2917",
3955 )
3956 if prev_level == 3: # Two etymology (Glyph Origin + Etymology)
3957 # sections cheek-to-cheek
3958 skip_level_title = True
3959 # Modify the title of previous ("Glyph Origin") section, in
3960 # case we have a meaningful title like "Etymology 1"
3961 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level)
3962 level = 3
3963 elif lc.startswith(PRONUNCIATION_TITLE):
3964 # Pronunciation is now a level between POS and Etymology, so
3965 # we need to shift everything down by one
3966 level = 4
3967 elif lc in POS_TITLES:
3968 level = 5
3969 elif lc == TRANSLATIONS_TITLE:
3970 level = 6
3971 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE:
3972 level = 6
3973 elif lc in INFLECTION_TITLES:
3974 level = 6
3975 elif lc == DESCENDANTS_TITLE:
3976 level = 6
3977 elif title in PROTO_ROOT_DERIVED_TITLES: 3977 ↛ 3978line 3977 didn't jump to line 3978 because the condition on line 3977 was never true
3978 level = 6
3979 elif lc in IGNORED_TITLES:
3980 level = 6
3981 else:
3982 level = 6
3983 if skip_level_title:
3984 skip_level_title = False
3985 parts.append(part)
3986 else:
3987 parts.append("{}{}{}".format("=" * level, title, "=" * level))
3988 parts.append(part)
3989 # print("=" * level, title)
3990 # if level != len(left):
3991 # print(" FIXED LEVEL OF {} {} -> {}"
3992 # .format(title, len(left), level))
3994 text = "".join(parts)
3995 # print(text)
3996 return text
3999def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
4000 # Skip translation pages
4001 if word.endswith("/" + TRANSLATIONS_TITLE): 4001 ↛ 4002line 4001 didn't jump to line 4002 because the condition on line 4001 was never true
4002 return []
4004 if wxr.config.verbose: 4004 ↛ 4005line 4004 didn't jump to line 4005 because the condition on line 4004 was never true
4005 logger.info(f"Parsing page: {word}")
4007 wxr.config.word = word
4008 wxr.wtp.start_page(word)
4010 # Remove <noinclude> and similar tags from main pages. They
4011 # should not appear there, but at least net/Elfdala has one and it
4012 # is probably not the only one.
4013 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text)
4014 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text)
4015 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text)
4017 # Fix up the subtitle hierarchy. There are hundreds if not thousands of
4018 # pages that have, for example, Translations section under Linkage, or
4019 # Translations section on the same level as Noun. Enforce a proper
4020 # hierarchy by manipulating the subtitle levels in certain cases.
4021 text = fix_subtitle_hierarchy(wxr, text)
4023 # Parse the page, pre-expanding those templates that are likely to
4024 # influence parsing
4025 tree = wxr.wtp.parse(
4026 text,
4027 pre_expand=True,
4028 additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
4029 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
4030 )
4031 # from wikitextprocessor.parser import print_tree
4032 # print("PAGE PARSE:", print_tree(tree))
4034 top_data: WordData = {}
4036 # Iterate over top-level titles, which should be languages for normal
4037 # pages
4038 by_lang = defaultdict(list)
4039 for langnode in tree.children:
4040 if not isinstance(langnode, WikiNode):
4041 continue
4042 if langnode.kind == NodeKind.TEMPLATE:
4043 parse_top_template(wxr, langnode, top_data)
4044 continue
4045 if langnode.kind == NodeKind.LINK:
4046 # Some pages have links at top level, e.g., "trees" in Wiktionary
4047 continue
4048 if langnode.kind != NodeKind.LEVEL2: 4048 ↛ 4049line 4048 didn't jump to line 4049 because the condition on line 4048 was never true
4049 wxr.wtp.debug(
4050 f"unexpected top-level node: {langnode}", sortid="page/3014"
4051 )
4052 continue
4053 lang = clean_node(
4054 wxr, None, langnode.sarg if langnode.sarg else langnode.largs
4055 )
4056 lang_code = name_to_code(lang, "en")
4057 if lang_code == "": 4057 ↛ 4058line 4057 didn't jump to line 4058 because the condition on line 4057 was never true
4058 wxr.wtp.debug(
4059 f"unrecognized language name: {lang}", sortid="page/3019"
4060 )
4061 if (
4062 wxr.config.capture_language_codes
4063 and lang_code not in wxr.config.capture_language_codes
4064 ):
4065 continue
4066 wxr.wtp.start_section(lang)
4068 # Collect all words from the page.
4069 # print(f"{langnode=}")
4070 datas = parse_language(wxr, langnode, lang, lang_code)
4072 # Propagate fields resulting from top-level templates to this
4073 # part-of-speech.
4074 for data in datas:
4075 if "lang" not in data: 4075 ↛ 4076line 4075 didn't jump to line 4076 because the condition on line 4075 was never true
4076 wxr.wtp.debug(
4077 "internal error -- no lang in data: {}".format(data),
4078 sortid="page/3034",
4079 )
4080 continue
4081 for k, v in top_data.items():
4082 assert isinstance(v, (list, tuple))
4083 data_extend(data, k, v)
4084 by_lang[data["lang"]].append(data)
4086 # XXX this code is clearly out of date. There is no longer a "conjugation"
4087 # field. FIX OR REMOVE.
4088 # Do some post-processing on the words. For example, we may distribute
4089 # conjugation information to all the words.
4090 ret = []
4091 for lang, lang_datas in by_lang.items():
4092 ret.extend(lang_datas)
4094 for x in ret:
4095 if x["word"] != word:
4096 if word.startswith("Unsupported titles/"):
4097 wxr.wtp.debug(
4098 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'",
4099 sortid="20231101/3578page.py",
4100 )
4101 else:
4102 wxr.wtp.debug(
4103 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'",
4104 sortid="20231101/3582page.py",
4105 )
4106 x["original_title"] = word
4107 # validate tag data
4108 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type]
4109 return ret
4112def recursively_separate_raw_tags(
4113 wxr: WiktextractContext, data: dict[str, Any]
4114) -> None:
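    # Split each "tags" list into tags recognized in valid_tags and everything
    # else, which is moved to "raw_tags", recursing into any list-of-dict
    # fields. For example (hypothetical input),
    # {"tags": ["plural", "dialectal-label"]} becomes
    # {"tags": ["plural"], "raw_tags": ["dialectal-label"]} when
    # "dialectal-label" is not in valid_tags.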
4115 if not isinstance(data, dict): 4115 ↛ 4116line 4115 didn't jump to line 4116 because the condition on line 4115 was never true
4116 wxr.wtp.error(
4117 "'data' is not dict; most probably "
4118 "data has a list that contains at least one dict and "
4119 "at least one non-dict item",
4120 sortid="en/page-4016/20240419",
4121 )
4122 return
4123 new_tags: list[str] = []
4124 raw_tags: list[str] = data.get("raw_tags", [])
4125 for field, val in data.items():
4126 if field == "tags":
4127 for tag in val:
4128 if tag not in valid_tags:
4129 raw_tags.append(tag)
4130 else:
4131 new_tags.append(tag)
4132 if isinstance(val, list):
4133 if len(val) > 0 and isinstance(val[0], dict):
4134 for d in val:
4135 recursively_separate_raw_tags(wxr, d)
4136 if "tags" in data and not new_tags:
4137 del data["tags"]
4138 elif new_tags:
4139 data["tags"] = new_tags
4140 if raw_tags:
4141 data["raw_tags"] = raw_tags
4144def process_soft_redirect_template(
4145 wxr: WiktextractContext,
4146 template_node: TemplateNode,
4147 redirect_pages: list[str],
4148) -> bool:
4149    # Return `True` if the template is a soft redirect template.
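    # E.g. a hypothetical {{zh-see|話}} appends "話" to redirect_pages and
    # returns True; {{ja-see|大人|おとな}} collects every positional parameter
    # the same way.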
4150 if template_node.template_name == "zh-see":
4151 # https://en.wiktionary.org/wiki/Template:zh-see
4152 title = clean_node(
4153 wxr, None, template_node.template_parameters.get(1, "")
4154 )
4155 if title != "": 4155 ↛ 4157line 4155 didn't jump to line 4157 because the condition on line 4155 was always true
4156 redirect_pages.append(title)
4157 return True
4158 elif template_node.template_name in ["ja-see", "ja-see-kango"]:
4159 # https://en.wiktionary.org/wiki/Template:ja-see
4160 for key, value in template_node.template_parameters.items():
4161 if isinstance(key, int): 4161 ↛ 4160line 4161 didn't jump to line 4160 because the condition on line 4161 was always true
4162 title = clean_node(wxr, None, value)
4163 if title != "": 4163 ↛ 4160line 4163 didn't jump to line 4160 because the condition on line 4163 was always true
4164 redirect_pages.append(title)
4165 return True
4166 return False
4169ZH_FORMS_TAGS = {
4170 "trad.": "Traditional-Chinese",
4171 "simp.": "Simplified-Chinese",
4172 "alternative forms": "alternative",
4173 "2nd round simp.": "Second-Round-Simplified-Chinese",
4174}
4177def extract_zh_forms_template(
4178 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4179):
4180 # https://en.wiktionary.org/wiki/Template:zh-forms
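    # The expanded template is a small table whose header cells carry labels
    # like "trad." or "simp." and whose data cells hold the written forms; a
    # hypothetical row labelled "trad." with the form 漢語 would yield
    # {"form": "漢語", "tags": ["Traditional-Chinese"]} via ZH_FORMS_TAGS.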
4181 lit_meaning = clean_node(
4182 wxr, None, t_node.template_parameters.get("lit", "")
4183 )
4184 if lit_meaning != "":
4185 base_data["literal_meaning"] = lit_meaning
4186 expanded_node = wxr.wtp.parse(
4187 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4188 )
4189 for table in expanded_node.find_child(NodeKind.TABLE):
4190 for row in table.find_child(NodeKind.TABLE_ROW):
4191 row_header = ""
4192 row_header_tags: list[str] = []
4193 header_has_span = False
4194 for cell in row.find_child(
4195 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
4196 ):
4197 if cell.kind == NodeKind.TABLE_HEADER_CELL:
4198 row_header, row_header_tags, header_has_span = (
4199 extract_zh_forms_header_cell(wxr, base_data, cell)
4200 )
4201 elif not header_has_span:
4202 extract_zh_forms_data_cell(
4203 wxr, base_data, cell, row_header, row_header_tags
4204 )
4206 if "forms" in base_data and len(base_data["forms"]) == 0: 4206 ↛ 4207line 4206 didn't jump to line 4207 because the condition on line 4206 was never true
4207 del base_data["forms"]
4210def extract_zh_forms_header_cell(
4211 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
4212) -> tuple[str, list[str], bool]:
4213 row_header = ""
4214 row_header_tags = []
4215 header_has_span = False
4216 first_span_index = len(header_cell.children)
4217 for index, span_tag in header_cell.find_html("span", with_index=True):
4218 if index < first_span_index: 4218 ↛ 4220line 4218 didn't jump to line 4220 because the condition on line 4218 was always true
4219 first_span_index = index
4220 header_has_span = True
4221 row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
4222 for raw_tag in row_header.split(" and "):
4223 raw_tag = raw_tag.strip()
4224 if raw_tag != "":
4225 row_header_tags.append(raw_tag)
4226 for span_tag in header_cell.find_html_recursively("span"):
4227 span_lang = span_tag.attrs.get("lang", "")
4228 form_nodes = []
4229 sup_title = ""
4230 for node in span_tag.children:
4231 if isinstance(node, HTMLNode) and node.tag == "sup": 4231 ↛ 4232line 4231 didn't jump to line 4232 because the condition on line 4231 was never true
4232 for sup_span in node.find_html("span"):
4233 sup_title = sup_span.attrs.get("title", "")
4234 else:
4235 form_nodes.append(node)
4236 if span_lang in ["zh-Hant", "zh-Hans"]:
4237 for word in clean_node(wxr, None, form_nodes).split("/"):
4238 if word not in [wxr.wtp.title, ""]:
4239 form = {"form": word}
4240 for raw_tag in row_header_tags:
4241 if raw_tag in ZH_FORMS_TAGS: 4241 ↛ 4244line 4241 didn't jump to line 4244 because the condition on line 4241 was always true
4242 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4243 else:
4244 data_append(form, "raw_tags", raw_tag)
4245 if sup_title != "": 4245 ↛ 4246line 4245 didn't jump to line 4246 because the condition on line 4245 was never true
4246 data_append(form, "raw_tags", sup_title)
4247 data_append(base_data, "forms", form)
4248 return row_header, row_header_tags, header_has_span
4251TagLiteral = Literal["tags", "raw_tags"]
4252TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")
4255def extract_zh_forms_data_cell(
4256 wxr: WiktextractContext,
4257 base_data: WordData,
4258 cell: WikiNode,
4259 row_header: str,
4260 row_header_tags: list[str],
4261) -> None:
4262 from .zh_pron_tags import ZH_PRON_TAGS
4264 forms: list[FormData] = []
4265 for top_span_tag in cell.find_html("span"):
4266 span_style = top_span_tag.attrs.get("style", "")
4267 span_lang = top_span_tag.attrs.get("lang", "")
4268 if span_style == "white-space:nowrap;":
4269 extract_zh_forms_data_cell(
4270 wxr, base_data, top_span_tag, row_header, row_header_tags
4271 )
4272 elif "font-size:80%" in span_style:
4273 raw_tag = clean_node(wxr, None, top_span_tag)
4274 if raw_tag != "": 4274 ↛ 4265line 4274 didn't jump to line 4265 because the condition on line 4274 was always true
4275 for form in forms:
4276 if raw_tag in ZH_PRON_TAGS: 4276 ↛ 4282line 4276 didn't jump to line 4282 because the condition on line 4276 was always true
4277 tr_tag = ZH_PRON_TAGS[raw_tag]
4278 if isinstance(tr_tag, list): 4278 ↛ 4279line 4278 didn't jump to line 4279 because the condition on line 4278 was never true
4279 data_extend(form, "tags", tr_tag)
4280 elif isinstance(tr_tag, str): 4280 ↛ 4275line 4280 didn't jump to line 4275 because the condition on line 4280 was always true
4281 data_append(form, "tags", tr_tag)
4282 elif raw_tag in valid_tags:
4283 data_append(form, "tags", raw_tag)
4284 else:
4285 data_append(form, "raw_tags", raw_tag)
4286 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]: 4286 ↛ 4265line 4286 didn't jump to line 4265 because the condition on line 4286 was always true
4287 word = clean_node(wxr, None, top_span_tag)
4288 if word not in ["", "/", wxr.wtp.title]:
4289 form = {"form": word}
4290 if row_header != "anagram": 4290 ↛ 4296line 4290 didn't jump to line 4296 because the condition on line 4290 was always true
4291 for raw_tag in row_header_tags:
4292 if raw_tag in ZH_FORMS_TAGS: 4292 ↛ 4295line 4292 didn't jump to line 4295 because the condition on line 4292 was always true
4293 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
4294 else:
4295 data_append(form, "raw_tags", raw_tag)
4296 if span_lang == "zh-Hant":
4297 data_append(form, "tags", "Traditional-Chinese")
4298 elif span_lang == "zh-Hans":
4299 data_append(form, "tags", "Simplified-Chinese")
4300 forms.append(form)
4302 if row_header == "anagram": 4302 ↛ 4303line 4302 didn't jump to line 4303 because the condition on line 4302 was never true
4303 for form in forms:
4304 l_data: LinkageData = {"word": form["form"]}
4305 for key in TAG_LITERALS_TUPLE:
4306 if key in form:
4307 l_data[key] = form[key]
4308 data_append(base_data, "anagrams", l_data)
4309 else:
4310 data_extend(base_data, "forms", forms)
4313def extract_ja_kanjitab_template(
4314 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
4315):
4316 # https://en.wiktionary.org/wiki/Template:ja-kanjitab
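    # The kanjitab table may include an "Alternative spelling" section; a
    # hypothetical cell like <span>空気</span> <small>(kyūjitai)</small> would
    # produce {"form": "空気", "tags": ["alternative", "kanji"]}, with the
    # parenthesized label appended to "tags" or "raw_tags" depending on
    # whether it is in valid_tags.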
4317 expanded_node = wxr.wtp.parse(
4318 wxr.wtp.node_to_wikitext(t_node), expand_all=True
4319 )
4320 for table in expanded_node.find_child(NodeKind.TABLE):
4321 is_alt_form_table = False
4322 for row in table.find_child(NodeKind.TABLE_ROW):
4323 for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
4324 header_text = clean_node(wxr, None, header_node)
4325 if header_text.startswith("Alternative spelling"):
4326 is_alt_form_table = True
4327 if not is_alt_form_table:
4328 continue
4329 forms = []
4330 for row in table.find_child(NodeKind.TABLE_ROW):
4331 for cell_node in row.find_child(NodeKind.TABLE_CELL):
4332 for child_node in cell_node.children:
4333 if isinstance(child_node, HTMLNode):
4334 if child_node.tag == "span":
4335 word = clean_node(wxr, None, child_node)
4336 if word != "": 4336 ↛ 4332line 4336 didn't jump to line 4332 because the condition on line 4336 was always true
4337 forms.append(
4338 {
4339 "form": word,
4340 "tags": ["alternative", "kanji"],
4341 }
4342 )
4343 elif child_node.tag == "small":
4344 raw_tag = clean_node(wxr, None, child_node).strip(
4345 "()"
4346 )
4347 if raw_tag != "" and len(forms) > 0: 4347 ↛ 4332line 4347 didn't jump to line 4332 because the condition on line 4347 was always true
4348 data_append(
4349 forms[-1],
4350 "tags"
4351 if raw_tag in valid_tags
4352 else "raw_tags",
4353 raw_tag,
4354 )
4355 data_extend(base_data, "forms", forms)
4356 for link_node in expanded_node.find_child(NodeKind.LINK):
4357 clean_node(wxr, base_data, link_node)