Coverage for src/wiktextract/extractor/en/page.py: 45%

1940 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8import sys 

9from collections import defaultdict 

10from functools import partial 

11from typing import ( 

12 TYPE_CHECKING, 

13 Any, 

14 Iterable, 

15 Iterator, 

16 Optional, 

17 Set, 

18 Union, 

19 cast, 

20) 

21 

22from mediawiki_langcodes import get_all_names, name_to_code 

23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

24from wikitextprocessor.parser import ( 

25 LEVEL_KIND_FLAGS, 

26 GeneralNode, 

27 NodeKind, 

28 TemplateNode, 

29 WikiNode, 

30) 

31 

32from ...clean import clean_template_args, clean_value 

33from ...datautils import ( 

34 data_append, 

35 data_extend, 

36 ns_title_prefix_tuple, 

37) 

38from ...page import ( 

39 LEVEL_KINDS, 

40 clean_node, 

41 is_panel_template, 

42 recursively_extract, 

43) 

44from ...tags import valid_tags 

45from ...wxr_context import WiktextractContext 

46from ...wxr_logging import logger 

47from ..ruby import extract_ruby, parse_ruby 

48from ..share import strip_nodes 

49from .example import extract_example_list_item, extract_template_zh_x 

50from .form_descriptions import ( 

51 classify_desc, 

52 decode_tags, 

53 distw, 

54 parse_alt_or_inflection_of, 

55 parse_sense_qualifier, 

56 parse_word_head, 

57) 

58from .inflection import TableContext, parse_inflection_section 

59from .info_templates import ( 

60 INFO_TEMPLATE_FUNCS, 

61 parse_info_template_arguments, 

62 parse_info_template_node, 

63) 

64from .linkages import parse_linkage_item_text 

65from .parts_of_speech import PARTS_OF_SPEECH 

66from .section_titles import ( 

67 COMPOUNDS_TITLE, 

68 DESCENDANTS_TITLE, 

69 ETYMOLOGY_TITLES, 

70 IGNORED_TITLES, 

71 INFLECTION_TITLES, 

72 LINKAGE_TITLES, 

73 POS_TITLES, 

74 PRONUNCIATION_TITLE, 

75 PROTO_ROOT_DERIVED_TITLES, 

76 TRANSLATIONS_TITLE, 

77) 

78from .translations import parse_translation_item_text 

79from .type_utils import ( 

80 DescendantData, 

81 ExampleData, 

82 FormData, 

83 LinkageData, 

84 SenseData, 

85 SoundData, 

86 TemplateData, 

87 WordData, 

88) 

89from .unsupported_titles import unsupported_title_map 

90 

91# When determining whether a string is 'english', classify_desc 

92# might return 'taxonomic' which is English text 99% of the time. 

93ENGLISH_TEXTS = ("english", "taxonomic") 

94 

95# Matches head tag 

96HEAD_TAG_RE = re.compile( 

97 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

98 r"hangul-symbol|syllable-hangul)$|" 

99 + r"^(latin|" 

100 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

101 + r")-(" 

102 + "|".join( 

103 [ 

104 "abbr", 

105 "adj", 

106 "adjective", 

107 "adjective form", 

108 "adjective-form", 

109 "adv", 

110 "adverb", 

111 "affix", 

112 "animal command", 

113 "art", 

114 "article", 

115 "aux", 

116 "bound pronoun", 

117 "bound-pronoun", 

118 "Buyla", 

119 "card num", 

120 "card-num", 

121 "cardinal", 

122 "chunom", 

123 "classifier", 

124 "clitic", 

125 "cls", 

126 "cmene", 

127 "cmavo", 

128 "colloq-verb", 

129 "colverbform", 

130 "combining form", 

131 "combining-form", 

132 "comparative", 

133 "con", 

134 "concord", 

135 "conj", 

136 "conjunction", 

137 "conjug", 

138 "cont", 

139 "contr", 

140 "converb", 

141 "daybox", 

142 "decl", 

143 "decl noun", 

144 "def", 

145 "dem", 

146 "det", 

147 "determ", 

148 "Deva", 

149 "ending", 

150 "entry", 

151 "form", 

152 "fuhivla", 

153 "gerund", 

154 "gismu", 

155 "hanja", 

156 "hantu", 

157 "hanzi", 

158 "head", 

159 "ideophone", 

160 "idiom", 

161 "inf", 

162 "indef", 

163 "infixed pronoun", 

164 "infixed-pronoun", 

165 "infl", 

166 "inflection", 

167 "initialism", 

168 "int", 

169 "interfix", 

170 "interj", 

171 "interjection", 

172 "jyut", 

173 "latin", 

174 "letter", 

175 "locative", 

176 "lujvo", 

177 "monthbox", 

178 "mutverb", 

179 "name", 

180 "nisba", 

181 "nom", 

182 "noun", 

183 "noun form", 

184 "noun-form", 

185 "noun plural", 

186 "noun-plural", 

187 "nounprefix", 

188 "num", 

189 "number", 

190 "numeral", 

191 "ord", 

192 "ordinal", 

193 "par", 

194 "part", 

195 "part form", 

196 "part-form", 

197 "participle", 

198 "particle", 

199 "past", 

200 "past neg", 

201 "past-neg", 

202 "past participle", 

203 "past-participle", 

204 "perfect participle", 

205 "perfect-participle", 

206 "personal pronoun", 

207 "personal-pronoun", 

208 "pref", 

209 "prefix", 

210 "phrase", 

211 "pinyin", 

212 "plural noun", 

213 "plural-noun", 

214 "pos", 

215 "poss-noun", 

216 "post", 

217 "postp", 

218 "postposition", 

219 "PP", 

220 "pp", 

221 "ppron", 

222 "pred", 

223 "predicative", 

224 "prep", 

225 "prep phrase", 

226 "prep-phrase", 

227 "preposition", 

228 "present participle", 

229 "present-participle", 

230 "pron", 

231 "prondem", 

232 "pronindef", 

233 "pronoun", 

234 "prop", 

235 "proper noun", 

236 "proper-noun", 

237 "proper noun form", 

238 "proper-noun form", 

239 "proper noun-form", 

240 "proper-noun-form", 

241 "prov", 

242 "proverb", 

243 "prpn", 

244 "prpr", 

245 "punctuation mark", 

246 "punctuation-mark", 

247 "regnoun", 

248 "rel", 

249 "rom", 

250 "romanji", 

251 "root", 

252 "sign", 

253 "suff", 

254 "suffix", 

255 "syllable", 

256 "symbol", 

257 "verb", 

258 "verb form", 

259 "verb-form", 

260 "verbal noun", 

261 "verbal-noun", 

262 "verbnec", 

263 "vform", 

264 ] 

265 ) 

266 + r")(-|/|\+|$)" 

267) 

268 
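
# Illustrative sketch (not in the numbered source): HEAD_TAG_RE is checked

# with .fullmatch() against template names further below, so a name must be

# one of the fixed alternatives or a "<language code>-<listed suffix>"

# combination, optionally followed by a single "-", "/" or "+". Assuming

# get_all_names("en") yields (code, name) pairs whose first element is a

# language code such as "en":

#   HEAD_TAG_RE.fullmatch("head")       # matches a fixed alternative

#   HEAD_TAG_RE.fullmatch("en-noun")    # matches "<code>-<suffix>"

#   HEAD_TAG_RE.fullmatch("en-nounish") # None: the suffix must end the name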

269# Head-templates causing problems (like newlines) that can be squashed into 

270# an empty string in the template handler while saving their template 

271# data for later. 

272WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

273 

274FLOATING_TABLE_TEMPLATES: set[str] = { 

275 # az-suffix-form creates a style=floatright div that is otherwise 

276 # deleted; if it is not pre-expanded, we can intercept the template 

277 # so we add this set into do_not_pre_expand, and intercept the 

278 # templates in parse_part_of_speech 

279 "az-suffix-forms", 

280 "az-inf-p", 

281 "kk-suffix-forms", 

282 "ky-suffix-forms", 

283 "tr-inf-p", 

284 "tr-suffix-forms", 

285 "tt-suffix-forms", 

286 "uz-suffix-forms", 

287} 

288# These two should contain template names that should always be 

289# pre-expanded when *first* processing the tree, or not pre-expanded 

290# so that the templates are left in place with their identifying 

291# name intact for later filtering. 

292 

293DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

294DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

295 

296# Additional templates to be expanded in the pre-expand phase 

297ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

298 "multitrans", 

299 "multitrans-nowiki", 

300 "trans-top", 

301 "trans-top-also", 

302 "trans-bottom", 

303 "checktrans-top", 

304 "checktrans-bottom", 

305 "col1", 

306 "col2", 

307 "col3", 

308 "col4", 

309 "col5", 

310 "col1-u", 

311 "col2-u", 

312 "col3-u", 

313 "col4-u", 

314 "col5-u", 

315 "check deprecated lang param usage", 

316 "deprecated code", 

317 "ru-verb-alt-ё", 

318 "ru-noun-alt-ё", 

319 "ru-adj-alt-ё", 

320 "ru-proper noun-alt-ё", 

321 "ru-pos-alt-ё", 

322 "ru-alt-ё", 

323 "inflection of", 

324 "no deprecated lang param usage", 

325} 

326 

327# Inverse linkage for those that have them 

328linkage_inverses: dict[str, str] = { 

329 # XXX this is not currently used, move to post-processing 

330 "synonyms": "synonyms", 

331 "hypernyms": "hyponyms", 

332 "hyponyms": "hypernyms", 

333 "holonyms": "meronyms", 

334 "meronyms": "holonyms", 

335 "derived": "derived_from", 

336 "coordinate_terms": "coordinate_terms", 

337 "troponyms": "hypernyms", 

338 "antonyms": "antonyms", 

339 "instances": "instance_of", 

340 "related": "related", 

341} 

342 

343# Templates that are used to form panels on pages and that 

344# should be ignored in various positions 

345PANEL_TEMPLATES: set[str] = { 

346 "Character info", 

347 "CJKV", 

348 "French personal pronouns", 

349 "French possessive adjectives", 

350 "French possessive pronouns", 

351 "Han etym", 

352 "Japanese demonstratives", 

353 "Latn-script", 

354 "LDL", 

355 "MW1913Abbr", 

356 "Number-encoding", 

357 "Nuttall", 

358 "Spanish possessive adjectives", 

359 "Spanish possessive pronouns", 

360 "USRegionDisputed", 

361 "Webster 1913", 

362 "ase-rfr", 

363 "attention", 

364 "attn", 

365 "beer", 

366 "broken ref", 

367 "ca-compass", 

368 "character info", 

369 "character info/var", 

370 "checksense", 

371 "compass-fi", 

372 "copyvio suspected", 

373 "delete", 

374 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

375 "etystub", 

376 "examples", 

377 "hu-corr", 

378 "hu-suff-pron", 

379 "interwiktionary", 

380 "ja-kanjitab", 

381 "ko-hanja-search", 

382 "look", 

383 "maintenance box", 

384 "maintenance line", 

385 "mediagenic terms", 

386 "merge", 

387 "missing template", 

388 "morse links", 

389 "move", 

390 "multiple images", 

391 "no inline", 

392 "picdic", 

393 "picdicimg", 

394 "picdiclabel", 

395 "polyominoes", 

396 "predidential nomics", 

397 "punctuation", # This actually gets pre-expanded 

398 "reconstructed", 

399 "request box", 

400 "rf-sound example", 

401 "rfaccents", 

402 "rfap", 

403 "rfaspect", 

404 "rfc", 

405 "rfc-auto", 

406 "rfc-header", 

407 "rfc-level", 

408 "rfc-pron-n", 

409 "rfc-sense", 

410 "rfclarify", 

411 "rfd", 

412 "rfd-redundant", 

413 "rfd-sense", 

414 "rfdate", 

415 "rfdatek", 

416 "rfdef", 

417 "rfe", 

418 "rfe/dowork", 

419 "rfex", 

420 "rfexp", 

421 "rfform", 

422 "rfgender", 

423 "rfi", 

424 "rfinfl", 

425 "rfm", 

426 "rfm-sense", 

427 "rfp", 

428 "rfp-old", 

429 "rfquote", 

430 "rfquote-sense", 

431 "rfquotek", 

432 "rfref", 

433 "rfscript", 

434 "rft2", 

435 "rftaxon", 

436 "rftone", 

437 "rftranslit", 

438 "rfv", 

439 "rfv-etym", 

440 "rfv-pron", 

441 "rfv-quote", 

442 "rfv-sense", 

443 "selfref", 

444 "split", 

445 "stroke order", # XXX consider capturing this? 

446 "stub entry", 

447 "t-needed", 

448 "tbot entry", 

449 "tea room", 

450 "tea room sense", 

451 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

452 "unblock", 

453 "unsupportedpage", 

454 "video frames", 

455 "was wotd", 

456 "wrongtitle", 

457 "zh-forms", 

458 "zh-hanzi-box", 

459 "no entry", 

460} 

461 

462# lookup table for the tags of Chinese dialectal synonyms 

463zh_tag_lookup: dict[str, list[str]] = { 

464 "Formal": ["formal"], 

465 "Written-Standard-Chinese": ["Standard-Chinese"], 

466 "historical or Internet slang": ["historical", "internet-slang"], 

467 "now usually derogatory or offensive": ["offensive", "derogatory"], 

468 "lofty": [], 

469} 

470 

471# Template name prefixes used for language-specific panel templates (i.e., 

472# templates that create side boxes or notice boxes or that should generally 

473# be ignored). 

474PANEL_PREFIXES: set[str] = { 

475 "list:compass points/", 

476 "list:Gregorian calendar months/", 

477 "RQ:", 

478} 

479 

480# Templates used for wikipedia links. 

481wikipedia_templates: set[str] = { 

482 "wikipedia", 

483 "slim-wikipedia", 

484 "w", 

485 "W", 

486 "swp", 

487 "wiki", 

488 "Wikipedia", 

489 "wtorw", 

490} 

491for x in PANEL_PREFIXES & wikipedia_templates:  [491 ↛ 492: loop never started]

492 print( 

493 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

494 x 

495 ) 

496 ) 

497 

498# Mapping from a template name (without language prefix) for the main word 

499# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

500# it could validly occur. This is used as just a sanity check to give 

501# warnings about probably incorrect coding in Wiktionary. 

502template_allowed_pos_map: dict[str, list[str]] = { 

503 "abbr": ["abbrev"], 

504 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

505 "plural noun": ["noun", "name"], 

506 "plural-noun": ["noun", "name"], 

507 "proper noun": ["noun", "name"], 

508 "proper-noun": ["name", "noun"], 

509 "prop": ["name", "noun"], 

510 "verb": ["verb", "phrase"], 

511 "gerund": ["verb"], 

512 "particle": ["adv", "particle"], 

513 "adj": ["adj", "adj_noun"], 

514 "pron": ["pron", "noun"], 

515 "name": ["name", "noun"], 

516 "adv": ["adv", "intj", "conj", "particle"], 

517 "phrase": ["phrase", "prep_phrase"], 

518 "noun phrase": ["phrase"], 

519 "ordinal": ["num"], 

520 "number": ["num"], 

521 "pos": ["affix", "name", "num"], 

522 "suffix": ["suffix", "affix"], 

523 "character": ["character"], 

524 "letter": ["character"], 

525 "kanji": ["character"], 

526 "cont": ["abbrev"], 

527 "interj": ["intj"], 

528 "con": ["conj"], 

529 "part": ["particle"], 

530 "prep": ["prep", "postp"], 

531 "postp": ["postp"], 

532 "misspelling": ["noun", "adj", "verb", "adv"], 

533 "part-form": ["verb"], 

534} 

535for k, v in template_allowed_pos_map.items(): 

536 for x in v: 

537 if x not in PARTS_OF_SPEECH:  [537 ↛ 538: condition never true]

538 print( 

539 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

540 "".format(x, k, v) 

541 ) 

542 assert False 

543 

544 

545# Templates ignored during etymology extraction, i.e., these will not be listed 

546# in the extracted etymology templates. 

547ignored_etymology_templates: list[str] = [ 

548 "...", 

549 "IPAchar", 

550 "ipachar", 

551 "ISBN", 

552 "isValidPageName", 

553 "redlink category", 

554 "deprecated code", 

555 "check deprecated lang param usage", 

556 "para", 

557 "p", 

558 "cite", 

559 "Cite news", 

560 "Cite newsgroup", 

561 "cite paper", 

562 "cite MLLM 1976", 

563 "cite journal", 

564 "cite news/documentation", 

565 "cite paper/documentation", 

566 "cite video game", 

567 "cite video game/documentation", 

568 "cite newsgroup", 

569 "cite newsgroup/documentation", 

570 "cite web/documentation", 

571 "cite news", 

572 "Cite book", 

573 "Cite-book", 

574 "cite book", 

575 "cite web", 

576 "cite-usenet", 

577 "cite-video/documentation", 

578 "Cite-journal", 

579 "rfe", 

580 "catlangname", 

581 "cln", 

582 "langname-lite", 

583 "no deprecated lang param usage", 

584 "mention", 

585 "m", 

586 "m-self", 

587 "link", 

588 "l", 

589 "ll", 

590 "l-self", 

591] 

592# Regexp for matching ignored etymology template names. This adds certain 

593# prefixes to the names listed above. 

594ignored_etymology_templates_re = re.compile( 

595 r"^((cite-|R:|RQ:).*|" 

596 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

597 + r")$" 

598) 

599 
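
# Illustrative sketch (not in the numbered source): the regexp above accepts

# the exact names listed plus anything starting with "cite-", "R:" or "RQ:":

#   ignored_etymology_templates_re.match("cite book")       # listed name

#   ignored_etymology_templates_re.match("R:Webster 1913")  # "R:" prefix

#   ignored_etymology_templates_re.match("der")              # None, not ignored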

600# Regexp for matching ignored descendants template names. Right now we just 

601# copy the ignored etymology templates 

602ignored_descendants_templates_re = ignored_etymology_templates_re 

603 

604# Set of template names that are used to define usage examples. If the usage 

605# example contains one of these templates, then its type is set to 

606# "example" 

607usex_templates: set[str] = { 

608 "afex", 

609 "affixusex", 

610 "co", # {{collocation}} acts like a example template, specifically for 

611 # pairs of combinations of words that are more common than you'd 

612 # except would be randomly; hlavní#Czech 

613 "coi", 

614 "collocation", 

615 "el-example", 

616 "el-x", 

617 "example", 

618 "examples", 

619 "he-usex", 

620 "he-x", 

621 "hi-usex", 

622 "hi-x", 

623 "ja-usex-inline", 

624 "ja-usex", 

625 "ja-x", 

626 "jbo-example", 

627 "jbo-x", 

628 "km-usex", 

629 "km-x", 

630 "ko-usex", 

631 "ko-x", 

632 "lo-usex", 

633 "lo-x", 

634 "ne-x", 

635 "ne-usex", 

636 "prefixusex", 

637 "ryu-usex", 

638 "ryu-x", 

639 "shn-usex", 

640 "shn-x", 

641 "suffixusex", 

642 "th-usex", 

643 "th-x", 

644 "ur-usex", 

645 "ur-x", 

646 "usex", 

647 "usex-suffix", 

648 "ux", 

649 "uxi", 

650} 

651 

652stop_head_at_these_templates: set[str] = { 

653 "category", 

654 "cat", 

655 "topics", 

656 "catlangname", 

657 "c", 

658 "C", 

659 "top", 

660 "cln", 

661} 

662 

663# Set of template names that are used to define quotation examples. If the 

664# usage example contains one of these templates, then its type is set to 

665# "quotation". 

666quotation_templates: set[str] = { 

667 "collapse-quote", 

668 "quote-av", 

669 "quote-book", 

670 "quote-GYLD", 

671 "quote-hansard", 

672 "quotei", 

673 "quote-journal", 

674 "quotelite", 

675 "quote-mailing list", 

676 "quote-meta", 

677 "quote-newsgroup", 

678 "quote-song", 

679 "quote-text", 

680 "quote", 

681 "quote-us-patent", 

682 "quote-video game", 

683 "quote-web", 

684 "quote-wikipedia", 

685 "wikiquote", 

686 "Wikiquote", 

687} 

688 

689taxonomy_templates = { 

690 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

691 "taxfmt", 

692 "taxlink", 

693 "taxlink2", 

694 "taxlinknew", 

695 "taxlook", 

696} 

697 

698# Template name component to linkage section listing. Integer section means 

699# default section, starting at that argument. 

700# XXX not used anymore, except for the first elements: moved to 

701# template_linkages 

702# template_linkage_mappings: list[list[Union[str, int]]] = [ 

703# ["syn", "synonyms"], 

704# ["synonyms", "synonyms"], 

705# ["ant", "antonyms"], 

706# ["antonyms", "antonyms"], 

707# ["hyp", "hyponyms"], 

708# ["hyponyms", "hyponyms"], 

709# ["der", "derived"], 

710# ["derived terms", "derived"], 

711# ["coordinate terms", "coordinate_terms"], 

712# ["rel", "related"], 

713# ["col", 2], 

714# ] 

715 

716# Template names; this set was extracted from template_linkage_mappings, 

717# because the code using template_linkage_mappings was actually not used 

718# (but not removed). 

719template_linkages: set[str] = { 

720 "syn", 

721 "synonyms", 

722 "ant", 

723 "antonyms", 

724 "hyp", 

725 "hyponyms", 

726 "der", 

727 "derived terms", 

728 "coordinate terms", 

729 "rel", 

730 "col", 

731} 

732 

733# Maps template name used in a word sense to a linkage field that it adds. 

734sense_linkage_templates: dict[str, str] = { 

735 "syn": "synonyms", 

736 "synonyms": "synonyms", 

737 "hyp": "hyponyms", 

738 "hyponyms": "hyponyms", 

739 "ant": "antonyms", 

740 "antonyms": "antonyms", 

741} 

742 

743 

744def decode_html_entities(v: Union[str, int]) -> str: 

745 """Decodes HTML entities from a value, converting them to the respective 

746 Unicode characters/strings.""" 

747 if isinstance(v, int): 

748 # I changed this to return str(v) instead of v = str(v), 

749 # but there might have been the intention to have more logic 

750 # here. html.unescape would not do anything special with an integer, 

751 # it needs html escape symbols (&xx;). 

752 return str(v) 

753 return html.unescape(v) 

754 
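
# Illustrative sketch (not in the numbered source):

#   decode_html_entities("&amp;quot;") == "&quot;"   # one level of unescaping

#   decode_html_entities("&lt;b&gt;") == "<b>"

#   decode_html_entities(42) == "42"                  # ints are simply stringified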

755 

756def parse_sense_linkage( 

757 wxr: WiktextractContext, 

758 data: SenseData, 

759 name: str, 

760 ht: TemplateArgs, 

761) -> None: 

762 """Parses a linkage (synonym, etc) specified in a word sense.""" 

763 assert isinstance(wxr, WiktextractContext) 

764 assert isinstance(data, dict) 

765 assert isinstance(name, str) 

766 assert isinstance(ht, dict) 

767 field = sense_linkage_templates[name] 

768 for i in range(2, 20): 

769 w = ht.get(i) or "" 

770 w = clean_node(wxr, data, w) 

771 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

772 if w.startswith(alias): 

773 w = w[len(alias) :] 

774 break 

775 if not w: 

776 break 

777 tags: list[str] = [] 

778 topics: list[str] = [] 

779 english: Optional[str] = None 

780 # Try to find qualifiers for this synonym 

781 q = ht.get("q{}".format(i - 1)) 

782 if q: 

783 cls = classify_desc(q) 

784 if cls == "tags": 

785 tagsets1, topics1 = decode_tags(q) 

786 for ts in tagsets1: 

787 tags.extend(ts) 

788 topics.extend(topics1) 

789 elif cls == "english": 

790 if english: 

791 english += "; " + q 

792 else: 

793 english = q 

794 # Try to find English translation for this synonym 

795 t = ht.get("t{}".format(i - 1)) 

796 if t: 

797 if english: 

798 english += "; " + t 

799 else: 

800 english = t 

801 

802 # See if the linkage contains a parenthesized alt 

803 alt = None 

804 m = re.search(r"\(([^)]+)\)$", w) 

805 if m: 

806 w = w[: m.start()].strip() 

807 alt = m.group(1) 

808 

809 dt = {"word": w} 

810 if tags: 

811 data_extend(dt, "tags", tags) 

812 if topics: 

813 data_extend(dt, "topics", topics) 

814 if english: 

815 dt["english"] = english 

816 if alt: 

817 dt["alt"] = alt 

818 data_append(data, field, dt) 

819 
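
# Sketch with hypothetical template arguments (not in the numbered source):

# for a sense-level {{syn|en|big (large)|q1=informal}} the loop above would

# append roughly {"word": "big", "alt": "large", "tags": ["informal"]} to

# data["synonyms"], provided that classify_desc("informal") reports "tags".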

820 

821EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

822example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

823captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 

824 

825 

826def synch_splits_with_args( 

827 line: str, targs: TemplateArgs 

828) -> Optional[list[str]]: 

829 """If it looks like there's something weird with how a line of example 

830 text has been split, this function will do the splitting after counting 

831 occurrences of the splitting regex inside the two main template arguments 

832 containing the string data for the original language example and the 

833 English translations. 

834 """ 

835 # Previously, we split without capturing groups, but here we want to 

836 # keep the original splitting hyphen regex intact. 

837 fparts = captured_splitters_re.split(line) 

838 new_parts = [] 

839 # ["First", " – ", "second", " – ", "third..."] from OL argument 

840 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 

841 new_parts.append("".join(fparts[:first])) 

842 # Translation argument 

843 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

844 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

845 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

846 new_parts.append("".join(fparts[first + 1 : second])) 

847 

848 if all(new_parts): # no empty strings from the above spaghetti 

849 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

850 return new_parts 

851 else: 

852 return None 

853 
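
# Worked example with hypothetical arguments (not in the numbered source):

# with line = "uno ― dos ― three" and targs = {2: "uno ― dos", 3: "three"},

# the original-language argument contains one splitter, so first = 3 and the

# function returns ["uno ― dos", "three"] rather than a naive three-way split.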

854 

855QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

856QUALIFIERS_RE = re.compile(QUALIFIERS) 

857# (...): ... or (...(...)...): ... 

858 
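
# Illustrative sketch (not in the numbered source): QUALIFIERS_RE strips a

# leading parenthesised qualifier and tolerates one level of nesting:

#   QUALIFIERS_RE.match("(transitive, slang): to run").group(1)

#       == "transitive, slang"

#   QUALIFIERS_RE.match("(Cockney (London) slang) word").group(1)

#       == "Cockney (London) slang"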

859 

860def parse_language( 

861 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

862) -> list[WordData]: 

863 """Iterates over the text of the page, returning words (parts-of-speech) 

864 defined on the page one at a time. (Individual word senses for the 

865 same part-of-speech are typically encoded in the same entry.)""" 

866 # imported here to avoid circular import 

867 from .pronunciation import parse_pronunciation 

868 

869 assert isinstance(wxr, WiktextractContext) 

870 assert isinstance(langnode, WikiNode) 

871 assert isinstance(language, str) 

872 assert isinstance(lang_code, str) 

873 # print("parse_language", language) 

874 

875 is_reconstruction = False 

876 word: str = wxr.wtp.title # type: ignore[assignment] 

877 unsupported_prefix = "Unsupported titles/" 

878 if word.startswith(unsupported_prefix): 

879 w = word[len(unsupported_prefix) :] 

880 if w in unsupported_title_map:  [880 ↛ 883: condition always true]

881 word = unsupported_title_map[w] 

882 else: 

883 wxr.wtp.error( 

884 "Unimplemented unsupported title: {}".format(word), 

885 sortid="page/870", 

886 ) 

887 word = w 

888 elif word.startswith("Reconstruction:"):  [888 ↛ 889: condition never true]

889 word = word[word.find("/") + 1 :] 

890 is_reconstruction = True 

891 

892 base_data: WordData = { 

893 "word": word, 

894 "lang": language, 

895 "lang_code": lang_code, 

896 } 

897 if is_reconstruction:  [897 ↛ 898: condition never true]

898 data_append(base_data, "tags", "reconstruction") 

899 sense_data: SenseData = {} 

900 pos_data: WordData = {} # For a current part-of-speech 

901 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

902 etym_data: WordData = {} # For one etymology 

903 pos_datas: list[SenseData] = [] 

904 level_four_datas: list[WordData] = [] 

905 etym_datas: list[WordData] = [] 

906 page_datas: list[WordData] = [] 

907 have_etym = False 

908 inside_level_four = False # This is for checking if the etymology section 

909 # or article has a Pronunciation section, for Chinese mostly; because 

910 # Chinese articles can have three level-three sections (two etymology 

911 # sections and a pronunciation section) one after another, we need a kludge 

912 # to better keep track of whether we're in a normal "etym" or inside a 

913 # "level four" (which is what we've turned those level-three Pronunciation 

914 # sections into in fix_subtitle_hierarchy()); all other sections are 

915 # demoted by a step. 

916 stack: list[str] = [] # names of items on the "stack" 

917 

918 def merge_base(data: WordData, base: WordData) -> None: 

919 for k, v in base.items(): 

920 # Copy the value to ensure that we don't share lists or 

921 # dicts between structures (even nested ones). 

922 v = copy.deepcopy(v) 

923 if k not in data: 

924 # The list was copied above, so this will not create shared ref 

925 data[k] = v # type: ignore[literal-required] 

926 continue 

927 if data[k] == v: # type: ignore[literal-required]  [927 ↛ 929: condition always true]

928 continue 

929 if ( 

930 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

931 or isinstance( 

932 v, 

933 (list, tuple), # Should this be "and"? 

934 ) 

935 ): 

936 data[k] = list(data[k]) + list(v) # type: ignore 

937 elif data[k] != v: # type: ignore[literal-required] 

938 wxr.wtp.warning( 

939 "conflicting values for {} in merge_base: " 

940 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

941 sortid="page/904", 

942 ) 

943 
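
# Illustration of merge_base() behavior (not in the numbered source): scalar

# values that agree are kept as-is, while list values are concatenated;

# merging base {"lang": "English", "tags": ["x"]} into data

# {"lang": "English", "tags": ["y"]} leaves data["tags"] == ["y", "x"].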

944 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

945 """Remove unnecessary keys from dict values 

946 in a list comprehension...""" 

947 if key in pron: 

948 pron.pop(key) # type: ignore 

949 return pron 

950 

951 # If the result has sounds, eliminate sounds that have a prefix that 

952 # does not match "word" or one of "forms" 

953 if "sounds" in data and "word" in data: 953 ↛ 954line 953 didn't jump to line 954 because the condition on line 953 was never true

954 accepted = [data["word"]] 

955 accepted.extend(f["form"] for f in data.get("forms", dict())) 

956 data["sounds"] = list( 

957 s 

958 for s in data["sounds"] 

959 if "form" not in s or s["form"] in accepted 

960 ) 

961 # If the result has sounds, eliminate sounds that have a pos that 

962 # does not match "pos" 

963 if "sounds" in data and "pos" in data: 963 ↛ 964line 963 didn't jump to line 964 because the condition on line 963 was never true

964 data["sounds"] = list( 

965 complementary_pop(s, "pos") 

966 for s in data["sounds"] 

967 # "pos" is not a field of SoundData, correctly, so we're 

968 # removing it here. It's a kludge on a kludge on a kludge. 

969 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

970 ) 

971 

972 def push_sense() -> bool: 

973 """Starts collecting data for a new word sense. This returns True 

974 if a sense was added.""" 

975 nonlocal sense_data 

976 tags = sense_data.get("tags", ()) 

977 if ( 

978 not sense_data.get("glosses") 

979 and "translation-hub" not in tags 

980 and "no-gloss" not in tags 

981 ): 

982 return False 

983 

984 if (  [984 ↛ 994: condition never true]

985 ( 

986 "participle" in sense_data.get("tags", ()) 

987 or "infinitive" in sense_data.get("tags", ()) 

988 ) 

989 and "alt_of" not in sense_data 

990 and "form_of" not in sense_data 

991 and "etymology_text" in etym_data 

992 and etym_data["etymology_text"] != "" 

993 ): 

994 etym = etym_data["etymology_text"] 

995 etym = etym.split(". ")[0] 

996 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

997 if ret is not None: 

998 tags, lst = ret 

999 assert isinstance(lst, (list, tuple)) 

1000 if "form-of" in tags: 

1001 data_extend(sense_data, "form_of", lst) 

1002 data_extend(sense_data, "tags", tags) 

1003 elif "alt-of" in tags: 

1004 data_extend(sense_data, "alt_of", lst) 

1005 data_extend(sense_data, "tags", tags) 

1006 

1007 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(  [1007 ↛ 1010: condition never true]

1008 "tags", () 

1009 ): 

1010 data_append(sense_data, "tags", "no-gloss") 

1011 

1012 pos_datas.append(sense_data) 

1013 sense_data = {} 

1014 return True 

1015 

1016 def push_pos() -> None: 

1017 """Starts collecting data for a new part-of-speech.""" 

1018 nonlocal pos_data 

1019 nonlocal pos_datas 

1020 push_sense() 

1021 if wxr.wtp.subsection: 

1022 data: WordData = {"senses": pos_datas} 

1023 merge_base(data, pos_data) 

1024 level_four_datas.append(data) 

1025 pos_data = {} 

1026 pos_datas = [] 

1027 wxr.wtp.start_subsection(None) 

1028 

1029 def push_level_four_section() -> None: 

1030 """Starts collecting data for a new level four sections, which 

1031 is usually virtual and empty, unless the article has Chinese 

1032 'Pronunciation' sections that are etymology-section-like but 

1033 under etymology, and at the same level in the source. We modify 

1034 the source to demote Pronunciation sections like that to level 

1035 4, and other sections one step lower.""" 

1036 nonlocal level_four_data 

1037 nonlocal level_four_datas 

1038 nonlocal etym_datas 

1039 push_pos() 

1040 # print(f"======\n{etym_data=}") 

1041 # print(f"======\n{etym_datas=}") 

1042 # print(f"======\n{level_four_data=}") 

1043 # print(f"======\n{level_four_datas=}") 

1044 for data in level_four_datas: 

1045 merge_base(data, level_four_data) 

1046 etym_datas.append(data) 

1047 for data in etym_datas: 

1048 merge_base(data, etym_data) 

1049 page_datas.append(data) 

1050 level_four_data = {} 

1051 level_four_datas = [] 

1052 etym_datas = [] 

1053 

1054 def push_etym() -> None: 

1055 """Starts collecting data for a new etymology.""" 

1056 nonlocal etym_data 

1057 nonlocal etym_datas 

1058 nonlocal have_etym 

1059 nonlocal inside_level_four 

1060 have_etym = True 

1061 push_level_four_section() 

1062 inside_level_four = False 

1063 etym_data = {} 

1064 
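
# Summary note (not in the numbered source): the push_* helpers flush data

# outward one level at a time: push_sense() moves sense_data into pos_datas,

# push_pos() wraps pos_datas into a WordData appended to level_four_datas,

# push_level_four_section() merges those into etym_datas and then into

# page_datas, and push_etym() additionally resets the per-etymology state

# for the next Etymology section.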

1065 def select_data() -> WordData: 

1066 """Selects where to store data (pos or etym) based on whether we 

1067 are inside a pos (part-of-speech).""" 

1068 # print(f"{wxr.wtp.subsection=}") 

1069 # print(f"{stack=}") 

1070 if wxr.wtp.subsection is not None:  [1070 ↛ 1072: condition always true]

1071 return pos_data 

1072 if stack[-1] == language: 

1073 return base_data 

1074 if inside_level_four is False: 

1075 return etym_data 

1076 return level_four_data 

1077 

1078 term_label_templates: list[TemplateData] = [] 

1079 

1080 def head_post_template_fn( 

1081 name: str, ht: TemplateArgs, expansion: str 

1082 ) -> Optional[str]: 

1083 """Handles special templates in the head section of a word. Head 

1084 section is the text after part-of-speech subtitle and before word 

1085 sense list. Typically it generates the bold line for the word, but 

1086 may also contain other useful information that often ends up in 

1087 side boxes. We want to capture some of that additional information.""" 

1088 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1089 if is_panel_template(wxr, name):  [1089 ↛ 1092: condition never true]

1090 # Completely ignore these templates (not even recorded in 

1091 # head_templates) 

1092 return "" 

1093 if name == "head": 

1094 # XXX are these also captured in forms? Should this special case 

1095 # be removed? 

1096 t = ht.get(2, "") 

1097 if t == "pinyin": 1097 ↛ 1098line 1097 didn't jump to line 1098 because the condition on line 1097 was never true

1098 data_append(pos_data, "tags", "Pinyin") 

1099 elif t == "romanization": 1099 ↛ 1100line 1099 didn't jump to line 1100 because the condition on line 1099 was never true

1100 data_append(pos_data, "tags", "romanization") 

1101 if (  [1101 ↛ 1122: condition always true]

1102 HEAD_TAG_RE.fullmatch(name) is not None 

1103 or name in WORD_LEVEL_HEAD_TEMPLATES 

1104 ): 

1105 args_ht = clean_template_args(wxr, ht) 

1106 cleaned_expansion = clean_node(wxr, None, expansion) 

1107 dt: TemplateData = { 

1108 "name": name, 

1109 "args": args_ht, 

1110 "expansion": cleaned_expansion, 

1111 } 

1112 data_append(pos_data, "head_templates", dt) 

1113 if name in WORD_LEVEL_HEAD_TEMPLATES: 

1114 term_label_templates.append(dt) 

1115 # Squash these, their tags are applied to the whole word, 

1116 # and some cause problems like "term-label" 

1117 return "" 

1118 

1119 # The following are both captured in head_templates and parsed 

1120 # separately 

1121 

1122 if name in wikipedia_templates:  [1122 ↛ 1125: condition never true]

1123 # Note: various places expect to have content from wikipedia 

1124 # templates, so cannot convert this to empty 

1125 parse_wikipedia_template(wxr, pos_data, ht) 

1126 return None 

1127 

1128 if name == "number box": 1128 ↛ 1130line 1128 didn't jump to line 1130 because the condition on line 1128 was never true

1129 # XXX extract numeric value? 

1130 return "" 

1131 if name == "enum": 1131 ↛ 1133line 1131 didn't jump to line 1133 because the condition on line 1131 was never true

1132 # XXX extract? 

1133 return "" 

1134 if name == "cardinalbox": 1134 ↛ 1137line 1134 didn't jump to line 1137 because the condition on line 1134 was never true

1135 # XXX extract similar to enum? 

1136 # XXX this can also occur in top-level under language 

1137 return "" 

1138 if name == "Han simplified forms":  [1138 ↛ 1140: condition never true]

1139 # XXX extract? 

1140 return "" 

1141 # if name == "ja-kanji forms": 

1142 # # XXX extract? 

1143 # return "" 

1144 # if name == "vi-readings": 

1145 # # XXX extract? 

1146 # return "" 

1147 # if name == "ja-kanji": 

1148 # # XXX extract? 

1149 # return "" 

1150 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 1150 ↛ 1152line 1150 didn't jump to line 1152 because the condition on line 1150 was never true

1151 # XXX extract? 

1152 return "" 

1153 

1154 return None 

1155 

1156 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1157 """Parses the subsection for a part-of-speech under a language on 

1158 a page.""" 

1159 assert isinstance(posnode, WikiNode) 

1160 assert isinstance(pos, str) 

1161 # print("parse_part_of_speech", pos) 

1162 pos_data["pos"] = pos 

1163 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1164 lists: list[list[WikiNode]] = [[]] # list of lists 

1165 first_para = True 

1166 first_head_tmplt = True 

1167 collecting_head = True 

1168 start_of_paragraph = True 

1169 

1170 # XXX extract templates from posnode with recursively_extract 

1171 # that break stuff, like ja-kanji or az-suffix-form. 

1172 # Do the extraction with a list of template names, combined from 

1173 # different lists, then separate out them into different lists 

1174 # that are handled at different points of the POS section. 

1175 # First, extract az-suffix-form, put it in `inflection`, 

1176 # and parse `inflection`'s content when appropriate later. 

1177 # The contents of az-suffix-form (and ja-kanji) that generate 

1178 # divs with "floatright" in their style gets deleted by 

1179 # clean_value, so templates that slip through from here won't 

1180 # break anything. 

1181 # XXX bookmark 

1182 # print("===================") 

1183 # print(posnode.children) 

1184 

1185 floaters, poschildren = recursively_extract( 

1186 posnode.children, 

1187 lambda x: ( 

1188 isinstance(x, WikiNode) 

1189 and ( 

1190 ( 

1191 x.kind == NodeKind.TEMPLATE 

1192 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES 

1193 ) 

1194 or ( 

1195 x.kind == NodeKind.LINK 

1196 # Need to check for stringiness because some links are 

1197 # broken; for example, if a template is missing an 

1198 # argument, a link might look like `[[{{{1}}}...]]` 

1199 and isinstance(x.largs[0][0], str) 

1200 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] 

1201 ) 

1202 ) 

1203 ), 

1204 ) 

1205 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1206 tempnode.largs = [["Inflection"]] 

1207 tempnode.children = floaters 

1208 parse_inflection(tempnode, "Floating Div", pos) 

1209 # print(poschildren) 

1210 # XXX new above 

1211 

1212 if not poschildren:  [1212 ↛ 1213: condition never true]

1213 if not floaters: 

1214 wxr.wtp.debug( 

1215 "PoS section without contents", 

1216 sortid="en/page/1051/20230612", 

1217 ) 

1218 else: 

1219 wxr.wtp.debug( 

1220 "PoS section without contents except for a floating table", 

1221 sortid="en/page/1056/20230612", 

1222 ) 

1223 return 

1224 

1225 for node in poschildren: 

1226 if isinstance(node, str): 

1227 for m in re.finditer(r"\n+|[^\n]+", node): 

1228 p = m.group(0) 

1229 if p.startswith("\n\n") and pre: 

1230 first_para = False 

1231 start_of_paragraph = True 

1232 break 

1233 if p and collecting_head: 

1234 pre[-1].append(p) 

1235 continue 

1236 assert isinstance(node, WikiNode) 

1237 kind = node.kind 

1238 if kind == NodeKind.LIST: 

1239 lists[-1].append(node) 

1240 collecting_head = False 

1241 start_of_paragraph = True 

1242 continue 

1243 elif kind in LEVEL_KINDS: 

1244 # Stop parsing section if encountering any kind of 

1245 # level header (like ===Noun=== or ====Further Reading====). 

1246 # At a quick glance, this should be the default behavior, 

1247 # but if some kinds of source articles have sub-sub-sections 

1248 # that should be parsed XXX it should be handled by changing 

1249 # this break. 

1250 break 

1251 elif collecting_head and kind == NodeKind.LINK:  [1251 ↛ 1254: condition never true]

1252 # We might collect relevant links as they are often pictures 

1253 # relating to the word 

1254 if len(node.largs[0]) >= 1 and isinstance( 

1255 node.largs[0][0], str 

1256 ): 

1257 if node.largs[0][0].startswith( 

1258 ns_title_prefix_tuple(wxr, "Category") 

1259 ): 

1260 # [[Category:...]] 

1261 # We're at the end of the file, probably, so stop 

1262 # here. Otherwise the head will get garbage. 

1263 break 

1264 if node.largs[0][0].startswith( 

1265 ns_title_prefix_tuple(wxr, "File") 

1266 ): 

1267 # Skips file links 

1268 continue 

1269 start_of_paragraph = False 

1270 pre[-1].extend(node.largs[-1]) 

1271 elif kind == NodeKind.HTML: 

1272 if node.sarg == "br": 1272 ↛ 1278line 1272 didn't jump to line 1278 because the condition on line 1272 was always true

1273 if pre[-1]:  [1273 ↛ 1225: condition always true]

1274 pre.append([]) # Switch to next head 

1275 lists.append([]) # Lists parallels pre 

1276 collecting_head = True 

1277 start_of_paragraph = True 

1278 elif collecting_head and node.sarg not in ( 

1279 "gallery", 

1280 "ref", 

1281 "cite", 

1282 "caption", 

1283 ): 

1284 start_of_paragraph = False 

1285 pre[-1].append(node) 

1286 else: 

1287 start_of_paragraph = False 

1288 elif isinstance(node, TemplateNode): 

1289 # XXX Insert code here that disambiguates between 

1290 # templates that generate word heads and templates 

1291 # that don't. 

1292 # There's head_tag_re that seems like a regex meant 

1293 # to identify head templates. Too bad it's None. 

1294 

1295 # ignore {{category}}, {{cat}}... etc. 

1296 if node.template_name in stop_head_at_these_templates: 

1297 # we've reached a template that should be at the end, 

1298 continue 

1299 

1300 # skip these templates; panel_templates is already used 

1301 # to skip certain templates elsewhere, but it also applies to 

1302 # head parsing quite well. 

1303 # node.largs[0][0] should always be str, but can't type-check 

1304 # that. 

1305 if is_panel_template(wxr, node.template_name):  [1305 ↛ 1306: condition never true]

1306 continue 

1307 # skip these templates 

1308 # if node.largs[0][0] in skip_these_templates_in_head: 

1309 # first_head_tmplt = False # no first_head_tmplt at all 

1310 # start_of_paragraph = False 

1311 # continue 

1312 

1313 if first_head_tmplt and pre[-1]: 

1314 first_head_tmplt = False 

1315 start_of_paragraph = False 

1316 pre[-1].append(node) 

1317 elif pre[-1] and start_of_paragraph: 

1318 pre.append([]) # Switch to the next head 

1319 lists.append([]) # lists parallel pre 

1320 collecting_head = True 

1321 start_of_paragraph = False 

1322 pre[-1].append(node) 

1323 else: 

1324 pre[-1].append(node) 

1325 elif first_para:  [1325 ↛ 1225: condition always true]

1326 start_of_paragraph = False 

1327 if collecting_head:  [1327 ↛ 1225: condition always true]

1328 pre[-1].append(node) 

1329 # XXX use template_fn in clean_node to check that the head macro 

1330 # is compatible with the current part-of-speech and generate warning 

1331 # if not. Use template_allowed_pos_map. 

1332 

1333 # Clean up empty pairs, and fix messes with extra newlines that 

1334 # separate templates that are followed by lists (wiktextract issue #314) 

1335 

1336 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1337 cleaned_lists: list[list[WikiNode]] = [] 

1338 pairless_pre_index = None 

1339 

1340 for pre1, ls in zip(pre, lists): 

1341 if pre1 and not ls: 

1342 pairless_pre_index = len(cleaned_pre) 

1343 if not pre1 and not ls:  [1343 ↛ 1345: condition never true]

1344 # skip [] + [] 

1345 continue 

1346 if not ls and all( 

1347 (isinstance(x, str) and not x.strip()) for x in pre1 

1348 ): 

1349 # skip ["\n", " "] + [] 

1350 continue 

1351 if ls and not pre1: 

1352 if pairless_pre_index is not None:  [1352 ↛ 1353: condition never true]

1353 cleaned_lists[pairless_pre_index] = ls 

1354 pairless_pre_index = None 

1355 continue 

1356 cleaned_pre.append(pre1) 

1357 cleaned_lists.append(ls) 

1358 

1359 pre = cleaned_pre 

1360 lists = cleaned_lists 

1361 

1362 there_are_many_heads = len(pre) > 1 

1363 header_tags: list[str] = [] 

1364 header_topics: list[str] = [] 

1365 previous_head_had_list = False 

1366 

1367 if not any(g for g in lists): 

1368 process_gloss_without_list( 

1369 poschildren, pos, pos_data, header_tags, header_topics 

1370 ) 

1371 else: 

1372 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1373 # if len(ls) == 0: 

1374 # # don't have gloss list 

1375 # # XXX add code here to filter out 'garbage', like text 

1376 # # that isn't a head template or head. 

1377 # continue 

1378 

1379 if all(not sl for sl in lists[i:]):  [1379 ↛ 1380: condition never true]

1380 if i == 0: 

1381 if isinstance(node, str): 

1382 wxr.wtp.debug( 

1383 "first head without list of senses," 

1384 "string: '{}[...]', {}/{}".format( 

1385 node[:20], word, language 

1386 ), 

1387 sortid="page/1689/20221215", 

1388 ) 

1389 if isinstance(node, WikiNode): 

1390 if node.largs and node.largs[0][0] in [ 

1391 "Han char", 

1392 ]: 

1393 # just ignore these templates 

1394 pass 

1395 else: 

1396 wxr.wtp.debug( 

1397 "first head without " 

1398 "list of senses, " 

1399 "template node " 

1400 "{}, {}/{}".format( 

1401 node.largs, word, language 

1402 ), 

1403 sortid="page/1694/20221215", 

1404 ) 

1405 else: 

1406 wxr.wtp.debug( 

1407 "first head without list of senses, " 

1408 "{}/{}".format(word, language), 

1409 sortid="page/1700/20221215", 

1410 ) 

1411 # no break here so that the first head always 

1412 # gets processed. 

1413 else: 

1414 if isinstance(node, str): 

1415 wxr.wtp.debug( 

1416 "later head without list of senses," 

1417 "string: '{}[...]', {}/{}".format( 

1418 node[:20], word, language 

1419 ), 

1420 sortid="page/1708/20221215", 

1421 ) 

1422 if isinstance(node, WikiNode): 

1423 wxr.wtp.debug( 

1424 "later head without list of senses," 

1425 "template node " 

1426 "{}, {}/{}".format( 

1427 node.sarg if node.sarg else node.largs, 

1428 word, 

1429 language, 

1430 ), 

1431 sortid="page/1713/20221215", 

1432 ) 

1433 else: 

1434 wxr.wtp.debug( 

1435 "later head without list of senses, " 

1436 "{}/{}".format(word, language), 

1437 sortid="page/1719/20221215", 

1438 ) 

1439 break 

1440 head_group = i + 1 if there_are_many_heads else None 

1441 # print("parse_part_of_speech: {}: {}: pre={}" 

1442 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1443 

1444 if previous_head_had_list:  [1444 ↛ 1449: condition never true]

1445 # We use a boolean flag here because we want to be able to 

1446 # let the header_tags data pass through after the loop 

1447 # is over without accidentally emptying it, if there are 

1448 # no pos_datas and we need dummy data. 

1449 header_tags.clear() 

1450 header_topics.clear() 

1451 

1452 process_gloss_header( 

1453 pre1, pos, head_group, pos_data, header_tags, header_topics 

1454 ) 

1455 for ln in ls: 

1456 # Parse each list associated with this head. 

1457 for node in ln.children: 

1458 # Parse nodes in l.children recursively. 

1459 # The recursion function uses push_sense() to 

1460 # add stuff into pos_data, and returns True or 

1461 # False if something is added, which bubbles upward. 

1462 # If the bubble is "True", then higher levels of 

1463 # the recursion will not push_sense(), because 

1464 # the data is already pushed into a sub-gloss 

1465 # downstream, unless the higher level has examples 

1466 # that need to be put somewhere. 

1467 common_data: SenseData = { 

1468 "tags": list(header_tags), 

1469 "topics": list(header_topics), 

1470 } 

1471 if head_group: 

1472 common_data["head_nr"] = head_group 

1473 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1474 

1475 if len(ls) > 0: 

1476 previous_head_had_list = True 

1477 else: 

1478 previous_head_had_list = False 

1479 

1480 # If there are no senses extracted, add a dummy sense. We want to 

1481 # keep tags extracted from the head for the dummy sense. 

1482 push_sense() # Make sure unfinished data pushed, and start clean sense 

1483 if len(pos_datas) == 0: 

1484 data_extend(sense_data, "tags", header_tags) 

1485 data_extend(sense_data, "topics", header_topics) 

1486 data_append(sense_data, "tags", "no-gloss") 

1487 push_sense() 

1488 

1489 def process_gloss_header( 

1490 header_nodes: list[Union[WikiNode, str]], 

1491 pos_type: str, 

1492 header_group: Optional[int], 

1493 pos_data: WordData, 

1494 header_tags: list[str], 

1495 header_topics: list[str], 

1496 ) -> None: 

1497 ruby = [] 

1498 links: list[str] = [] 

1499 

1500 # process template parse nodes here 

1501 new_nodes = [] 

1502 info_template_data = [] 

1503 for node in header_nodes: 

1504 # print(f"{node=}") 

1505 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1506 if info_data or info_out: 

1507 if info_data:  [1507 ↛ 1509: condition always true]

1508 info_template_data.append(info_data) 

1509 if info_out: # including just the original node  [1509 ↛ 1510: condition never true]

1510 new_nodes.append(info_out) 

1511 else: 

1512 new_nodes.append(node) 

1513 header_nodes = new_nodes 

1514 

1515 if info_template_data: 

1516 if "info_templates" not in pos_data: 1516 ↛ 1519line 1516 didn't jump to line 1519 because the condition on line 1516 was always true

1517 pos_data["info_templates"] = info_template_data 

1518 else: 

1519 pos_data["info_templates"].extend(info_template_data) 

1520 

1521 if not word.isalnum(): 

1522 # if the word contains non-letter or -number characters, it might 

1523 # have something that messes with split-at-semi-comma; we collect 

1524 # links so that we can skip splitting them. 

1525 exp = wxr.wtp.parse( 

1526 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1527 ) 

1528 link_nodes, _ = recursively_extract( 

1529 exp.children, 

1530 lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.LINK, 

1531 ) 

1532 for ln in link_nodes: 

1533 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1534 if not ltext.isalnum(): 

1535 links.append(ltext) 

1536 if word not in links:  [1536 ↛ 1538: condition always true]

1537 links.append(word) 

1538 if lang_code == "ja": 

1539 exp = wxr.wtp.parse( 

1540 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1541 ) 

1542 rub, _ = recursively_extract( 

1543 exp.children, 

1544 lambda x: isinstance(x, WikiNode) 

1545 and x.kind == NodeKind.HTML 

1546 and x.sarg == "ruby", 

1547 ) 

1548 if rub is not None:  [1548 ↛ 1557: condition always true]

1549 for r in rub:  [1549 ↛ 1550: loop never started]

1550 if TYPE_CHECKING: 

1551 # we know the lambda above in recursively_extract 

1552 # returns only WikiNodes in rub 

1553 assert isinstance(r, WikiNode) 

1554 rt = parse_ruby(wxr, r) 

1555 if rt is not None: 

1556 ruby.append(rt) 

1557 header_text = clean_node( 

1558 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1559 ) 

1560 

1561 term_label_tags: list[str] = [] 

1562 term_label_topics: list[str] = [] 

1563 if len(term_label_templates) > 0: 

1564 # parse term label templates; if there are other similar kinds 

1565 # of templates in headers that you want to squash and apply as 

1566 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES 

1567 for templ_data in term_label_templates: 

1568 # print(templ_data) 

1569 expan = templ_data.get("expansion", "").strip("().,; ") 

1570 if not expan:  [1570 ↛ 1571: condition never true]

1571 continue 

1572 tlb_tagsets, tlb_topics = decode_tags(expan) 

1573 for tlb_tags in tlb_tagsets: 

1574 if len(tlb_tags) > 0 and not any(  [1574 ↛ 1573: condition always true]

1575 t.startswith("error-") for t in tlb_tags 

1576 ): 

1577 term_label_tags.extend(tlb_tags) 

1578 term_label_topics.extend(tlb_topics) 

1579 # print(f"{tlb_tagsets=}, {tlb_topicsets=}") 

1580 

1581 header_text = re.sub(r"\s+", " ", header_text) 

1582 # print(f"{header_text=}") 

1583 parse_word_head( 

1584 wxr, 

1585 pos_type, 

1586 header_text, 

1587 pos_data, 

1588 is_reconstruction, 

1589 header_group, 

1590 ruby=ruby, 

1591 links=links, 

1592 ) 

1593 if "tags" in pos_data: 

1594 # pos_data can get "tags" data from some source; type-checkers 

1595 # don't like it, so let's ignore it. 

1596 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1597 del pos_data["tags"] # type: ignore[typeddict-item] 

1598 if len(term_label_tags) > 0: 

1599 header_tags.extend(term_label_tags) 

1600 if len(term_label_topics) > 0: 

1601 header_topics.extend(term_label_topics) 

1602 

1603 def process_gloss_without_list( 

1604 nodes: list[Union[WikiNode, str]], 

1605 pos_type: str, 

1606 pos_data: WordData, 

1607 header_tags: list[str], 

1608 header_topics: list[str], 

1609 ) -> None: 

1610 # gloss text might not be inside a list 

1611 header_nodes: list[Union[str, WikiNode]] = [] 

1612 gloss_nodes: list[Union[str, WikiNode]] = [] 

1613 for node in strip_nodes(nodes): 

1614 if isinstance(node, WikiNode): 

1615 if isinstance(node, TemplateNode): 

1616 if node.template_name in ( 

1617 "zh-see", 

1618 "ja-see", 

1619 "ja-see-kango", 

1620 ): 

1621 continue # soft redirect 

1622 elif (  [1622 ↛ 1630: condition always true]

1623 node.template_name == "head" 

1624 or node.template_name.startswith(f"{lang_code}-") 

1625 ): 

1626 header_nodes.append(node) 

1627 continue 

1628 elif node.kind in LEVEL_KINDS: # following nodes are not gloss  [1628 ↛ 1630: condition always true]

1629 break 

1630 gloss_nodes.append(node) 

1631 

1632 if len(header_nodes) > 0: 

1633 process_gloss_header( 

1634 header_nodes, 

1635 pos_type, 

1636 None, 

1637 pos_data, 

1638 header_tags, 

1639 header_topics, 

1640 ) 

1641 if len(gloss_nodes) > 0: 

1642 process_gloss_contents( 

1643 gloss_nodes, 

1644 pos_type, 

1645 {"tags": list(header_tags), "topics": list(header_topics)}, 

1646 ) 

1647 

1648 def parse_sense_node( 

1649 node: Union[str, WikiNode], # never receives str 

1650 sense_base: SenseData, 

1651 pos: str, 

1652 ) -> bool: 

1653 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1654 Uses push_sense() to attempt adding data to pos_data in the scope 

1655 of parse_language() when it reaches deep in the recursion. push_sense() 

1656 returns True if it succeeds, and that is bubbled up the stack; if 

1657 a sense was added downstream, the higher levels (whose shared data 

1658 was already added by a subsense) do not push_sense(), unless it 

1659 has examples that need to be put somewhere. 

1660 """ 

1661 assert isinstance(sense_base, dict) # Added to every sense deeper in 

1662 if not isinstance(node, WikiNode):  [1662 ↛ 1664: condition never true]

1663 # This doesn't seem to ever happen in practice. 

1664 wxr.wtp.debug( 

1665 "{}: parse_sense_node called with" 

1666 "something that isn't a WikiNode".format(pos), 

1667 sortid="page/1287/20230119", 

1668 ) 

1669 return False 

1670 

1671 if node.kind != NodeKind.LIST_ITEM:  [1671 ↛ 1672: condition never true]

1672 wxr.wtp.debug( 

1673 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1674 ) 

1675 return False 

1676 

1677 if node.sarg == ":": 1677 ↛ 1683line 1677 didn't jump to line 1683 because the condition on line 1677 was never true

1678 # Skip example entries at the highest level, ones without 

1679 # a sense ("...#") above them. 

1680 # If node.sarg is exactly and only ":", then it's at 

1681 # the highest level; lower levels would have more 

1682 # "indentation", like "#:" or "##:" 

1683 return False 

1684 

1685 # If a recursion call succeeds in push_sense(), bubble it up with 

1686 # `added`. 

1687 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1688 added = False 

1689 

1690 gloss_template_args: set[str] = set() 

1691 

1692 # For LISTs and LIST_ITEMS, their argument is something like 

1693 # "##" or "##:", and using that we can rudimentally determine 

1694 # list 'depth' if need be, and also what kind of list or 

1695 # entry it is; # is for normal glosses, : for examples (indent) 

1696 # and * is used for quotations on wiktionary. 

1697 current_depth = node.sarg 

1698 

1699 children = node.children 

1700 

1701 # subentries, (presumably) a list 

1702 # of subglosses below this. The list's 

1703 # argument ends with #, and its depth should 

1704 # be bigger than parent node. 

1705 subentries = [ 

1706 x 

1707 for x in children 

1708 if isinstance(x, WikiNode) 

1709 and x.kind == NodeKind.LIST 

1710 and x.sarg == current_depth + "#" 

1711 ] 

1712 

1713 # sublists of examples and quotations. .sarg 

1714 # does not end with "#". 

1715 others = [ 

1716 x 

1717 for x in children 

1718 if isinstance(x, WikiNode) 

1719 and x.kind == NodeKind.LIST 

1720 and x.sarg != current_depth + "#" 

1721 ] 

1722 

1723 # the actual contents of this particular node. 

1724 # can be a gloss (or a template that expands into 

1725 # many glosses which we can't easily pre-expand) 

1726 # or could be an "outer gloss" with more specific 

1727 # subglosses, or could be a qualifier for the subglosses. 

1728 contents = [ 

1729 x 

1730 for x in children 

1731 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1732 ] 

1733 # If this entry has sublists of entries, we should combine 

1734 # gloss information from both the "outer" and sublist content. 

1735 # Sometimes the outer gloss 

1736 # is more non-gloss or tags, sometimes it is a coarse sense 

1737 # and the inner glosses are more specific. The outer one 

1738 # does not seem to have qualifiers. 

1739 

1740 # If we have one sublist with one element, treat it 

1741 # specially as it may be a Wiktionary error; raise 

1742 # that nested element to the same level. 

1743 # XXX If need be, this block can be easily removed in 

1744 # the current recursive logic and the result is one sense entry 

1745 # with both glosses in the glosses list, as you would 

1746 # expect. If the higher entry has examples, there will 

1747 # be a higher entry with some duplicated data. 

1748 if len(subentries) == 1: 

1749 slc = subentries[0].children 

1750 if len(slc) == 1: 1750 ↛ 1753line 1750 didn't jump to line 1753 because the condition on line 1750 was never true

1751 # copy current node and modify it so it doesn't 

1752 # loop infinitely. 

1753 cropped_node = copy.copy(node) 

1754 cropped_node.children = [ 

1755 x 

1756 for x in children 

1757 if not ( 

1758 isinstance(x, WikiNode) 

1759 and x.kind == NodeKind.LIST 

1760 and x.sarg == current_depth + "#" 

1761 ) 

1762 ] 

1763 added |= parse_sense_node(cropped_node, sense_base, pos) 

1764 nonlocal sense_data # without this kludge, raw_ 

1765 # glosses data gets duplicated; 

1766 # if the top-level (cropped_node) 

1767 # does not push_sense() properly or 

1768 # parse_sense_node() returns early, 

1769 # sense_data is not reset. This happens 

1770 # for example when you have a no-gloss 

1771 # string like "(intransitive)": 

1772 # no gloss, push_sense() returns early 

1773 # and sense_data has duplicate data with 

1774 # sense_base 

1775 sense_data = {} 

1776 added |= parse_sense_node(slc[0], sense_base, pos) 

1777 return added 

1778 

1779 return process_gloss_contents( 

1780 contents, 

1781 pos, 

1782 sense_base, 

1783 subentries, 

1784 others, 

1785 gloss_template_args, 

1786 added, 

1787 ) 

1788 

1789 def process_gloss_contents( 

1790 contents: list[Union[str, WikiNode]], 

1791 pos: str, 

1792 sense_base: SenseData, 

1793 subentries: list[WikiNode] = [], 

1794 others: list[WikiNode] = [], 

1795 gloss_template_args: Set[str] = set(), 

1796 added: bool = False, 

1797 ) -> bool: 

1798 def sense_template_fn( 

1799 name: str, ht: TemplateArgs, is_gloss: bool = False 

1800 ) -> Optional[str]: 

1801 # print(f"sense_template_fn: {name}, {ht}") 

1802 if name in wikipedia_templates: 1802 ↛ 1804line 1802 didn't jump to line 1804 because the condition on line 1802 was never true

1803 # parse_wikipedia_template(wxr, pos_data, ht) 

1804 return None 

1805 if is_panel_template(wxr, name): 1805 ↛ 1806line 1805 didn't jump to line 1806 because the condition on line 1805 was never true

1806 return "" 

1807 if name in INFO_TEMPLATE_FUNCS: 

1808 info_data, info_exp = parse_info_template_arguments( 

1809 wxr, name, ht, "sense" 

1810 ) 

1811 if info_data or info_exp: 1811 ↛ 1817line 1811 didn't jump to line 1817 because the condition on line 1811 was always true

1812 if info_data: 1812 ↛ 1814line 1812 didn't jump to line 1814 because the condition on line 1812 was always true

1813 data_append(sense_base, "info_templates", info_data) 

1814 if info_exp and isinstance(info_exp, str): 1814 ↛ 1816line 1814 didn't jump to line 1816 because the condition on line 1814 was always true

1815 return info_exp 

1816 return "" 

1817 if name in ("defdate",): 1817 ↛ 1818line 1817 didn't jump to line 1818 because the condition on line 1817 was never true

1818 return "" 

1819 if name == "senseid": 1819 ↛ 1820line 1819 didn't jump to line 1820 because the condition on line 1819 was never true

1820 langid = clean_node(wxr, None, ht.get(1, ())) 

1821 arg = clean_node(wxr, sense_base, ht.get(2, ())) 
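# Wikidata item ids look like "Q42"; those are stored separately 

# under "wikidata", and the full "<langid>:<arg>" pair is always 

# recorded as the sense id. 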

1822 if re.match(r"Q\d+$", arg): 

1823 data_append(sense_base, "wikidata", arg) 

1824 data_append(sense_base, "senseid", langid + ":" + arg) 

1825 if name in sense_linkage_templates: 1825 ↛ 1827line 1825 didn't jump to line 1827 because the condition on line 1825 was never true

1826 # print(f"SENSE_TEMPLATE_FN: {name}") 

1827 parse_sense_linkage(wxr, sense_base, name, ht) 

1828 return "" 

1829 if name == "†" or name == "zh-obsolete": 1829 ↛ 1830line 1829 didn't jump to line 1830 because the condition on line 1829 was never true

1830 data_append(sense_base, "tags", "obsolete") 

1831 return "" 

1832 if name in { 

1833 "ux", 

1834 "uxi", 

1835 "usex", 

1836 "afex", 

1837 "prefixusex", 

1838 "ko-usex", 

1839 "ko-x", 

1840 "hi-x", 

1841 "ja-usex-inline", 

1842 "ja-x", 

1843 "quotei", 

1844 "he-x", 

1845 "hi-x", 

1846 "km-x", 

1847 "ne-x", 

1848 "shn-x", 

1849 "th-x", 

1850 "ur-x", 

1851 }: 

1852 # Usage examples are captured separately below. We don't 

1853 # want to expand them into glosses even when unusual coding 

1854 # is used in the entry. 

1855 # These templates may slip through inside another item, but 

1856 # currently we're separating out example entries (..#:) 

1857 # well enough that there seems to be very little contamination. 

1858 if is_gloss: 1858 ↛ 1864line 1858 didn't jump to line 1864 because the condition on line 1858 was always true

1859 wxr.wtp.warning( 

1860 "Example template is used for gloss text", 

1861 sortid="extractor.en.page.sense_template_fn/1415", 

1862 ) 

1863 else: 

1864 return "" 

1865 if name == "w": 1865 ↛ 1866line 1865 didn't jump to line 1866 because the condition on line 1865 was never true

1866 if ht.get(2) == "Wp": 

1867 return "" 

1868 for k, v in ht.items(): 

1869 v = v.strip() 

1870 if v and "<" not in v: 1870 ↛ 1868line 1870 didn't jump to line 1868 because the condition on line 1870 was always true

1871 gloss_template_args.add(v) 

1872 return None 

1873 

1874 def extract_link_texts(item: GeneralNode) -> None: 

1875 """Recursively extracts link texts from the gloss source. This 

1876 information is used to select whether to remove final "." from 

1877 form_of/alt_of (e.g., ihm/Hunsrik).""" 

1878 if isinstance(item, (list, tuple)): 

1879 for x in item: 

1880 extract_link_texts(x) 

1881 return 

1882 if isinstance(item, str): 

1883 # There seem to be HTML sections that may further contain 

1884 # unparsed links. 

1885 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1885 ↛ 1886line 1885 didn't jump to line 1886 because the loop on line 1885 never started

1886 print("ITER:", m.group(0)) 

1887 v = m.group(1).split("|")[-1].strip() 

1888 if v: 

1889 gloss_template_args.add(v) 

1890 return 

1891 if not isinstance(item, WikiNode): 1891 ↛ 1892line 1891 didn't jump to line 1892 because the condition on line 1891 was never true

1892 return 

1893 if item.kind == NodeKind.LINK: 

1894 v = item.largs[-1] 

1895 if ( 1895 ↛ 1901line 1895 didn't jump to line 1901 because the condition on line 1895 was always true

1896 isinstance(v, list) 

1897 and len(v) == 1 

1898 and isinstance(v[0], str) 

1899 ): 

1900 gloss_template_args.add(v[0].strip()) 

1901 for x in item.children: 

1902 extract_link_texts(x) 

1903 

1904 extract_link_texts(contents) 

1905 

1906 # get the raw text of non-list contents of this node, and other stuff 

1907 # like tag and category data added to sense_base 

1908 # cast = no-op type-setter for the type-checker 

1909 partial_template_fn = cast( 

1910 TemplateFnCallable, 

1911 partial(sense_template_fn, is_gloss=True), 

1912 ) 

1913 rawgloss = clean_node( 

1914 wxr, 

1915 sense_base, 

1916 contents, 

1917 template_fn=partial_template_fn, 

1918 collect_links=True, 

1919 ) 

1920 

1921 if not rawgloss: 1921 ↛ 1922line 1921 didn't jump to line 1922 because the condition on line 1921 was never true

1922 return False 

1923 

1924 # remove manually typed ordered list text at the start("1. ") 

1925 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 
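# e.g. "1.  To run quickly" -> "To run quickly" 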

1926 

1927 # get stuff like synonyms and categories from "others", 

1928 # maybe examples and quotations 

1929 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

1930 

1931 # The gloss could contain templates that produce more list items. 

1932 # This happens commonly with, e.g., {{inflection of|...}}. Split 

1933 # to parts. However, e.g. Interlingua generates multiple glosses 

1934 # in HTML directly without Wikitext markup, so we must also split 

1935 # by just newlines. 

1936 subglosses = rawgloss.splitlines() 
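# e.g. an {{inflection of|...}} expansion may produce several 

# "#"-prefixed lines here, one per generated gloss. 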

1937 

1938 if len(subglosses) == 0: 1938 ↛ 1939line 1938 didn't jump to line 1939 because the condition on line 1938 was never true

1939 return False 

1940 

1941 if any(s.startswith("#") for s in subglosses): 

1942 subtree = wxr.wtp.parse(rawgloss) 

1943 # from wikitextprocessor.parser import print_tree 

1944 # print("SUBTREE GENERATED BY TEMPLATE:") 

1945 # print_tree(subtree) 

1946 new_subentries = [ 

1947 x 

1948 for x in subtree.children 

1949 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

1950 ] 

1951 

1952 new_others = [ 

1953 x 

1954 for x in subtree.children 

1955 if isinstance(x, WikiNode) 

1956 and x.kind == NodeKind.LIST 

1957 and not x.sarg.endswith("#") 

1958 ] 

1959 

1960 new_contents = [ 

1961 clean_node(wxr, [], x) 

1962 for x in subtree.children 

1963 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1964 ] 

1965 

1966 subentries = subentries or new_subentries 

1967 others = others or new_others 

1968 subglosses = new_contents 

1969 rawgloss = "".join(subglosses) 

1970 # Generate no gloss for translation hub pages, but add the 

1971 # "translation-hub" tag for them 

1972 if rawgloss == "(This entry is a translation hub.)": 1972 ↛ 1973line 1972 didn't jump to line 1973 because the condition on line 1972 was never true

1973 data_append(sense_data, "tags", "translation-hub") 

1974 return push_sense() 

1975 

1976 # Remove certain substrings specific to outer glosses 

1977 strip_ends = [", particularly:"] 

1978 for x in strip_ends: 

1979 if rawgloss.endswith(x): 1979 ↛ 1980line 1979 didn't jump to line 1980 because the condition on line 1979 was never true

1980 rawgloss = rawgloss[: -len(x)].strip() 

1981 break 

1982 

1983 # A single gloss, or possibly an outer gloss. 

1984 # Check if the possible outer gloss starts with 

1985 # parenthesized tags/topics 

1986 

1987 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 1987 ↛ 1989line 1987 didn't jump to line 1989 because the condition on line 1987 was always true

1988 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

1989 m = QUALIFIERS_RE.match(rawgloss) 

1990 # (...): ... or (...(...)...): ... 
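# e.g. "(transitive, informal): to bail" should put 

# "transitive, informal" in group 1 (assuming QUALIFIERS_RE captures 

# the parenthesized prefix, as its use here implies). 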

1991 if m: 

1992 q = m.group(1) 

1993 rawgloss = rawgloss[m.end() :].strip() 

1994 parse_sense_qualifier(wxr, q, sense_base) 

1995 if rawgloss == "A pejorative:": 1995 ↛ 1996line 1995 didn't jump to line 1996 because the condition on line 1995 was never true

1996 data_append(sense_base, "tags", "pejorative") 

1997 rawgloss = "" 

1998 elif rawgloss == "Short forms.": 1998 ↛ 1999line 1998 didn't jump to line 1999 because the condition on line 1998 was never true

1999 data_append(sense_base, "tags", "abbreviation") 

2000 rawgloss = "" 

2001 elif rawgloss == "Technical or specialized senses.": 2001 ↛ 2002line 2001 didn't jump to line 2002 because the condition on line 2001 was never true

2002 rawgloss = "" 

2003 elif rawgloss.startswith("inflection of "): 

2004 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

2005 if parsed is not None: 2005 ↛ 2014line 2005 didn't jump to line 2014 because the condition on line 2005 was always true

2006 tags, origins = parsed 

2007 if origins is not None: 2007 ↛ 2009line 2007 didn't jump to line 2009 because the condition on line 2007 was always true

2008 data_extend(sense_base, "form_of", origins) 

2009 if tags is not None: 2009 ↛ 2012line 2009 didn't jump to line 2012 because the condition on line 2009 was always true

2010 data_extend(sense_base, "tags", tags) 

2011 else: 

2012 data_append(sense_base, "tags", "form-of") 

2013 else: 

2014 data_append(sense_base, "tags", "form-of") 

2015 if rawgloss: 2015 ↛ 2046line 2015 didn't jump to line 2046 because the condition on line 2015 was always true

2016 # Code duplicating a lot of clean-up operations from later in 

2017 # this block. We want to clean up the "supergloss" as much as 

2018 # possible, in almost the same way as a normal gloss. 

2019 supergloss = rawgloss 

2020 

2021 if supergloss.startswith("; "): 2021 ↛ 2022line 2021 didn't jump to line 2022 because the condition on line 2021 was never true

2022 supergloss = supergloss[1:].strip() 

2023 

2024 if supergloss.startswith(("^†", "†")): 

2025 data_append(sense_base, "tags", "obsolete") 

2026 supergloss = supergloss[2:].strip() 

2027 elif supergloss.startswith("^‡"): 2027 ↛ 2028line 2027 didn't jump to line 2028 because the condition on line 2027 was never true

2028 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

2029 supergloss = supergloss[2:].strip() 

2030 

2031 # remove [14th century...] style brackets at the end 

2032 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 
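# e.g. "A stronghold. [from 14th c.]" -> "A stronghold." 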

2033 

2034 if supergloss.startswith((",", ":")): 2034 ↛ 2035line 2034 didn't jump to line 2035 because the condition on line 2034 was never true

2035 supergloss = supergloss[1:] 

2036 supergloss = supergloss.strip() 

2037 if supergloss.startswith("N. of "): 2037 ↛ 2038line 2037 didn't jump to line 2038 because the condition on line 2037 was never true

2038 supergloss = "Name of " + supergloss[6:] 

2039 supergloss = supergloss[2:] 

2040 data_append(sense_base, "glosses", supergloss) 

2041 if supergloss in ("A person:",): 2041 ↛ 2042line 2041 didn't jump to line 2042 because the condition on line 2041 was never true

2042 data_append(sense_base, "tags", "g-person") 

2043 

2044 # The main recursive call (except for the exceptions at the 

2045 # start of this function). 

2046 for sublist in subentries: 

2047 if not ( 2047 ↛ 2050line 2047 didn't jump to line 2050 because the condition on line 2047 was never true

2048 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

2049 ): 

2050 wxr.wtp.debug( 

2051 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

2052 f"with items that are not LISTs", 

2053 sortid="page/1511/20230119", 

2054 ) 

2055 continue 

2056 for item in sublist.children: 

2057 if not ( 2057 ↛ 2061line 2057 didn't jump to line 2061 because the condition on line 2057 was never true

2058 isinstance(item, WikiNode) 

2059 and item.kind == NodeKind.LIST_ITEM 

2060 ): 

2061 continue 

2062 # copy sense_base to prevent cross-contamination between 

2063 # subglosses and other subglosses and superglosses 

2064 sense_base2 = copy.deepcopy(sense_base) 

2065 if parse_sense_node(item, sense_base2, pos): 2065 ↛ 2056line 2065 didn't jump to line 2056 because the condition on line 2065 was always true

2066 added = True 

2067 

2068 # Capture examples. 

2069 # This is called after the recursive calls above so that 

2070 # sense_base is not contaminated with meta-data from 

2071 # example entries for *this* gloss. 

2072 examples = [] 

2073 if wxr.config.capture_examples: 2073 ↛ 2077line 2073 didn't jump to line 2077 because the condition on line 2073 was always true

2074 examples = extract_examples(others, sense_base) 

2075 

2076 # push_sense() succeeded somewhere down-river, so skip this level 

2077 if added: 

2078 if examples: 

2079 # this higher-up gloss has examples that we do not want to skip 

2080 wxr.wtp.debug( 

2081 "'{}[...]' gloss has examples we want to keep, " 

2082 "but there are subglosses.".format(repr(rawgloss[:30])), 

2083 sortid="page/1498/20230118", 

2084 ) 

2085 else: 

2086 return True 

2087 

2088 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2089 # after the gloss, but these sentences don't seem to be intended 

2090 # as glosses. Skip them. 

2091 indexed_subglosses = list( 

2092 (i, gl) 

2093 for i, gl in enumerate(subglosses) 

2094 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2095 ) 
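# e.g. a trailing line like '(poetic) "Strange sights did I see."' 

# is dropped here, since it is a quoted sentence rather than a gloss. 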

2096 

2097 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2097 ↛ 2098line 2097 didn't jump to line 2098 because the condition on line 2097 was never true

2098 gl = indexed_subglosses[0][1].strip() 

2099 if gl.endswith(":"): 

2100 gl = gl[:-1].strip() 

2101 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2102 if parsed is not None: 

2103 infl_tags, infl_dts = parsed 

2104 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2105 # Interpret others as a particular form under 

2106 # "inflection of" 

2107 data_extend(sense_base, "tags", infl_tags) 

2108 data_extend(sense_base, "form_of", infl_dts) 

2109 indexed_subglosses = indexed_subglosses[1:] 

2110 elif not infl_dts: 

2111 data_extend(sense_base, "tags", infl_tags) 

2112 indexed_subglosses = indexed_subglosses[1:] 

2113 

2114 # Create senses for remaining subglosses 

2115 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2116 gloss = gloss.strip() 

2117 if not gloss and len(indexed_subglosses) > 1: 2117 ↛ 2118line 2117 didn't jump to line 2118 because the condition on line 2117 was never true

2118 continue 

2119 # Push a new sense (if the last one is not empty) 

2120 if push_sense(): 2120 ↛ 2121line 2120 didn't jump to line 2121 because the condition on line 2120 was never true

2121 added = True 

2122 # if gloss not in sense_data.get("raw_glosses", ()): 

2123 # data_append(sense_data, "raw_glosses", gloss) 

2124 if i == 0 and examples: 

2125 # In a multi-line gloss, associate examples 

2126 # with only one of them. 

2127 # XXX or you could use gloss_i == len(indexed_subglosses) 

2128 # to associate examples with the *last* one. 

2129 data_extend(sense_data, "examples", examples) 

2130 if gloss.startswith("; ") and gloss_i > 0: 2130 ↛ 2131line 2130 didn't jump to line 2131 because the condition on line 2130 was never true

2131 gloss = gloss[1:].strip() 

2132 # If the gloss starts with †, mark as obsolete 

2133 if gloss.startswith("^†"): 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true

2134 data_append(sense_data, "tags", "obsolete") 

2135 gloss = gloss[2:].strip() 

2136 elif gloss.startswith("^‡"): 2136 ↛ 2137line 2136 didn't jump to line 2137 because the condition on line 2136 was never true

2137 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2138 gloss = gloss[2:].strip() 

2139 # Copy data for all senses to this sense 

2140 for k, v in sense_base.items(): 

2141 if isinstance(v, (list, tuple)): 

2142 if k != "tags": 

2143 # Tags handled below (countable/uncountable special) 

2144 data_extend(sense_data, k, v) 

2145 else: 

2146 assert k not in ("tags", "categories", "topics") 

2147 sense_data[k] = v # type:ignore[literal-required] 

2148 # Parse the gloss for this particular sense 

2149 m = QUALIFIERS_RE.match(gloss) 

2150 # (...): ... or (...(...)...): ... 

2151 if m: 

2152 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2153 gloss = gloss[m.end() :].strip() 

2154 

2155 # Remove common suffix "[from 14th c.]" and similar 

2156 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2157 

2158 # Check to make sure we don't have unhandled list items in gloss 

2159 ofs = max(gloss.find("#"), gloss.find("* ")) 

2160 if ofs > 10 and "(#)" not in gloss: 2160 ↛ 2161line 2160 didn't jump to line 2161 because the condition on line 2160 was never true

2161 wxr.wtp.debug( 

2162 "gloss may contain unhandled list items: {}".format(gloss), 

2163 sortid="page/1412", 

2164 ) 

2165 elif "\n" in gloss: 2165 ↛ 2166line 2165 didn't jump to line 2166 because the condition on line 2165 was never true

2166 wxr.wtp.debug( 

2167 "gloss contains newline: {}".format(gloss), 

2168 sortid="page/1416", 

2169 ) 

2170 

2171 # Kludge, some glosses have a comma after initial qualifiers in 

2172 # parentheses 

2173 if gloss.startswith((",", ":")): 2173 ↛ 2174line 2173 didn't jump to line 2174 because the condition on line 2173 was never true

2174 gloss = gloss[1:] 

2175 gloss = gloss.strip() 

2176 if gloss.endswith(":"): 2176 ↛ 2177line 2176 didn't jump to line 2177 because the condition on line 2176 was never true

2177 gloss = gloss[:-1].strip() 

2178 if gloss.startswith("N. of "): 2178 ↛ 2179line 2178 didn't jump to line 2179 because the condition on line 2178 was never true

2179 gloss = "Name of " + gloss[6:] 

2180 if gloss.startswith("†"): 2180 ↛ 2181line 2180 didn't jump to line 2181 because the condition on line 2180 was never true

2181 data_append(sense_data, "tags", "obsolete") 

2182 gloss = gloss[1:] 

2183 elif gloss.startswith("^†"): 2183 ↛ 2184line 2183 didn't jump to line 2184 because the condition on line 2183 was never true

2184 data_append(sense_data, "tags", "obsolete") 

2185 gloss = gloss[2:] 

2186 

2187 # Copy tags from sense_base if any. This will not copy 

2188 # countable/uncountable if either was specified in the sense, 

2189 # as sometimes both are specified in word head but only one 

2190 # in individual senses. 

2191 countability_tags = [] 

2192 base_tags = sense_base.get("tags", ()) 

2193 sense_tags = sense_data.get("tags", ()) 

2194 for tag in base_tags: 

2195 if tag in ("countable", "uncountable"): 

2196 if tag not in countability_tags: 2196 ↛ 2198line 2196 didn't jump to line 2198 because the condition on line 2196 was always true

2197 countability_tags.append(tag) 

2198 continue 

2199 if tag not in sense_tags: 

2200 data_append(sense_data, "tags", tag) 

2201 if countability_tags: 

2202 if ( 2202 ↛ 2211line 2202 didn't jump to line 2211 because the condition on line 2202 was always true

2203 "countable" not in sense_tags 

2204 and "uncountable" not in sense_tags 

2205 ): 

2206 data_extend(sense_data, "tags", countability_tags) 

2207 

2208 # If outer gloss specifies a form-of ("inflection of", see 

2209 # aquamarine/German), try to parse the inner glosses as 

2210 # tags for an inflected form. 

2211 if "form-of" in sense_base.get("tags", ()): 

2212 parsed = parse_alt_or_inflection_of( 

2213 wxr, gloss, gloss_template_args 

2214 ) 

2215 if parsed is not None: 2215 ↛ 2221line 2215 didn't jump to line 2221 because the condition on line 2215 was always true

2216 infl_tags, infl_dts = parsed 

2217 if not infl_dts and infl_tags: 2217 ↛ 2221line 2217 didn't jump to line 2221 because the condition on line 2217 was always true

2218 # Interpret as a particular form under "inflection of" 

2219 data_extend(sense_data, "tags", infl_tags) 

2220 

2221 if not gloss: 2221 ↛ 2222line 2221 didn't jump to line 2222 because the condition on line 2221 was never true

2222 data_append(sense_data, "tags", "empty-gloss") 

2223 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 2223 ↛ 2224line 2223 didn't jump to line 2224 because the condition on line 2223 was never true

2224 if ( 

2225 gloss_i == 0 

2226 and len(sense_data.get("glosses", tuple())) >= 1 

2227 ): 

2228 # If we added a "high-level gloss" from rawgloss, but this 

2229 # is that same gloss_i, add this instead of the raw_gloss 

2230 # from before if they're different: the rawgloss was not 

2231 # cleaned exactly the same as this later gloss 

2232 sense_data["glosses"][-1] = gloss 

2233 else: 

2234 # Add the gloss for the sense. 

2235 data_append(sense_data, "glosses", gloss) 

2236 

2237 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2238 # two abbreviations in the same sense, both generated by the 

2239 # {{abbreviation of|...}} template. Handle these with some magic. 

2240 position = 0 

2241 split_glosses = [] 

2242 for m in re.finditer(r"Abbreviation of ", gloss): 2242 ↛ 2243line 2242 didn't jump to line 2243 because the loop on line 2242 never started

2243 if m.start() != position: 

2244 split_glosses.append(gloss[position : m.start()]) 

2245 position = m.start() 

2246 split_glosses.append(gloss[position:]) 
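# e.g. "Abbreviation of foo. Abbreviation of bar." becomes 

# ["Abbreviation of foo. ", "Abbreviation of bar."] (hypothetical words). 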

2247 for gloss in split_glosses: 

2248 # Check if this gloss describes an alt-of or inflection-of 

2249 if ( 

2250 lang_code != "en" 

2251 and " " not in gloss 

2252 and distw([word], gloss) < 0.3 

2253 ): 

2254 # Don't try to parse gloss if it is one word 

2255 # that is close to the word itself for non-English words 

2256 # (probable translations of a tag/form name) 

2257 continue 

2258 parsed = parse_alt_or_inflection_of( 

2259 wxr, gloss, gloss_template_args 

2260 ) 

2261 if parsed is None: 

2262 continue 

2263 tags, dts = parsed 

2264 if not dts and tags: 2264 ↛ 2267line 2264 didn't jump to line 2267 because the condition on line 2264 was always true

2265 data_extend(sense_data, "tags", tags) 

2266 continue 

2267 for dt in dts: # type:ignore[union-attr] 

2268 ftags = list(tag for tag in tags if tag != "form-of") 

2269 if "alt-of" in tags: 

2270 data_extend(sense_data, "tags", ftags) 

2271 data_append(sense_data, "alt_of", dt) 

2272 elif "compound-of" in tags: 

2273 data_extend(sense_data, "tags", ftags) 

2274 data_append(sense_data, "compound_of", dt) 

2275 elif "synonym-of" in tags: 

2276 data_extend(dt, "tags", ftags) 

2277 data_append(sense_data, "synonyms", dt) 

2278 elif tags and dt.get("word", "").startswith("of "): 

2279 dt["word"] = dt["word"][3:] 

2280 data_append(sense_data, "tags", "form-of") 

2281 data_extend(sense_data, "tags", ftags) 

2282 data_append(sense_data, "form_of", dt) 

2283 elif "form-of" in tags: 

2284 data_extend(sense_data, "tags", tags) 

2285 data_append(sense_data, "form_of", dt) 

2286 

2287 if len(sense_data) == 0: 

2288 if len(sense_base.get("tags", [])) == 0: 2288 ↛ 2290line 2288 didn't jump to line 2290 because the condition on line 2288 was always true

2289 del sense_base["tags"] 

2290 sense_data.update(sense_base) 

2291 if push_sense(): 2291 ↛ 2295line 2291 didn't jump to line 2295 because the condition on line 2291 was always true

2292 # push_sense succeeded in adding a sense to pos_data 

2293 added = True 

2294 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2295 return added 

2296 

2297 def parse_inflection( 

2298 node: WikiNode, section: str, pos: Optional[str] 

2299 ) -> None: 

2300 """Parses inflection data (declension, conjugation) from the given 

2301 page. This retrieves the actual inflection template 

2302 parameters, which are very useful for applications that need 

2303 to learn the inflection classes and generate inflected 

2304 forms.""" 

2305 assert isinstance(node, WikiNode) 

2306 assert isinstance(section, str) 

2307 assert pos is None or isinstance(pos, str) 

2308 # print("parse_inflection:", node) 

2309 

2310 if pos is None: 2310 ↛ 2311line 2310 didn't jump to line 2311 because the condition on line 2310 was never true

2311 wxr.wtp.debug( 

2312 "inflection table outside part-of-speech", sortid="page/1812" 

2313 ) 

2314 return 

2315 

2316 def inflection_template_fn( 

2317 name: str, ht: TemplateArgs 

2318 ) -> Optional[str]: 

2319 # print("decl_conj_template_fn", name, ht) 

2320 if is_panel_template(wxr, name): 

2321 return "" 

2322 if name in ("is-u-mutation",): 

2323 # These are not to be captured as an exception to the 

2324 # generic code below 

2325 return None 

2326 m = re.search( 

2327 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2328 r"declension|inflection|mut|mutation)($|-)", 

2329 name, 

2330 ) 

2331 if m: 

2332 args_ht = clean_template_args(wxr, ht) 

2333 dt = {"name": name, "args": args_ht} 

2334 data_append(pos_data, "inflection_templates", dt) 

2335 

2336 return None 

2337 

2338 # Convert the subtree back to Wikitext, then expand all and parse, 

2339 # capturing templates in the process 

2340 text = wxr.wtp.node_to_wikitext(node.children) 

2341 

2342 # Split text into separate sections for each top-level template 

2343 brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] 

2344 template_sections = [] 

2345 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2346 # Because there is the possibility of triple curly braces 

2347 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2348 # count nesting depth using pairs of two brackets, but 

2349 # instead use singular braces ("{ }"). 

2350 # Because template delimiters should be balanced, regardless 

2351 # of whether {{ or {{{ is used, and because we only care 

2352 # about the outer-most delimiters (the highest level template) 

2353 # we can just count the single braces when those single 

2354 # braces are part of a group. 
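# e.g. "{{x-decl|a}} notes {{x-conj|b}}" splits into the sections 

# "{{x-decl|a}} notes " and "{{x-conj|b}}": trailing text sticks to 

# the template before it (hypothetical template names). 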

2355 

2356 # print(f"Parse inflection: {text=}") 

2357 # print(repr(brace_matches)) 

2358 if len(brace_matches) > 1: 2358 ↛ 2359line 2358 didn't jump to line 2359 because the condition on line 2358 was never true

2359 tsection: list[str] = [] 

2360 after_templates = False # kludge to keep any text 

2361 # before first template 

2362 # with the first template; 

2363 # otherwise, text 

2364 # goes with preceding template 

2365 for m in brace_matches: 

2366 if m.startswith("\n; ") and after_templates: 

2367 after_templates = False 

2368 template_sections.append(tsection) 

2369 tsection = [] 

2370 tsection.append(m) 

2371 elif m.startswith("{{"): 

2372 if template_nesting == 0 and after_templates: 

2373 template_sections.append(tsection) 

2374 tsection = [] 

2375 # start new section 

2376 after_templates = True 

2377 template_nesting += len(m) 

2378 tsection.append(m) 

2379 elif m.startswith("}}"): 

2380 template_nesting -= len(m) 

2381 if template_nesting < 0: 

2382 wxr.wtp.error( 

2383 "Negatively nested braces, " 

2384 "couldn't split inflection templates, " 

2385 "{}/{} section {}".format(word, language, section), 

2386 sortid="page/1871", 

2387 ) 

2388 template_sections = [] # use whole text 

2389 break 

2390 tsection.append(m) 

2391 else: 

2392 tsection.append(m) 

2393 if tsection: # dangling tsection 

2394 template_sections.append(tsection) 

2395 # Why do it this way around? The parser has a preference 

2396 # to associate bits outside of tables with the preceding 

2397 # table (`after`-variable), so a new tsection begins 

2398 # at {{ and everything before it belongs to the previous 

2399 # template. 

2400 

2401 texts = [] 

2402 if not template_sections: 2402 ↛ 2405line 2402 didn't jump to line 2405 because the condition on line 2402 was always true

2403 texts = [text] 

2404 else: 

2405 for tsection in template_sections: 

2406 texts.append("".join(tsection)) 

2407 if template_nesting != 0: 2407 ↛ 2408line 2407 didn't jump to line 2408 because the condition on line 2407 was never true

2408 wxr.wtp.error( 

2409 "Template nesting error: " 

2410 "template_nesting = {} " 

2411 "couldn't split inflection templates, " 

2412 "{}/{} section {}".format( 

2413 template_nesting, word, language, section 

2414 ), 

2415 sortid="page/1896", 

2416 ) 

2417 texts = [text] 

2418 for text in texts: 

2419 tree = wxr.wtp.parse( 

2420 text, expand_all=True, template_fn=inflection_template_fn 

2421 ) 

2422 

2423 # Parse inflection tables from the section. The data is stored 

2424 # under "forms". 

2425 if wxr.config.capture_inflections: 2425 ↛ 2418line 2425 didn't jump to line 2418 because the condition on line 2425 was always true

2426 tablecontext = None 

2427 m = re.search(r"{{([^}{|]+)\|?", text) 

2428 if m: 2428 ↛ 2429line 2428 didn't jump to line 2429 because the condition on line 2428 was never true

2429 template_name = m.group(1) 

2430 tablecontext = TableContext(template_name) 

2431 

2432 parse_inflection_section( 

2433 wxr, 

2434 pos_data, 

2435 word, 

2436 language, 

2437 pos, 

2438 section, 

2439 tree, 

2440 tablecontext=tablecontext, 

2441 ) 

2442 

2443 def get_subpage_section( 

2444 title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]] 

2445 ) -> Optional[Union[WikiNode, str]]: 

2446 """Loads a subpage of the given page, and finds the section 

2447 for the given language, part-of-speech, and section title. This 

2448 is used for finding translations and other sections on subpages.""" 

2449 assert isinstance(language, str) 

2450 assert isinstance(title, str) 

2451 assert isinstance(subtitle, str) 

2452 assert isinstance(seq, (list, tuple)) 

2453 for x in seq: 

2454 assert isinstance(x, str) 

2455 subpage_title = word + "/" + subtitle 

2456 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2457 if subpage_content is None: 

2458 wxr.wtp.error( 

2459 "/translations not found despite " 

2460 "{{see translation subpage|...}}", 

2461 sortid="page/1934", 

2462 ) 

2463 return None 

2464 

2465 def recurse( 

2466 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2467 ) -> Optional[Union[str, WikiNode]]: 

2468 # print(f"seq: {seq}") 

2469 if not seq: 

2470 return node 

2471 if not isinstance(node, WikiNode): 

2472 return None 

2473 # print(f"node.kind: {node.kind}") 

2474 if node.kind in LEVEL_KINDS: 

2475 t = clean_node(wxr, None, node.largs[0]) 

2476 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2477 if t.lower() == seq[0].lower(): 

2478 seq = seq[1:] 

2479 if not seq: 

2480 return node 

2481 for n in node.children: 

2482 ret = recurse(n, seq) 

2483 if ret is not None: 

2484 return ret 

2485 return None 

2486 

2487 tree = wxr.wtp.parse( 

2488 subpage_content, 

2489 pre_expand=True, 

2490 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2491 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2492 ) 

2493 assert tree.kind == NodeKind.ROOT 

2494 ret = recurse(tree, seq) 

2495 if ret is None: 

2496 wxr.wtp.debug( 

2497 "Failed to find subpage section {}/{} seq {}".format( 

2498 title, subtitle, seq 

2499 ), 

2500 sortid="page/1963", 

2501 ) 

2502 return ret 

2503 

2504 def parse_linkage( 

2505 data: WordData, field: str, linkagenode: WikiNode 

2506 ) -> None: 

2507 assert isinstance(data, dict) 

2508 assert isinstance(field, str) 

2509 assert isinstance(linkagenode, WikiNode) 

2510 # if field == "synonyms": 

2511 # print("field", field) 

2512 # print("data", data) 

2513 # print("children:") 

2514 # print(linkagenode.children) 

2515 if not wxr.config.capture_linkages: 2515 ↛ 2516line 2515 didn't jump to line 2516 because the condition on line 2515 was never true

2516 return 

2517 have_panel_template = False 

2518 toplevel_text = [] 

2519 next_navframe_sense = None # Used for "(sense):" before NavFrame 

2520 

2521 def parse_linkage_item( 

2522 contents: list[Union[str, WikiNode]], 

2523 field: str, 

2524 sense: Optional[str] = None, 

2525 ): 

2526 assert isinstance(contents, (list, tuple)) 

2527 assert isinstance(field, str) 

2528 assert sense is None or isinstance(sense, str) 

2529 

2530 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

2531 # .format(field, sense, contents)) 

2532 

2533 parts: list[str] = [] 

2534 ruby: list[tuple[str, str]] = [] 

2535 urls: list[str] = [] 

2536 # data about link text; this is used to skip splitting on 

2537 # linkage text items that contain stuff like commas; for 

2538 # example "Hunde, die bellen, beißen nicht" in article 

2539 # beißen would otherwise be split into "Hunde", "die bellen", etc. 

2540 # We take that link text and use it, eventually, 

2541 # in split_at_comma_semi to skip splitting on those 

2542 # commas. 

2543 links_that_should_not_be_split: list[str] = [] 

2544 

2545 def item_recurse( 

2546 contents: list[Union[str, WikiNode]], italic=False 

2547 ) -> None: 

2548 assert isinstance(contents, (list, tuple)) 

2549 nonlocal sense 

2550 nonlocal ruby 

2551 nonlocal parts 

2552 # print("ITEM_RECURSE:", contents) 

2553 for node in contents: 

2554 if isinstance(node, str): 2554 ↛ 2557line 2554 didn't jump to line 2557 because the condition on line 2554 was always true

2555 parts.append(node) 

2556 continue 

2557 kind = node.kind 

2558 # print("ITEM_RECURSE KIND:", kind, 

2559 # node.sarg if node.sarg else node.largs) 

2560 if kind == NodeKind.LIST: 

2561 if parts: 

2562 sense1: Optional[str] 

2563 sense1 = clean_node(wxr, None, parts) 

2564 if sense1.endswith(":"): 

2565 sense1 = sense1[:-1].strip() 

2566 if sense1.startswith("(") and sense1.endswith(")"): 

2567 sense1 = sense1[1:-1].strip() 

2568 if sense1.lower() == TRANSLATIONS_TITLE: 

2569 sense1 = None 

2570 # print("linkage item_recurse LIST sense1:", sense1) 

2571 parse_linkage_recurse( 

2572 node.children, field, sense=sense1 or sense 

2573 ) 

2574 parts = [] 

2575 else: 

2576 parse_linkage_recurse(node.children, field, sense) 

2577 elif kind in ( 

2578 NodeKind.TABLE, 

2579 NodeKind.TABLE_ROW, 

2580 NodeKind.TABLE_CELL, 

2581 ): 

2582 parse_linkage_recurse(node.children, field, sense) 

2583 elif kind in ( 

2584 NodeKind.TABLE_HEADER_CELL, 

2585 NodeKind.TABLE_CAPTION, 

2586 ): 

2587 continue 

2588 elif kind == NodeKind.HTML: 

2589 classes = (node.attrs.get("class") or "").split() 

2590 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2591 continue 

2592 elif node.sarg == "ruby": 

2593 rb = parse_ruby(wxr, node) 

2594 if rb: 

2595 ruby.append(rb) 

2596 parts.append(rb[0]) 

2597 continue 

2598 elif node.sarg == "math": 

2599 parts.append(clean_node(wxr, None, node)) 

2600 continue 

2601 elif "interProject" in classes: 

2602 continue # These do not seem to be displayed 

2603 if "NavFrame" in classes: 

2604 parse_linkage_recurse(node.children, field, sense) 

2605 else: 

2606 item_recurse(node.children, italic=italic) 

2607 elif kind == NodeKind.ITALIC: 

2608 item_recurse(node.children, italic=True) 

2609 elif kind == NodeKind.LINK: 

2610 ignore = False 

2611 if isinstance(node.largs[0][0], str): 

2612 v1 = node.largs[0][0].strip().lower() 

2613 if v1.startswith( 

2614 ns_title_prefix_tuple(wxr, "Category", True) 

2615 + ns_title_prefix_tuple(wxr, "File", True) 

2616 ): 

2617 ignore = True 

2618 if not ignore: 

2619 v = node.largs[-1] 

2620 if ( 

2621 len(node.largs) == 1 

2622 and len(v) > 0 

2623 and isinstance(v[0], str) 

2624 and v[0][0] == ":" 

2625 ): 

2626 v = [v[0][1:]] + list(v[1:]) # type:ignore 

2627 if isinstance(v[0], str) and not v[0].isalnum(): 

2628 links_that_should_not_be_split.append( 

2629 "".join(v[0]) 

2630 ) # type: ignore 

2631 item_recurse(v, italic=italic) 

2632 elif kind == NodeKind.URL: 

2633 if len(node.largs) < 2 and node.largs: 

2634 # Naked url captured 

2635 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

2636 continue 

2637 if len(node.largs) == 2: 

2638 # Url from link with text 

2639 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

2640 # print(f"{node.largs=!r}") 

2641 # print("linkage recurse URL {}".format(node)) 

2642 item_recurse(node.largs[-1], italic=italic) 

2643 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 

2644 item_recurse(node.children, italic=italic) 

2645 else: 

2646 wxr.wtp.debug( 

2647 "linkage item_recurse unhandled {}: {}".format( 

2648 node.kind, node 

2649 ), 

2650 sortid="page/2073", 

2651 ) 

2652 

2653 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

2654 # .format(contents)) 

2655 

2656 item_recurse(contents) 

2657 item = clean_node(wxr, None, parts) 

2658 # print("LINKAGE ITEM CONTENTS:", parts) 

2659 # print("CLEANED ITEM: {!r}".format(item)) 

2660 # print(f"URLS {urls=!r}") 

2661 

2662 return parse_linkage_item_text( 

2663 wxr, 

2664 word, 

2665 data, 

2666 field, 

2667 item, 

2668 sense, 

2669 ruby, 

2670 pos_datas, 

2671 is_reconstruction, 

2672 urls or None, 

2673 links_that_should_not_be_split or None, 

2674 ) 

2675 

2676 def parse_linkage_recurse( 

2677 contents: list[Union[WikiNode, str]], 

2678 field: str, 

2679 sense: Optional[str], 

2680 ) -> None: 

2681 assert isinstance(contents, (list, tuple)) 

2682 assert sense is None or isinstance(sense, str) 

2683 nonlocal next_navframe_sense 

2684 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

2685 for node in contents: 

2686 if isinstance(node, str): 

2687 # Ignore top-level text, generally comments before the 

2688 # linkages list. However, if no linkages are found, then 

2689 # use this for linkages (not all words use bullet points 

2690 # for linkages). 

2691 toplevel_text.append(node) 

2692 continue 

2693 assert isinstance(node, WikiNode) 

2694 kind = node.kind 

2695 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

2696 if kind == NodeKind.LIST: 

2697 parse_linkage_recurse(node.children, field, sense) 

2698 elif kind == NodeKind.LIST_ITEM: 2698 ↛ 2705line 2698 didn't jump to line 2705 because the condition on line 2698 was always true

2699 v = parse_linkage_item(node.children, field, sense) 

2700 if v: 2700 ↛ 2704line 2700 didn't jump to line 2704 because the condition on line 2700 was never true

2701 # parse_linkage_item() can return a value that should 

2702 # be used as the sense for the follow-on linkages, 

2703 # which are typically provided in a table (see 滿) 

2704 next_navframe_sense = v 

2705 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

2706 parse_linkage_recurse(node.children, field, sense) 

2707 elif kind == NodeKind.TABLE_CELL: 

2708 parse_linkage_item(node.children, field, sense) 

2709 elif kind in ( 

2710 NodeKind.TABLE_CAPTION, 

2711 NodeKind.TABLE_HEADER_CELL, 

2712 NodeKind.PREFORMATTED, 

2713 NodeKind.BOLD, 

2714 ): 

2715 continue 

2716 elif kind == NodeKind.HTML: 

2717 # Recurse to process inside the HTML for most tags 

2718 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2719 continue 

2720 classes = (node.attrs.get("class") or "").split() 

2721 if node.sarg == "li": 

2722 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑ 

2723 v = parse_linkage_item(node.children, field, sense) 

2724 if v: 

2725 next_navframe_sense = v 

2726 elif "qualifier-content" in classes: 

2727 sense1 = clean_node(wxr, None, node.children) 

2728 if sense1.endswith(":"): 

2729 sense1 = sense1[:-1].strip() 

2730 if sense and sense1: 

2731 wxr.wtp.debug( 

2732 "linkage qualifier-content on multiple " 

2733 "levels: {!r} and {!r}".format(sense, sense1), 

2734 sortid="page/2170", 

2735 ) 

2736 parse_linkage_recurse(node.children, field, sense1) 

2737 elif "NavFrame" in classes: 

2738 # NavFrame uses previously assigned next_navframe_sense 

2739 # (from a "(sense):" item) and clears it afterwards 

2740 parse_linkage_recurse( 

2741 node.children, field, sense or next_navframe_sense 

2742 ) 

2743 next_navframe_sense = None 

2744 else: 

2745 parse_linkage_recurse(node.children, field, sense) 

2746 elif kind in LEVEL_KINDS: 

2747 # Just recurse to any possible subsections 

2748 parse_linkage_recurse(node.children, field, sense) 

2749 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

2750 # Skip these on top level; at least sometimes bold is 

2751 # used for indicating a subtitle 

2752 continue 

2753 elif kind == NodeKind.LINK: 

2754 # Recurse into the last argument 

2755 # Apparently ":/" is used as a link to "/", so strip 

2756 # initial value 

2757 parse_linkage_recurse(node.largs[-1], field, sense) 

2758 else: 

2759 wxr.wtp.debug( 

2760 "parse_linkage_recurse unhandled {}: {}".format( 

2761 kind, node 

2762 ), 

2763 sortid="page/2196", 

2764 ) 

2765 

2766 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

2767 nonlocal have_panel_template 

2768 if is_panel_template(wxr, name): 

2769 have_panel_template = True 

2770 return "" 

2771 return None 

2772 

2773 def parse_zh_synonyms( 

2774 parsed: list[Union[WikiNode, str]], 

2775 data: list[LinkageData], 

2776 hdrs: list[str], 

2777 root_word: str, 

2778 ) -> None: 

2779 """Parses Chinese dialectal synonyms tables""" 

2780 for item in parsed: 

2781 if isinstance(item, WikiNode): 

2782 if item.kind == NodeKind.TABLE_ROW: 

2783 cleaned = clean_node(wxr, None, item.children) 

2784 # print("cleaned:", repr(cleaned)) 

2785 if any( 

2786 [ 

2787 "Variety" in cleaned, 

2788 "Location" in cleaned, 

2789 "Words" in cleaned, 

2790 ] 

2791 ): 

2792 pass 

2793 else: 

2794 split = cleaned.split("\n") 

2795 new_hdrs = split[:-1] 

2796 if len(new_hdrs) == 2: 

2797 hdrs = [new_hdrs[0]] 

2798 new_hdrs.pop(0) 

2799 combined_hdrs = [x.strip() for x in hdrs + new_hdrs] 

2800 tags = [] 

2801 words = split[-1].split(",") 

2802 for hdr in combined_hdrs: 

2803 hdr = hdr.replace("(", ",") 

2804 hdr = hdr.replace(")", "") 

2805 hdr = hdr.replace("N.", "Northern,") 

2806 hdr = hdr.replace("S.", "Southern,") 

2807 new = hdr.split(",") 

2808 for tag in sorted(new): 

2809 tag = tag.strip() 

2810 tag = tag.replace(" ", "-") 

2811 if tag in valid_tags: 

2812 tags.append(tag) 

2813 else: 

2814 if tag in zh_tag_lookup: 

2815 tags.extend(zh_tag_lookup[tag]) 

2816 else: 

2817 print( 

2818 f"MISSING ZH SYNONYM TAG for " 

2819 f"root {root_word}, word " 

2820 f"{words}: {tag}" 

2821 ) 

2822 sys.stdout.flush() 

2823 

2824 for word in words: 

2825 data.append( 

2826 {"word": word.strip(), "tags": tags} 

2827 ) 

2828 elif item.kind == NodeKind.HTML: 

2829 cleaned = clean_node(wxr, None, item.children) 

2830 if "Synonyms of" in cleaned: 

2831 cleaned = cleaned.replace("Synonyms of ", "") 

2832 root_word = cleaned 

2833 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2834 else: 

2835 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2836 

2837 def parse_zh_synonyms_list( 

2838 parsed: list[Union[WikiNode, str]], 

2839 data: list[LinkageData], 

2840 hdrs: list[str], 

2841 root_word: str, 

2842 ) -> None: 

2843 """Parses Chinese dialectal synonyms tables (list format)""" 

2844 for item in parsed: 

2845 if isinstance(item, WikiNode): 

2846 if item.kind == NodeKind.LIST_ITEM: 

2847 cleaned = clean_node(wxr, None, item.children) 

2848 # print("cleaned:", repr(cleaned)) 

2849 if any( 

2850 [ 

2851 "Variety" in cleaned, 

2852 "Location" in cleaned, 

2853 "Words" in cleaned, 

2854 ] 

2855 ): 

2856 pass 

2857 else: 

2858 cleaned = cleaned.replace("(", ",") 

2859 cleaned = cleaned.replace(")", "") 

2860 split = cleaned.split(",") 

2861 # skip empty words / titles 

2862 if split[0] == "": 

2863 continue 

2864 words = split[0].split("/") 

2865 new_hdrs = [x.strip() for x in split[1:]] 

2866 tags = [] 

2867 roman = None 

2868 for tag in sorted(new_hdrs): 

2869 if tag in valid_tags: 

2870 tags.append(tag) 

2871 elif tag in zh_tag_lookup: 

2872 tags.extend(zh_tag_lookup[tag]) 

2873 elif ( 

2874 classify_desc(tag) == "romanization" 

2875 and roman is None 

2876 ): 

2877 roman = tag 

2878 else: 

2879 print( 

2880 f"MISSING ZH SYNONYM TAG " 

2881 f"(possibly pinyin) - root " 

2882 f"{root_word}, word {words}: {tag}" 

2883 ) 

2884 sys.stdout.flush() 

2885 

2886 for word in words: 

2887 dt: LinkageData = {"word": word.strip()} 

2888 if tags: 

2889 dt["tags"] = tags 

2890 if roman is not None: 

2891 dt["roman"] = roman 

2892 data.append(dt) 

2893 elif item.kind == NodeKind.HTML: 

2894 cleaned = clean_node(wxr, None, item.children) 

2895 if cleaned.find("Synonyms of") >= 0: 

2896 cleaned = cleaned.replace("Synonyms of ", "") 

2897 root_word = cleaned 

2898 parse_zh_synonyms_list( 

2899 item.children, data, hdrs, root_word 

2900 ) 

2901 else: 

2902 parse_zh_synonyms_list( 

2903 item.children, data, hdrs, root_word 

2904 ) 

2905 

2906 def contains_kind( 

2907 children: list[Union[WikiNode, str]], nodekind: NodeKind 

2908 ) -> bool: 

2909 assert isinstance(children, list) 

2910 for item in children: 

2911 if not isinstance(item, WikiNode): 

2912 continue 

2913 if item.kind == nodekind: 

2914 return True 

2915 elif contains_kind(item.children, nodekind): 

2916 return True 

2917 return False 

2918 

2919 # Main body of parse_linkage() 

2920 text = wxr.wtp.node_to_wikitext(linkagenode.children) 

2921 parsed = wxr.wtp.parse( 

2922 text, expand_all=True, template_fn=linkage_template_fn1 

2923 ) 

2924 if field == "synonyms" and lang_code == "zh": 2924 ↛ 2925line 2924 didn't jump to line 2925 because the condition on line 2924 was never true

2925 synonyms: list[LinkageData] = [] 

2926 if contains_kind(parsed.children, NodeKind.LIST): 

2927 parse_zh_synonyms_list(parsed.children, synonyms, [], "") 

2928 else: 

2929 parse_zh_synonyms(parsed.children, synonyms, [], "") 

2930 # print(json.dumps(synonyms, indent=4, ensure_ascii=False)) 

2931 data_extend(data, "synonyms", synonyms) 

2932 parse_linkage_recurse(parsed.children, field, None) 

2933 if not data.get(field) and not have_panel_template: 2933 ↛ 2934line 2933 didn't jump to line 2934 because the condition on line 2933 was never true

2934 text = "".join(toplevel_text).strip() 

2935 if "\n" not in text and "," in text and text.count(",") > 3: 

2936 if not text.startswith("See "): 

2937 parse_linkage_item([text], field, None) 

2938 

2939 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

2940 """Parses translations for a word. This may also pull in translations 

2941 from separate translation subpages.""" 

2942 assert isinstance(data, dict) 

2943 assert isinstance(xlatnode, WikiNode) 

2944 # print("===== PARSE_TRANSLATIONS {} {} {}" 

2945 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

2946 # print("parse_translations xlatnode={}".format(xlatnode)) 

2947 if not wxr.config.capture_translations: 2947 ↛ 2948line 2947 didn't jump to line 2948 because the condition on line 2947 was never true

2948 return 

2949 sense_parts: list[Union[WikiNode, str]] = [] 

2950 sense: Optional[str] = None 

2951 

2952 def parse_translation_item( 

2953 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

2954 ) -> None: 

2955 nonlocal sense 

2956 assert isinstance(contents, list) 

2957 assert lang is None or isinstance(lang, str) 

2958 # print("PARSE_TRANSLATION_ITEM:", contents) 

2959 

2960 langcode: Optional[str] = None 

2961 if sense is None: 

2962 sense = clean_node(wxr, data, sense_parts).strip() 

2963 # print("sense <- clean_node: ", sense) 

2964 idx = sense.find("See also translations at") 

2965 if idx > 0: 2965 ↛ 2966line 2965 didn't jump to line 2966 because the condition on line 2965 was never true

2966 wxr.wtp.debug( 

2967 "Skipping translation see also: {}".format(sense), 

2968 sortid="page/2361", 

2969 ) 

2970 sense = sense[:idx].strip() 

2971 if sense.endswith(":"): 2971 ↛ 2972line 2971 didn't jump to line 2972 because the condition on line 2971 was never true

2972 sense = sense[:-1].strip() 

2973 if sense.endswith("—"): 2973 ↛ 2974line 2973 didn't jump to line 2974 because the condition on line 2973 was never true

2974 sense = sense[:-1].strip() 

2975 translations_from_template: list[str] = [] 

2976 

2977 def translation_item_template_fn( 

2978 name: str, ht: TemplateArgs 

2979 ) -> Optional[str]: 

2980 nonlocal langcode 

2981 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

2982 if is_panel_template(wxr, name): 

2983 return "" 

2984 if name in ("t+check", "t-check", "t-needed"): 

2985 # We ignore these templates. They seem to have outright 

2986 # garbage in some entries, and widely varying formatting in 

2987 # others. These should be transitory and unreliable 

2988 # anyway. 

2989 return "__IGNORE__" 

2990 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

2991 code = ht.get(1) 

2992 if code: 

2993 if langcode and code != langcode: 

2994 wxr.wtp.debug( 

2995 "inconsistent language codes {} vs " 

2996 "{} in translation item: {!r} {}".format( 

2997 langcode, code, name, ht 

2998 ), 

2999 sortid="page/2386", 

3000 ) 

3001 langcode = code 

3002 tr = ht.get(2) 

3003 if tr: 

3004 tr = clean_node(wxr, None, [tr]) 

3005 translations_from_template.append(tr) 

3006 return None 

3007 if name == "t-egy": 

3008 langcode = "egy" 

3009 return None 

3010 if name == "ttbc": 

3011 code = ht.get(1) 

3012 if code: 

3013 langcode = code 

3014 return None 

3015 if name == "trans-see": 

3016 wxr.wtp.error( 

3017 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

3018 ) 

3019 return "" 

3020 if name.endswith("-top"): 

3021 return "" 

3022 if name.endswith("-bottom"): 

3023 return "" 

3024 if name.endswith("-mid"): 

3025 return "" 

3026 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

3027 # .format(name), 

3028 # sortid="page/2414") 

3029 return None 

3030 

3031 sublists = list( 

3032 x 

3033 for x in contents 

3034 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

3035 ) 

3036 contents = list( 

3037 x 

3038 for x in contents 

3039 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

3040 ) 

3041 

3042 item = clean_node( 

3043 wxr, data, contents, template_fn=translation_item_template_fn 

3044 ) 

3045 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

3046 

3047 # Parse the translation item. 

3048 if item: 3048 ↛ exitline 3048 didn't return from function 'parse_translation_item' because the condition on line 3048 was always true

3049 lang = parse_translation_item_text( 

3050 wxr, 

3051 word, 

3052 data, 

3053 item, 

3054 sense, 

3055 lang, 

3056 langcode, 

3057 translations_from_template, 

3058 is_reconstruction, 

3059 ) 

3060 

3061 # Handle sublists. They are frequently used for different 

3062 # scripts for the language and different variants of the 

3063 # language. We will include the lower-level header as a 

3064 # tag in those cases. 

3065 for listnode in sublists: 3065 ↛ 3066line 3065 didn't jump to line 3066 because the loop on line 3065 never started

3066 assert listnode.kind == NodeKind.LIST 

3067 for node in listnode.children: 

3068 if not isinstance(node, WikiNode): 

3069 continue 

3070 if node.kind == NodeKind.LIST_ITEM: 

3071 parse_translation_item(node.children, lang=lang) 

3072 

3073 def parse_translation_template(node: WikiNode) -> None: 

3074 assert isinstance(node, WikiNode) 

3075 

3076 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3077 nonlocal sense_parts 

3078 nonlocal sense 

3079 if is_panel_template(wxr, name): 

3080 return "" 

3081 if name == "see also": 

3082 # XXX capture 

3083 # XXX for example, "/" has top-level list containing 

3084 # see also items. So also should parse those. 

3085 return "" 

3086 if name == "trans-see": 

3087 # XXX capture 

3088 return "" 

3089 if name == "see translation subpage": 

3090 sense_parts = [] 

3091 sense = None 

3092 sub = ht.get(1, "") 

3093 if sub: 

3094 m = re.match( 

3095 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

3096 ) 

3097 else: 

3098 m = None 

3099 etym = "" 

3100 etym_numbered = "" 

3101 pos = "" 

3102 if m: 

3103 etym_numbered = m.group(1) 

3104 etym = m.group(2) 

3105 pos = m.group(3) 
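# e.g. sub == "Etymology 2: Noun" gives etym_numbered "Etymology 2", 

# etym "Etymology " and pos "Noun". 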

3106 if not sub: 

3107 wxr.wtp.debug( 

3108 "no part-of-speech in " 

3109 "{{see translation subpage|...}}, " 

3110 "defaulting to just wxr.wtp.section " 

3111 "(= language)", 

3112 sortid="page/2468", 

3113 ) 

3114 # seq sent to get_subpage_section without sub and pos 

3115 seq = [ 

3116 language, 

3117 TRANSLATIONS_TITLE, 

3118 ] 

3119 elif ( 

3120 m 

3121 and etym.lower().strip() in ETYMOLOGY_TITLES 

3122 and pos.lower() in POS_TITLES 

3123 ): 

3124 seq = [ 

3125 language, 

3126 etym_numbered, 

3127 pos, 

3128 TRANSLATIONS_TITLE, 

3129 ] 

3130 elif sub.lower() in POS_TITLES: 

3131 # seq with sub but not pos 

3132 seq = [ 

3133 language, 

3134 sub, 

3135 TRANSLATIONS_TITLE, 

3136 ] 

3137 else: 

3138 # seq with sub and pos 

3139 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3140 if pos.lower() not in POS_TITLES: 

3141 wxr.wtp.debug( 

3142 "unhandled see translation subpage: " 

3143 "language={} sub={} " 

3144 "wxr.wtp.subsection={}".format( 

3145 language, sub, wxr.wtp.subsection 

3146 ), 

3147 sortid="page/2478", 

3148 ) 

3149 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

3150 subnode = get_subpage_section( 

3151 wxr.wtp.title or "MISSING_TITLE", 

3152 TRANSLATIONS_TITLE, 

3153 seq, 

3154 ) 

3155 if subnode is not None and isinstance(subnode, WikiNode): 

3156 parse_translations(data, subnode) 

3157 else: 

3158 # Failed to find the normal subpage section 

3159 seq = [TRANSLATIONS_TITLE] 

3160 subnode = get_subpage_section( 

3161 wxr.wtp.title or "MISSING_TITLE", 

3162 TRANSLATIONS_TITLE, 

3163 seq, 

3164 ) 

3165 if subnode is not None and isinstance( 

3166 subnode, WikiNode 

3167 ): 

3168 parse_translations(data, subnode) 

3169 return "" 

3170 if name in ( 

3171 "c", 

3172 "C", 

3173 "categorize", 

3174 "cat", 

3175 "catlangname", 

3176 "topics", 

3177 "top", 

3178 "qualifier", 

3179 "cln", 

3180 ): 

3181 # These are expanded in the default way 

3182 return None 

3183 if name in ("trans-top",): 

3184 # XXX capture id from trans-top? Capture sense here 

3185 # instead of trying to parse it from expanded content? 

3186 if ht.get(1): 

3187 sense_parts = [] 

3188 sense = ht.get(1) 

3189 else: 

3190 sense_parts = [] 

3191 sense = None 

3192 return None 

3193 if name in ( 

3194 "trans-bottom", 

3195 "trans-mid", 

3196 "checktrans-mid", 

3197 "checktrans-bottom", 

3198 ): 

3199 return None 

3200 if name == "checktrans-top": 

3201 sense_parts = [] 

3202 sense = None 

3203 return "" 

3204 if name == "trans-top-also": 

3205 # XXX capture? 

3206 sense_parts = [] 

3207 sense = None 

3208 return "" 

3209 wxr.wtp.error( 

3210 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

3211 name, ht 

3212 ), 

3213 sortid="page/2517", 

3214 ) 

3215 return "" 

3216 

3217 wxr.wtp.expand( 

3218 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

3219 ) 

3220 

3221 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

3222 nonlocal sense 

3223 nonlocal sense_parts 

3224 for node in xlatnode.children: 

3225 # print(node) 

3226 if isinstance(node, str): 

3227 if sense: 3227 ↛ 3228line 3227 didn't jump to line 3228 because the condition on line 3227 was never true

3228 if not node.isspace(): 

3229 wxr.wtp.debug( 

3230 "skipping string in the middle of " 

3231 "translations: {}".format(node), 

3232 sortid="page/2530", 

3233 ) 

3234 continue 

3235 # Add a part to the sense 

3236 sense_parts.append(node) 

3237 sense = None 

3238 continue 

3239 assert isinstance(node, WikiNode) 

3240 kind = node.kind 

3241 if kind == NodeKind.LIST: 3241 ↛ 3250line 3241 didn't jump to line 3250 because the condition on line 3241 was always true

3242 for item in node.children: 

3243 if not isinstance(item, WikiNode): 3243 ↛ 3244line 3243 didn't jump to line 3244 because the condition on line 3243 was never true

3244 continue 

3245 if item.kind != NodeKind.LIST_ITEM: 3245 ↛ 3246line 3245 didn't jump to line 3246 because the condition on line 3245 was never true

3246 continue 

3247 if item.sarg == ":": 3247 ↛ 3248line 3247 didn't jump to line 3248 because the condition on line 3247 was never true

3248 continue 

3249 parse_translation_item(item.children) 

3250 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 

3251 # Silently skip list items that are just indented; these 

3252 # are used for text between translations, such as indicating 

3253 # translations that need to be checked. 

3254 pass 

3255 elif kind == NodeKind.TEMPLATE: 

3256 parse_translation_template(node) 

3257 elif kind in ( 

3258 NodeKind.TABLE, 

3259 NodeKind.TABLE_ROW, 

3260 NodeKind.TABLE_CELL, 

3261 ): 

3262 parse_translation_recurse(node) 

3263 elif kind == NodeKind.HTML: 

3264 if node.attrs.get("class") == "NavFrame": 

3265 # Reset ``sense_parts`` (and force recomputing 

3266 # by clearing ``sense``) as each NavFrame specifies 

3267 # its own sense. This helps eliminate garbage coming 

3268                        # from text at the beginning of the translations 

3269 # section. 

3270 sense_parts = [] 

3271 sense = None 

3272 # for item in node.children: 

3273 # if not isinstance(item, WikiNode): 

3274 # continue 

3275 # parse_translation_recurse(item) 

3276 parse_translation_recurse(node) 

3277 elif kind in LEVEL_KINDS: 

3278 # Sub-levels will be recursed elsewhere 

3279 pass 

3280 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 

3281 parse_translation_recurse(node) 

3282 elif kind == NodeKind.PREFORMATTED: 

3283 print("parse_translation_recurse: PREFORMATTED:", node) 

3284 elif kind == NodeKind.LINK: 

3285 arg0 = node.largs[0] 

3286 # Kludge: I've seen occasional normal links to translation 

3287 # subpages from main pages (e.g., language/English/Noun 

3288 # in July 2021) instead of the normal 

3289 # {{see translation subpage|...}} template. This should 

3290 # handle them. Note: must be careful not to read other 

3291 # links, particularly things like in "human being": 

3292 # "a human being -- see [[man/translations]]" (group title) 

3293 if ( 

3294 isinstance(arg0, (list, tuple)) 

3295 and arg0 

3296 and isinstance(arg0[0], str) 

3297 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3298 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3299 == wxr.wtp.title 

3300 ): 

3301 wxr.wtp.debug( 

3302 "translations subpage link found on main " 

3303 "page instead " 

3304 "of normal {{see translation subpage|...}}", 

3305 sortid="page/2595", 

3306 ) 

3307 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3308 if sub.lower() in POS_TITLES: 

3309 seq = [ 

3310 language, 

3311 sub, 

3312 TRANSLATIONS_TITLE, 

3313 ] 

3314 subnode = get_subpage_section( 

3315 wxr.wtp.title, 

3316 TRANSLATIONS_TITLE, 

3317 seq, 

3318 ) 

3319 if subnode is not None and isinstance( 

3320 subnode, WikiNode 

3321 ): 

3322 parse_translations(data, subnode) 

3323 else: 

3324 wxr.wtp.error( 

3325 "/translations link outside part-of-speech" 

3326 ) 

3327 

3328 if ( 

3329 len(arg0) >= 1 

3330 and isinstance(arg0[0], str) 

3331 and not arg0[0].lower().startswith("category:") 

3332 ): 

3333 for x in node.largs[-1]: 

3334 if isinstance(x, str): 

3335 sense_parts.append(x) 

3336 else: 

3337 parse_translation_recurse(x) 

3338 elif not sense: 

3339 sense_parts.append(node) 

3340 else: 

3341 wxr.wtp.debug( 

3342 "skipping text between translation items/senses: " 

3343 "{}".format(node), 

3344 sortid="page/2621", 

3345 ) 

3346 

3347        # Main code of parse_translations(). We want ``sense`` to be assigned 

3348 # regardless of recursion levels, and thus the code is structured 

3349        # to define it at this level and recurse in parse_translation_recurse(). 

3350 parse_translation_recurse(xlatnode) 

3351 

3352 def parse_etymology(data: WordData, node: WikiNode) -> None: 

3353 """Parses an etymology section.""" 

3354 assert isinstance(data, dict) 

3355 assert isinstance(node, WikiNode) 

3356 

3357 templates: list[TemplateData] = [] 

3358 

3359 # Counter for preventing the capture of etymology templates 

3360 # when we are inside templates that we want to ignore (i.e., 

3361 # not capture). 

3362 ignore_count = 0 

3363 

3364 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3365 nonlocal ignore_count 

3366 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3367 return "" 

3368 if re.match(ignored_etymology_templates_re, name): 

3369 ignore_count += 1 

3370 return None 

3371 

3372 # CONTINUE_HERE 

3373 

3374 def etym_post_template_fn( 

3375 name: str, ht: TemplateArgs, expansion: str 

3376 ) -> None: 

3377 nonlocal ignore_count 

3378 if name in wikipedia_templates: 

3379 parse_wikipedia_template(wxr, data, ht) 

3380 return None 

3381 if re.match(ignored_etymology_templates_re, name): 

3382 ignore_count -= 1 

3383 return None 

3384 if ignore_count == 0: 

3385 ht = clean_template_args(wxr, ht) 

3386 expansion = clean_node(wxr, None, expansion) 

3387 templates.append( 

3388 {"name": name, "args": ht, "expansion": expansion} 

3389 ) 

3390 return None 

3391 

3392 # Remove any subsections 

3393 contents = list( 

3394 x 

3395 for x in node.children 

3396 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3397 ) 

3398 # Convert to text, also capturing templates using post_template_fn 

3399 text = clean_node( 

3400 wxr, 

3401 None, 

3402 contents, 

3403 template_fn=etym_template_fn, 

3404 post_template_fn=etym_post_template_fn, 

3405 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3406 # Save the collected information. 

3407 if len(text) > 0: 3407 ↛ 3409line 3407 didn't jump to line 3409 because the condition on line 3407 was always true

3408 data["etymology_text"] = text 

3409 if len(templates) > 0: 3409 ↛ 3414line 3409 didn't jump to line 3414 because the condition on line 3409 was never true

3410 # Some etymology templates, like Template:root do not generate 

3411 # text, so they should be added here. Elsewhere, we check 

3412 # for Template:root and add some text to the expansion to please 

3413 # the validation. 

3414 data["etymology_templates"] = templates 

3415 

3416 for child_node in node.find_child_recursively( 3416 ↛ exitline 3416 didn't return from function 'parse_etymology' because the loop on line 3416 didn't complete

3417 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3418 ): 

3419 if child_node.kind in LEVEL_KIND_FLAGS: 3419 ↛ 3421line 3419 didn't jump to line 3421 because the condition on line 3419 was always true

3420 break 

3421 elif isinstance( 

3422 child_node, TemplateNode 

3423 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3424 if "etymology_examples" not in data: 

3425 data["etymology_examples"] = [] 

3426 data["etymology_examples"].extend( 

3427 extract_template_zh_x( 

3428 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3429 ) 

3430 ) 

3431 
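A small illustrative sketch of the data collected by parse_etymology() above. The template and all values are invented; only the field names ("etymology_text" and "etymology_templates" entries with "name"/"args"/"expansion", argument keys shown as strings) come from the code.

    # Hypothetical result for an etymology section containing one {{bor}} template:
    example_etym_data = {
        "etymology_text": "Borrowed from French ...",
        "etymology_templates": [
            {
                "name": "bor",
                "args": {"1": "en", "2": "fr"},   # cleaned template arguments
                "expansion": "Borrowed from French ...",
            },
        ],
    }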

3432 def parse_descendants( 

3433 data: WordData, node: WikiNode, is_proto_root_derived_section=False 

3434 ) -> None: 

3435 """Parses a Descendants section. Also used on Derived terms and 

3436 Extensions sections when we are dealing with a root of a reconstructed 

3437 language (i.e. is_proto_root_derived_section == True), as they use the 

3438    same structure. In the latter case, the Wiktionary convention is not to 

3439 title the section as descendants since the immediate offspring of the 

3440 roots are morphologically derived terms within the same proto-language. 

3441 Still, since the rest of the section lists true descendants, we use the 

3442 same function. Entries in the descendants list that are technically 

3443 derived terms will have a field "tags": ["derived"].""" 

3444 assert isinstance(data, dict) 

3445 assert isinstance(node, WikiNode) 

3446 assert isinstance(is_proto_root_derived_section, bool) 

3447 

3448 descendants = [] 

3449 

3450 # Most templates that are not in a LIST should be ignored as they only 

3451 # add formatting, like "desc-top", "der-top3", etc. Any template in 

3452 # unignored_non_list_templates actually contains relevant descendant 

3453 # info. E.g. "CJKV" is often the only line at all in descendants 

3454 # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would 

3455 # be skipped if we didn't handle it specially as it is not part of a 

3456 # LIST, and additionally is in panel_templates. There are probably more 

3457 # such templates that should be added to this... 

3458 unignored_non_list_templates: list[str] = ["CJKV"] 

3459 

3460 def process_list_item_children( 

3461 sarg: str, children: list[Union[str, WikiNode]] 

3462 ) -> None: 

3463 assert isinstance(sarg, str) 

3464 assert isinstance(children, list) 

3465            # The descendants section is a hierarchical bulleted list. sarg is 

3466 # usually some number of "*" characters indicating the level of 

3467 # indentation of the line, e.g. "***" indicates the line will be 

3468 # thrice-indented. A bare ";" is used to indicate a subtitle-like 

3469 # line with no indentation. ":" at the end of one or more "*"s is 

3470 # used to indicate that the bullet will not be displayed. 

3471 item_data: DescendantData = {"depth": sarg.count("*")} 

3472 templates: list[TemplateData] = [] 

3473 is_derived = False 

3474 

3475 # Counter for preventing the capture of templates when we are inside 

3476 # templates that we want to ignore (i.e., not capture). 

3477 ignore_count = 0 

3478 

3479 def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3480 nonlocal ignore_count 

3481 if ( 

3482 is_panel_template(wxr, name) 

3483 and name not in unignored_non_list_templates 

3484 ): 

3485 return "" 

3486 if re.match(ignored_descendants_templates_re, name): 

3487 ignore_count += 1 

3488 return None 

3489 

3490 def desc_post_template_fn( 

3491 name: str, ht: TemplateArgs, expansion: str 

3492 ) -> None: 

3493 nonlocal ignore_count 

3494 if name in wikipedia_templates: 

3495 parse_wikipedia_template(wxr, data, ht) 

3496 return None 

3497 if re.match(ignored_descendants_templates_re, name): 

3498 ignore_count -= 1 

3499 return None 

3500 if ignore_count == 0: 

3501 ht = clean_template_args(wxr, ht) 

3502 nonlocal is_derived 

3503 # If we're in a proto-root Derived terms or Extensions 

3504 # section, and the current list item has a link template 

3505 # to a term in the same proto-language, then we tag this 

3506 # descendant entry with "derived" 

3507 is_derived = ( 

3508 is_proto_root_derived_section 

3509 and (name == "l" or name == "link") 

3510 and ("1" in ht and ht["1"] == lang_code) 

3511 ) 

3512 expansion = clean_node(wxr, None, expansion) 

3513 templates.append( 

3514 {"name": name, "args": ht, "expansion": expansion} 

3515 ) 

3516 return None 

3517 

3518 text = clean_node( 

3519 wxr, 

3520 None, 

3521 children, 

3522 template_fn=desc_template_fn, 

3523 post_template_fn=desc_post_template_fn, 

3524 ) 

3525 item_data["templates"] = templates 

3526 item_data["text"] = text 

3527 if is_derived: 

3528 item_data["tags"] = ["derived"] 

3529 descendants.append(item_data) 

3530 

3531 def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]: 

3532 for i, child in enumerate(node.children): 

3533 if isinstance(child, WikiNode): 

3534 yield (i, child) 

3535 

3536 def get_sublist_index(list_item: WikiNode) -> Optional[int]: 

3537 for i, child in node_children(list_item): 

3538 if child.kind == NodeKind.LIST: 

3539 return i 

3540 return None 

3541 

3542 def get_descendants(node: WikiNode) -> None: 

3543 """Appends the data for every list item in every list in node 

3544 to descendants.""" 

3545 for _, c in node_children(node): 

3546 if ( 

3547 c.kind == NodeKind.TEMPLATE 

3548 and c.largs 

3549 and len(c.largs[0]) == 1 

3550 and isinstance(c.largs[0][0], str) 

3551 and c.largs[0][0] in unignored_non_list_templates 

3552 ): 

3553 # Some Descendants sections have no wikitext list. Rather, 

3554 # the list is entirely generated by a single template (see 

3555 # e.g. the use of {{CJKV}} in Chinese entries). 

3556 process_list_item_children("", [c]) 

3557 elif c.kind == NodeKind.HTML: 

3558 # The Descendants sections for many languages feature 

3559 # templates that generate html to add styling (e.g. using 

3560 # multiple columns) to the list, so that the actual wikitext 

3561 # list items are found within a <div>. We look within the 

3562 # children of the html node for the actual list items. 

3563 get_descendants(c) 

3564 elif c.kind == NodeKind.LIST: 

3565 get_descendants(c) 

3566 elif c.kind == NodeKind.LIST_ITEM: 

3567 # If a LIST_ITEM has subitems in a sublist, usually its 

3568 # last child is a LIST. However, sometimes after the LIST 

3569                # there are one or more trailing LIST_ITEMs, like "\n" or 

3570 # a reference template. If there is a sublist, we discard 

3571 # everything after it. 

3572 i = get_sublist_index(c) 

3573 if i is not None: 

3574 process_list_item_children(c.sarg, c.children[:i]) 

3575 get_descendants(c.children[i]) # type: ignore[arg-type] 

3576 else: 

3577 process_list_item_children(c.sarg, c.children) 

3578 

3579 # parse_descendants() actual work starts here 

3580 get_descendants(node) 

3581 

3582    # On e.g. a PIE page, there may be both Derived terms and Extensions 

3583 # sections, in which case this function will be called multiple times, 

3584 # so we have to check if descendants exists first. 

3585 if "descendants" in data: 

3586 data["descendants"].extend(descendants) 

3587 else: 

3588 data["descendants"] = descendants 

3589 
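For reference, a sketch of the entry shape appended to data["descendants"] by process_list_item_children(); the "depth"/"templates"/"text"/"tags" keys come from the code above, while the concrete values are invented.

    # Hypothetical descendant entry for a list line like "** {{desc|fr|...}}":
    example_descendant = {
        "depth": 2,   # number of "*" characters in the list item's sarg
        "templates": [
            {"name": "desc", "args": {"1": "fr"}, "expansion": "French: ..."},
        ],
        "text": "French: ...",
        # "tags": ["derived"] is added only in proto-root Derived terms /
        # Extensions sections when the item links back to the same proto-language.
    }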

3590 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3591 """This recurses into a subtree in the parse tree for a page.""" 

3592 nonlocal etym_data 

3593 nonlocal pos_data 

3594 nonlocal inside_level_four 

3595 

3596 redirect_list: list[str] = [] # for `zh-see` template 

3597 

3598 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3599 """This is called for otherwise unprocessed parts of the page. 

3600 We still expand them so that e.g. Category links get captured.""" 

3601 if name in wikipedia_templates: 3601 ↛ 3602line 3601 didn't jump to line 3602 because the condition on line 3601 was never true

3602 data = select_data() 

3603 parse_wikipedia_template(wxr, data, ht) 

3604 return None 

3605 if is_panel_template(wxr, name): 3605 ↛ 3606line 3605 didn't jump to line 3606 because the condition on line 3605 was never true

3606 return "" 

3607 return None 

3608 

3609 for node in treenode.children: 

3610 # print(node) 

3611 if not isinstance(node, WikiNode): 

3612 # print(" X{}".format(repr(node)[:40])) 

3613 continue 

3614 if isinstance(node, TemplateNode): 

3615 if process_soft_redirect_template(wxr, node, redirect_list): 

3616 continue 

3617 elif node.template_name == "zh-forms": 3617 ↛ 3618line 3617 didn't jump to line 3618 because the condition on line 3617 was never true

3618 process_zh_forms_templates(wxr, node, base_data) 

3619 

3620 if node.kind not in LEVEL_KINDS: 

3621 # XXX handle e.g. wikipedia links at the top of a language 

3622 # XXX should at least capture "also" at top of page 

3623 if node.kind in ( 

3624 NodeKind.HLINE, 

3625 NodeKind.LIST, 

3626 NodeKind.LIST_ITEM, 

3627 ): 

3628 continue 

3629 # print(" UNEXPECTED: {}".format(node)) 

3630 # Clean the node to collect category links 

3631 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3632 continue 

3633 t = clean_node( 

3634 wxr, etym_data, node.sarg if node.sarg else node.largs 

3635 ) 

3636 t = t.lower() 

3637 # XXX these counts were never implemented fully, and even this 

3638 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3639 wxr.config.section_counts[t] += 1 

3640 # print("PROCESS_CHILDREN: T:", repr(t)) 

3641 if t in IGNORED_TITLES: 3641 ↛ 3642line 3641 didn't jump to line 3642 because the condition on line 3641 was never true

3642 pass 

3643 elif t.startswith(PRONUNCIATION_TITLE): 3643 ↛ 3648line 3643 didn't jump to line 3648 because the condition on line 3643 was never true

3644 # Chinese Pronunciation section kludge; we demote these to 

3645 # be level 4 instead of 3 so that they're part of a larger 

3646 # etymology hierarchy; usually the data here is empty and 

3647                # acts as an intermediate level between POS and Etymology data 

3648 inside_level_four = True 

3649 if t.startswith(PRONUNCIATION_TITLE + " "): 

3650 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3651 # and each of them may have senses under Definition 

3652 push_level_four_section() 

3653 wxr.wtp.start_subsection(None) 

3654 if wxr.config.capture_pronunciation: 

3655 data = select_data() 

3656 parse_pronunciation( 

3657 wxr, 

3658 node, 

3659 data, 

3660 etym_data, 

3661 have_etym, 

3662 base_data, 

3663 lang_code, 

3664 ) 

3665 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3666 push_etym() 

3667 wxr.wtp.start_subsection(None) 

3668 if wxr.config.capture_etymologies: 3668 ↛ 3733line 3668 didn't jump to line 3733 because the condition on line 3668 was always true

3669 m = re.search(r"\s(\d+)$", t) 

3670 if m: 3670 ↛ 3671line 3670 didn't jump to line 3671 because the condition on line 3670 was never true

3671 etym_data["etymology_number"] = int(m.group(1)) 

3672 parse_etymology(etym_data, node) 

3673 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 3673 ↛ 3674line 3673 didn't jump to line 3674 because the condition on line 3673 was never true

3674 data = select_data() 

3675 parse_descendants(data, node) 

3676 elif ( 3676 ↛ 3682line 3676 didn't jump to line 3682 because the condition on line 3676 was never true

3677 t in PROTO_ROOT_DERIVED_TITLES 

3678 and pos == "root" 

3679 and is_reconstruction 

3680 and wxr.config.capture_descendants 

3681 ): 

3682 data = select_data() 

3683 parse_descendants(data, node, True) 

3684 elif t == TRANSLATIONS_TITLE: 

3685 data = select_data() 

3686 parse_translations(data, node) 

3687 elif t in INFLECTION_TITLES: 3687 ↛ 3688line 3687 didn't jump to line 3688 because the condition on line 3687 was never true

3688 parse_inflection(node, t, pos) 

3689 else: 

3690 lst = t.split() 

3691 while len(lst) > 1 and lst[-1].isdigit(): 3691 ↛ 3692line 3691 didn't jump to line 3692 because the condition on line 3691 was never true

3692 lst = lst[:-1] 

3693 t_no_number = " ".join(lst).lower() 

3694 if t_no_number in POS_TITLES: 

3695 push_pos() 

3696 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3697 pos = dt["pos"] or "MISSING_POS" 

3698 wxr.wtp.start_subsection(t) 

3699 if "debug" in dt: 3699 ↛ 3700line 3699 didn't jump to line 3700 because the condition on line 3699 was never true

3700 wxr.wtp.debug( 

3701 "{} in section {}".format(dt["debug"], t), 

3702 sortid="page/2755", 

3703 ) 

3704 if "warning" in dt: 3704 ↛ 3705line 3704 didn't jump to line 3705 because the condition on line 3704 was never true

3705 wxr.wtp.warning( 

3706 "{} in section {}".format(dt["warning"], t), 

3707 sortid="page/2759", 

3708 ) 

3709 if "error" in dt: 3709 ↛ 3710line 3709 didn't jump to line 3710 because the condition on line 3709 was never true

3710 wxr.wtp.error( 

3711 "{} in section {}".format(dt["error"], t), 

3712 sortid="page/2763", 

3713 ) 

3714 # Parse word senses for the part-of-speech 

3715 parse_part_of_speech(node, pos) 

3716 if "tags" in dt: 3716 ↛ 3717line 3716 didn't jump to line 3717 because the condition on line 3716 was never true

3717 for pdata in pos_datas: 

3718 data_extend(pdata, "tags", dt["tags"]) 

3719 elif t_no_number in LINKAGE_TITLES: 3719 ↛ 3723line 3719 didn't jump to line 3723 because the condition on line 3719 was always true

3720 rel = LINKAGE_TITLES[t_no_number] 

3721 data = select_data() 

3722 parse_linkage(data, rel, node) 

3723 elif t_no_number == COMPOUNDS_TITLE: 

3724 data = select_data() 

3725 if wxr.config.capture_compounds: 

3726 parse_linkage(data, "derived", node) 

3727 

3728 # XXX parse interesting templates also from other sections. E.g., 

3729 # {{Letter|...}} in ===See also=== 

3730 # Also <gallery> 

3731 

3732 # Recurse to children of this node, processing subtitles therein 

3733 stack.append(t) 

3734 process_children(node, pos) 

3735 stack.pop() 

3736 

3737 if len(redirect_list) > 0: 

3738 if len(pos_data) > 0: 

3739 pos_data["redirects"] = redirect_list 

3740 if "pos" not in pos_data: 3740 ↛ 3741line 3740 didn't jump to line 3741 because the condition on line 3740 was never true

3741 pos_data["pos"] = "soft-redirect" 

3742 else: 

3743 new_page_data = copy.deepcopy(base_data) 

3744 new_page_data["redirects"] = redirect_list 

3745 if "pos" not in new_page_data: 3745 ↛ 3747line 3745 didn't jump to line 3747 because the condition on line 3745 was always true

3746 new_page_data["pos"] = "soft-redirect" 

3747 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3748 page_datas.append(new_page_data) 

3749 

3750 def extract_examples( 

3751 others: list[WikiNode], sense_base: SenseData 

3752 ) -> list[ExampleData]: 

3753 """Parses through a list of definitions and quotes to find examples. 

3754 Returns a list of example dicts to be added to sense data. Adds 

3755 meta-data, mostly categories, into sense_base.""" 

3756 assert isinstance(others, list) 

3757 examples: list[ExampleData] = [] 

3758 

3759 for sub in others: 

3760 if not sub.sarg.endswith((":", "*")): 3760 ↛ 3761line 3760 didn't jump to line 3761 because the condition on line 3760 was never true

3761 continue 

3762 for item in sub.children: 

3763 if not isinstance(item, WikiNode): 3763 ↛ 3764line 3763 didn't jump to line 3764 because the condition on line 3763 was never true

3764 continue 

3765 if item.kind != NodeKind.LIST_ITEM: 3765 ↛ 3766line 3765 didn't jump to line 3766 because the condition on line 3765 was never true

3766 continue 

3767 usex_type = None 

3768 example_template_args = [] 

3769 example_template_names = [] 

3770 taxons = set() 

3771 

3772 # Bypass this function when parsing Chinese, Japanese and 

3773 # quotation templates. 

3774 new_example_lists = extract_example_list_item( 

3775 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3776 ) 

3777 if len(new_example_lists) > 0: 3777 ↛ 3778line 3777 didn't jump to line 3778 because the condition on line 3777 was never true

3778 examples.extend(new_example_lists) 

3779 continue 

3780 

3781 def usex_template_fn( 

3782 name: str, ht: TemplateArgs 

3783 ) -> Optional[str]: 

3784 nonlocal usex_type 

3785 if is_panel_template(wxr, name): 

3786 return "" 

3787 if name in usex_templates: 

3788 usex_type = "example" 

3789 example_template_args.append(ht) 

3790 example_template_names.append(name) 

3791 elif name in quotation_templates: 

3792 usex_type = "quotation" 

3793 elif name in taxonomy_templates: 

3794 taxons.update(ht.get(1, "").split()) 

3795 for prefix in template_linkages: 

3796 if re.search( 

3797 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3798 ): 

3799 return "" 

3800 return None 

3801 

3802 # bookmark 

3803 ruby: list[tuple[str, str]] = [] 

3804 contents = item.children 

3805 if lang_code == "ja": 

3806 # Capture ruby contents if this is a Japanese language 

3807 # example. 

3808 # print(contents) 

3809 if ( 3809 ↛ 3814line 3809 didn't jump to line 3814 because the condition on line 3809 was never true

3810 contents 

3811 and isinstance(contents, str) 

3812 and re.match(r"\s*$", contents[0]) 

3813 ): 

3814 contents = contents[1:] 

3815 exp = wxr.wtp.parse( 

3816 wxr.wtp.node_to_wikitext(contents), 

3817 # post_template_fn=head_post_template_fn, 

3818 expand_all=True, 

3819 ) 

3820 rub, rest = extract_ruby(wxr, exp.children) 

3821 if rub: 3821 ↛ 3825line 3821 didn't jump to line 3825 because the condition on line 3821 was always true

3822 for rtup in rub: 

3823 ruby.append(rtup) 

3824 contents = rest 

3825 subtext = clean_node( 

3826 wxr, sense_base, contents, template_fn=usex_template_fn 

3827 ) 

3828 

3829 frozen_taxons = frozenset(taxons) 

3830 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3831 

3832 # print(f"{subtext=}") 

3833 subtext = re.sub( 

3834 r"\s*\(please add an English " 

3835 r"translation of this " 

3836 r"(example|usage example|quote)\)", 

3837 "", 

3838 subtext, 

3839 ).strip() 

3840 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3841 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3842 # print("subtext:", repr(subtext)) 

3843 

3844 lines = subtext.splitlines() 

3845 # print(lines) 

3846 

3847 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3848 lines = list( 

3849 x 

3850 for x in lines 

3851 if not re.match( 

3852 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3853 r"Synonym: |Antonym: |Hyponym: |" 

3854 r"Hypernyms: |Derived terms: |" 

3855 r"Related terms: |" 

3856 r"Hypernym: |Derived term: |" 

3857 r"Coordinate terms:|" 

3858 r"Related term: |" 

3859 r"For more quotations using )", 

3860 x, 

3861 ) 

3862 ) 

3863 tr = "" 

3864 ref = "" 

3865 roman = "" 

3866 # for line in lines: 

3867 # print("LINE:", repr(line)) 

3868 # print(classify_desc(line)) 

3869 if len(lines) == 1 and lang_code != "en": 3869 ↛ 3870line 3869 didn't jump to line 3870 because the condition on line 3869 was never true

3870 parts = example_splitter_re.split(lines[0]) 

3871 if ( 

3872 len(parts) > 2 

3873 and len(example_template_args) == 1 

3874 and any( 

3875 ("―" in s) or ("—" in s) 

3876 for s in example_template_args[0].values() 

3877 ) 

3878 ): 

3879 if nparts := synch_splits_with_args( 

3880 lines[0], example_template_args[0] 

3881 ): 

3882 parts = nparts 

3883 if ( 

3884 len(example_template_args) == 1 

3885 and "lit" in example_template_args[0] 

3886 ): 

3887 # ugly brute-force kludge in case there's a lit= arg 

3888 literally = example_template_args[0].get("lit", "") 

3889 if literally: 

3890 literally = ( 

3891 " (literally, “" 

3892 + clean_value(wxr, literally) 

3893 + "”)" 

3894 ) 

3895 else: 

3896 literally = "" 

3897 if ( 

3898 len(example_template_args) == 1 

3899 and len(parts) == 2 

3900 and len(example_template_args[0]) 

3901 - ( 

3902 # horrible kludge to ignore these arguments 

3903 # when calculating how many there are 

3904 sum( 

3905 s in example_template_args[0] 

3906 for s in ( 

3907 "lit", # generates text, but we handle it 

3908 "inline", 

3909 "noenum", 

3910 "nocat", 

3911 "sort", 

3912 ) 

3913 ) 

3914 ) 

3915 == 3 

3916 and clean_value( 

3917 wxr, example_template_args[0].get(2, "") 

3918 ) 

3919 == parts[0].strip() 

3920 and clean_value( 

3921 wxr, 

3922 ( 

3923 example_template_args[0].get(3) 

3924 or example_template_args[0].get("translation") 

3925 or example_template_args[0].get("t", "") 

3926 ) 

3927 + literally, # in case there's a lit= argument 

3928 ) 

3929 == parts[1].strip() 

3930 ): 

3931 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3932 # is a pretty reliable 'heuristic', so we use it here 

3933 # before the others. To be extra sure the template 

3934 # doesn't do anything weird, we compare the arguments 

3935 # and the output to each other. 

3936 lines = [parts[0].strip()] 

3937 tr = parts[1].strip() 

3938 elif ( 

3939 len(parts) == 2 

3940 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3941 ): 

3942 # These other branches just do some simple heuristics w/ 

3943 # the expanded output of the template (if applicable). 

3944 lines = [parts[0].strip()] 

3945 tr = parts[1].strip() 

3946 elif ( 

3947 len(parts) == 3 

3948 and classify_desc2(parts[1]) 

3949 in ("romanization", "english") 

3950 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3951 ): 

3952 lines = [parts[0].strip()] 

3953 roman = parts[1].strip() 

3954 tr = parts[2].strip() 

3955 else: 

3956 parts = re.split(r"\s+-\s+", lines[0]) 

3957 if ( 

3958 len(parts) == 2 

3959 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3960 ): 

3961 lines = [parts[0].strip()] 

3962 tr = parts[1].strip() 

3963 elif len(lines) > 1: 

3964 if any( 3964 ↛ 3967line 3964 didn't jump to line 3967 because the condition on line 3964 was never true

3965 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3966 ) and not (len(example_template_names) == 1): 

3967 refs: list[str] = [] 

3968 for i in range(len(lines)): 

3969 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 

3970 break 

3971 refs.append(lines[i].strip()) 

3972 if re.search(r"[]\d:)]\s*$", lines[i]): 

3973 break 

3974 ref = " ".join(refs) 

3975 lines = lines[i + 1 :] 

3976 if ( 

3977 lang_code != "en" 

3978 and len(lines) >= 2 

3979 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3980 ): 

3981 i = len(lines) - 1 

3982 while ( 

3983 i > 1 

3984 and classify_desc2(lines[i - 1]) 

3985 in ENGLISH_TEXTS 

3986 ): 

3987 i -= 1 

3988 tr = "\n".join(lines[i:]) 

3989 lines = lines[:i] 

3990 if len(lines) >= 2: 

3991 if classify_desc2(lines[-1]) == "romanization": 

3992 roman = lines[-1].strip() 

3993 lines = lines[:-1] 

3994 

3995 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 3995 ↛ 3996line 3995 didn't jump to line 3996 because the condition on line 3995 was never true

3996 ref = lines[0] 

3997 lines = lines[1:] 

3998 elif lang_code != "en" and len(lines) == 2: 3998 ↛ 3999line 3998 didn't jump to line 3999 because the condition on line 3998 was never true

3999 cls1 = classify_desc2(lines[0]) 

4000 cls2 = classify_desc2(lines[1]) 

4001 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

4002 tr = lines[1] 

4003 lines = [lines[0]] 

4004 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 

4005 tr = lines[0] 

4006 lines = [lines[1]] 

4007 elif ( 

4008 re.match(r"^[#*]*:+", lines[1]) 

4009 and classify_desc2( 

4010 re.sub(r"^[#*:]+\s*", "", lines[1]) 

4011 ) 

4012 in ENGLISH_TEXTS 

4013 ): 

4014 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

4015 lines = [lines[0]] 

4016 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

4017 # Both were classified as English, but 

4018 # presumably one is not. Assume first is 

4019 # non-English, as that seems more common. 

4020 tr = lines[1] 

4021 lines = [lines[0]] 

4022 elif ( 4022 ↛ 4038line 4022 didn't jump to line 4038 because the condition on line 4022 was always true

4023 usex_type != "quotation" 

4024 and lang_code != "en" 

4025 and len(lines) == 3 

4026 ): 

4027 cls1 = classify_desc2(lines[0]) 

4028 cls2 = classify_desc2(lines[1]) 

4029 cls3 = classify_desc2(lines[2]) 

4030 if ( 4030 ↛ 4061line 4030 didn't jump to line 4061 because the condition on line 4030 was always true

4031 cls3 == "english" 

4032 and cls2 in ("english", "romanization") 

4033 and cls1 != "english" 

4034 ): 

4035 tr = lines[2].strip() 

4036 roman = lines[1].strip() 

4037 lines = [lines[0].strip()] 

4038 elif ( 

4039 usex_type == "quotation" 

4040 and lang_code != "en" 

4041 and len(lines) > 2 

4042 ): 

4043 # for x in lines: 

4044 # print(" LINE: {}: {}" 

4045 # .format(classify_desc2(x), x)) 

4046 if re.match(r"^[#*]*:+\s*$", lines[1]): 

4047 ref = lines[0] 

4048 lines = lines[2:] 

4049 cls1 = classify_desc2(lines[-1]) 

4050 if cls1 == "english": 

4051 i = len(lines) - 1 

4052 while ( 

4053 i > 1 

4054 and classify_desc2(lines[i - 1]) 

4055                            in ENGLISH_TEXTS 

4056 ): 

4057 i -= 1 

4058 tr = "\n".join(lines[i:]) 

4059 lines = lines[:i] 

4060 

4061 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

4062 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

4063 tr = re.sub(r"^[#*:]+\s*", "", tr) 

4064 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

4065 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

4066 ref = re.sub(r"^[#*:]+\s*", "", ref) 

4067 ref = re.sub( 

4068 r", (volume |number |page )?“?" 

4069 r"\(please specify ([^)]|\(s\))*\)”?|" 

4070 ", text here$", 

4071 "", 

4072 ref, 

4073 ) 

4074 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

4075 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

4076 subtext = "\n".join(x for x in lines if x) 

4077 if not tr and lang_code != "en": 4077 ↛ 4078line 4077 didn't jump to line 4078 because the condition on line 4077 was never true

4078 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

4079 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 

4080 tr = m.group(2) 

4081 subtext = subtext[: m.start()] + m.group(1) 

4082 elif lines: 

4083 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

4084 if ( 

4085 len(parts) == 2 

4086 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

4087 ): 

4088 subtext = parts[0].strip() 

4089 tr = parts[1].strip() 

4090 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

4091 subtext = re.sub( 

4092 r"(please add an English translation of " 

4093 r"this (quote|usage example))", 

4094 "", 

4095 subtext, 

4096 ) 

4097 subtext = re.sub( 

4098 r"\s*→New International Version " "translation$", 

4099 "", 

4100 subtext, 

4101 ) # e.g. pis/Tok Pisin (Bible) 

4102 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

4103 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

4104 note = None 

4105 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

4106 if ( 4106 ↛ 4114line 4106 didn't jump to line 4114 because the condition on line 4106 was never true

4107 m is not None 

4108 and lang_code != "en" 

4109 and ( 

4110 m.group(1).startswith("with ") 

4111 or classify_desc2(m.group(1)) == "english" 

4112 ) 

4113 ): 

4114 note = m.group(1) 

4115 subtext = subtext[m.end() :] 

4116 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

4117 ref = re.sub(r",\s*→ISBN", "", ref) 

4118 ref = ref.strip() 

4119 if ref.endswith(":") or ref.endswith(","): 4119 ↛ 4120line 4119 didn't jump to line 4120 because the condition on line 4119 was never true

4120 ref = ref[:-1].strip() 

4121 ref = re.sub(r"\s+,\s+", ", ", ref) 

4122 ref = re.sub(r"\s+", " ", ref) 

4123 if ref and not subtext: 4123 ↛ 4124line 4123 didn't jump to line 4124 because the condition on line 4123 was never true

4124 subtext = ref 

4125 ref = "" 

4126 if subtext: 4126 ↛ 3762line 4126 didn't jump to line 3762 because the condition on line 4126 was always true

4127 dt: ExampleData = {"text": subtext} 

4128 if ref: 4128 ↛ 4129line 4128 didn't jump to line 4129 because the condition on line 4128 was never true

4129 dt["ref"] = ref 

4130 if tr: 

4131 dt["english"] = tr 

4132 if usex_type: 4132 ↛ 4133line 4132 didn't jump to line 4133 because the condition on line 4132 was never true

4133 dt["type"] = usex_type 

4134 if note: 4134 ↛ 4135line 4134 didn't jump to line 4135 because the condition on line 4134 was never true

4135 dt["note"] = note 

4136 if roman: 

4137 dt["roman"] = roman 

4138 if ruby: 

4139 dt["ruby"] = ruby 

4140 examples.append(dt) 

4141 

4142 return examples 

4143 
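The dicts returned by extract_examples() have the following overall shape. This is a sketch with invented values; the optional keys appear only when the corresponding piece of text was actually detected above.

    # Hypothetical ExampleData entry:
    example_entry = {
        "text": "example sentence in the entry's language",
        "english": "its English translation",        # from ``tr``
        "roman": "romanization line, if present",    # from ``roman``
        "ref": "source/citation line (quotations)",  # from ``ref``
        "type": "example",                           # or "quotation"
        "note": "parenthesized note stripped from the text",
        "ruby": [("漢", "かん")],                     # Japanese ruby pairs
    }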

4144 # Main code of parse_language() 

4145 # Process the section 

4146 stack.append(language) 

4147 process_children(langnode, None) 

4148 stack.pop() 

4149 

4150    # Finalize word entries 

4151 push_etym() 

4152 ret = [] 

4153 for data in page_datas: 

4154 merge_base(data, base_data) 

4155 ret.append(data) 

4156 

4157 # Copy all tags to word senses 

4158 for data in ret: 

4159 if "senses" not in data: 4159 ↛ 4160line 4159 didn't jump to line 4160 because the condition on line 4159 was never true

4160 continue 

4161 # WordData should not have a 'tags' field, but if it does, it's 

4162 # deleted and its contents removed and placed in each sense; 

4163        # that's why the type: ignore comments are needed. 

4164 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

4165 if "tags" in data: 4165 ↛ 4166line 4165 didn't jump to line 4166 because the condition on line 4165 was never true

4166 del data["tags"] # type: ignore[typeddict-item] 

4167 for sense in data["senses"]: 

4168 data_extend(sense, "tags", tags) 

4169 

4170 return ret 

4171 

4172 

4173def parse_wikipedia_template( 

4174 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

4175) -> None: 

4176 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

4177 assert isinstance(wxr, WiktextractContext) 

4178 assert isinstance(data, dict) 

4179 assert isinstance(ht, dict) 

4180 langid = clean_node(wxr, data, ht.get("lang", ())) 

4181 pagename = ( 

4182 clean_node(wxr, data, ht.get(1, ())) 

4183 or wxr.wtp.title 

4184 or "MISSING_PAGE_TITLE" 

4185 ) 

4186 if langid: 

4187 data_append(data, "wikipedia", langid + ":" + pagename) 

4188 else: 

4189 data_append(data, "wikipedia", pagename) 

4190 
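A usage sketch of the behaviour implemented above, with invented arguments; it assumes wxr and data are an existing extraction context and word-data dict.

    # {{wikipedia|lang=fi|Koira}}:
    parse_wikipedia_template(wxr, data, {"lang": "fi", 1: "Koira"})
    # -> data["wikipedia"] now also contains "fi:Koira"

    # {{wikipedia}} with no page argument falls back to the page title:
    parse_wikipedia_template(wxr, data, {})
    # -> data["wikipedia"] now also contains wxr.wtp.title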

4191 

4192def parse_top_template( 

4193 wxr: WiktextractContext, node: WikiNode, data: WordData 

4194) -> None: 

4195 """Parses a template that occurs on the top-level in a page, before any 

4196 language subtitles.""" 

4197 assert isinstance(wxr, WiktextractContext) 

4198 assert isinstance(node, WikiNode) 

4199 assert isinstance(data, dict) 

4200 

4201 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

4202 if name in wikipedia_templates: 

4203 parse_wikipedia_template(wxr, data, ht) 

4204 return None 

4205 if is_panel_template(wxr, name): 

4206 return "" 

4207 if name in ("reconstruction",): 

4208 return "" 

4209 if name.lower() == "also": 

4210 # XXX shows related words that might really have been the intended 

4211 # word, capture them 

4212 return "" 

4213 if name == "see also": 

4214 # XXX capture 

4215 return "" 

4216 if name == "cardinalbox": 

4217 # XXX capture 

4218 return "" 

4219 if name == "character info": 

4220 # XXX capture 

4221 return "" 

4222 if name == "commonscat": 

4223 # XXX capture link to Wikimedia commons 

4224 return "" 

4225 if name == "wrongtitle": 

4226 # XXX this should be captured to replace page title with the 

4227 # correct title. E.g. ⿰亻革家 

4228 return "" 

4229 if name == "wikidata": 

4230 arg = clean_node(wxr, data, ht.get(1, ())) 

4231 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

4232 data_append(data, "wikidata", arg) 

4233 return "" 

4234 wxr.wtp.debug( 

4235 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

4236 sortid="page/2870", 

4237 ) 

4238 return "" 

4239 

4240 clean_node(wxr, None, [node], template_fn=top_template_fn) 

4241 
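For illustration, the only top-level templates that currently add data are the wikipedia family (handled via parse_wikipedia_template above) and {{wikidata}}; the values below are invented.

    # {{wikidata|Q144}}         -> data["wikidata"] gains "Q144"
    # {{wikidata|Lexeme:L1234}} -> data["wikidata"] gains "Lexeme:L1234"
    # {{wikidata|something}}    -> ignored (must start with "Q" or "Lexeme:L")
    # {{also|...}}, {{see also|...}}, {{cardinalbox|...}}, etc. expand to ""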

4242 

4243def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

4244 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

4245 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

4246 that are next to each other.""" 

4247 

4248 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

4249 # section get overwritten. In this case, let's just combine the two. 

4250 

4251 # In Chinese entries, Pronunciation can be preceded on the 

4252 # same level 3 by its Etymology *and* Glyph Origin sections: 

4253 # ===Glyph Origin=== 

4254 # ===Etymology=== 

4255 # ===Pronunciation=== 

4256 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

4257 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

4258 # are now level 6 

4259 

4260 # Known lowercase PoS names are in part_of_speech_map 

4261 # Known lowercase linkage section names are in linkage_map 

4262 

4263 old = re.split( 

4264 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

4265 ) 

4266 

4267 parts = [] 

4268 npar = 4 # Number of parentheses in above expression 

4269 parts.append(old[0]) 

4270 prev_level = None 

4271 level = None 

4272 skip_level_title = False # When combining etymology sections 

4273 for i in range(1, len(old), npar + 1): 

4274 left = old[i] 

4275 right = old[i + npar - 1] 

4276 # remove Wikilinks in title 

4277 title = re.sub(r"^\[\[", "", old[i + 1]) 

4278 title = re.sub(r"\]\]$", "", title) 

4279 prev_level = level 

4280 level = len(left) 

4281 part = old[i + npar] 

4282 if level != len(right): 4282 ↛ 4283line 4282 didn't jump to line 4283 because the condition on line 4282 was never true

4283 wxr.wtp.debug( 

4284 "subtitle has unbalanced levels: " 

4285 "{!r} has {} on the left and {} on the right".format( 

4286 title, left, right 

4287 ), 

4288 sortid="page/2904", 

4289 ) 

4290 lc = title.lower() 

4291 if name_to_code(title, "en") != "": 

4292 if level > 2: 4292 ↛ 4293line 4292 didn't jump to line 4293 because the condition on line 4292 was never true

4293 wxr.wtp.debug( 

4294 "subtitle has language name {} at level {}".format( 

4295 title, level 

4296 ), 

4297 sortid="page/2911", 

4298 ) 

4299 level = 2 

4300 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

4301 if level > 3: 4301 ↛ 4302line 4301 didn't jump to line 4302 because the condition on line 4301 was never true

4302 wxr.wtp.debug( 

4303 "etymology section {} at level {}".format(title, level), 

4304 sortid="page/2917", 

4305 ) 

4306 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 4306 ↛ 4308line 4306 didn't jump to line 4308 because the condition on line 4306 was never true

4307 # sections cheek-to-cheek 

4308 skip_level_title = True 

4309 # Modify the title of previous ("Glyph Origin") section, in 

4310 # case we have a meaningful title like "Etymology 1" 

4311 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

4312 level = 3 

4313 elif lc.startswith(PRONUNCIATION_TITLE): 4313 ↛ 4316line 4313 didn't jump to line 4316 because the condition on line 4313 was never true

4314 # Pronunciation is now a level between POS and Etymology, so 

4315 # we need to shift everything down by one 

4316 level = 4 

4317 elif lc in POS_TITLES: 

4318 level = 5 

4319 elif lc == TRANSLATIONS_TITLE: 

4320 level = 6 

4321 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 4321 ↛ 4323line 4321 didn't jump to line 4323 because the condition on line 4321 was always true

4322 level = 6 

4323 elif lc in INFLECTION_TITLES: 

4324 level = 6 

4325 elif lc == DESCENDANTS_TITLE: 

4326 level = 6 

4327 elif title in PROTO_ROOT_DERIVED_TITLES: 

4328 level = 6 

4329 elif lc in IGNORED_TITLES: 

4330 level = 6 

4331 else: 

4332 level = 6 

4333 if skip_level_title: 4333 ↛ 4334line 4333 didn't jump to line 4334 because the condition on line 4333 was never true

4334 skip_level_title = False 

4335 parts.append(part) 

4336 else: 

4337 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

4338 parts.append(part) 

4339 # print("=" * level, title) 

4340 # if level != len(left): 

4341 # print(" FIXED LEVEL OF {} {} -> {}" 

4342 # .format(title, len(left), level)) 

4343 

4344 text = "".join(parts) 

4345 # print(text) 

4346 return text 

4347 
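To make the enforced hierarchy concrete, a hypothetical page (invented headings) would have its heading depths rewritten by fix_subtitle_hierarchy() roughly as follows:

    # ==Finnish==            -> ==Finnish==               (language, level 2)
    # ===Etymology===        -> ===Etymology===           (level 3)
    # ===Pronunciation===    -> ====Pronunciation====     (level 4)
    # ===Noun===             -> =====Noun=====            (part of speech, level 5)
    # ====Translations====   -> ======Translations======  (level 6)
    # ====Synonyms====       -> ======Synonyms======      (linkage, level 6)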

4348 

4349def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

4350 # Skip translation pages 

4351 if word.endswith("/" + TRANSLATIONS_TITLE): 4351 ↛ 4352line 4351 didn't jump to line 4352 because the condition on line 4351 was never true

4352 return [] 

4353 

4354 if wxr.config.verbose: 4354 ↛ 4355line 4354 didn't jump to line 4355 because the condition on line 4354 was never true

4355 logger.info(f"Parsing page: {word}") 

4356 

4357 wxr.config.word = word 

4358 wxr.wtp.start_page(word) 

4359 

4360 # Remove <noinclude> and similar tags from main pages. They 

4361 # should not appear there, but at least net/Elfdala has one and it 

4362 # is probably not the only one. 

4363 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4364 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4365 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4366 

4367 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4368 # pages that have, for example, Translations section under Linkage, or 

4369 # Translations section on the same level as Noun. Enforce a proper 

4370 # hierarchy by manipulating the subtitle levels in certain cases. 

4371 text = fix_subtitle_hierarchy(wxr, text) 

4372 

4373 # Parse the page, pre-expanding those templates that are likely to 

4374 # influence parsing 

4375 tree = wxr.wtp.parse( 

4376 text, 

4377 pre_expand=True, 

4378 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4379 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4380 ) 

4381 # from wikitextprocessor.parser import print_tree 

4382 # print("PAGE PARSE:", print_tree(tree)) 

4383 

4384 top_data: WordData = {} 

4385 

4386 # Iterate over top-level titles, which should be languages for normal 

4387 # pages 

4388 by_lang = defaultdict(list) 

4389 for langnode in tree.children: 

4390 if not isinstance(langnode, WikiNode): 

4391 continue 

4392 if langnode.kind == NodeKind.TEMPLATE: 4392 ↛ 4393line 4392 didn't jump to line 4393 because the condition on line 4392 was never true

4393 parse_top_template(wxr, langnode, top_data) 

4394 continue 

4395 if langnode.kind == NodeKind.LINK: 4395 ↛ 4397line 4395 didn't jump to line 4397 because the condition on line 4395 was never true

4396 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4397 continue 

4398 if langnode.kind != NodeKind.LEVEL2: 4398 ↛ 4399line 4398 didn't jump to line 4399 because the condition on line 4398 was never true

4399 wxr.wtp.debug( 

4400 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4401 ) 

4402 continue 

4403 lang = clean_node( 

4404 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4405 ) 

4406 lang_code = name_to_code(lang, "en") 

4407 if lang_code == "": 4407 ↛ 4408line 4407 didn't jump to line 4408 because the condition on line 4407 was never true

4408 wxr.wtp.debug( 

4409 f"unrecognized language name: {lang}", sortid="page/3019" 

4410 ) 

4411 if ( 4411 ↛ 4415line 4411 didn't jump to line 4415 because the condition on line 4411 was never true

4412 wxr.config.capture_language_codes 

4413 and lang_code not in wxr.config.capture_language_codes 

4414 ): 

4415 continue 

4416 wxr.wtp.start_section(lang) 

4417 

4418 # Collect all words from the page. 

4419 # print(f"{langnode=}") 

4420 datas = parse_language(wxr, langnode, lang, lang_code) 

4421 

4422 # Propagate fields resulting from top-level templates to this 

4423 # part-of-speech. 

4424 for data in datas: 

4425 if "lang" not in data: 4425 ↛ 4426line 4425 didn't jump to line 4426 because the condition on line 4425 was never true

4426 wxr.wtp.debug( 

4427 "internal error -- no lang in data: {}".format(data), 

4428 sortid="page/3034", 

4429 ) 

4430 continue 

4431 for k, v in top_data.items(): 4431 ↛ 4432line 4431 didn't jump to line 4432 because the loop on line 4431 never started

4432 assert isinstance(v, (list, tuple)) 

4433 data_extend(data, k, v) 

4434 by_lang[data["lang"]].append(data) 

4435 

4436 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4437 # field. FIX OR REMOVE. 

4438 # Do some post-processing on the words. For example, we may distribute 

4439 # conjugation information to all the words. 

4440 ret = [] 

4441 for lang, lang_datas in by_lang.items(): 

4442 ret.extend(lang_datas) 

4443 

4444 for x in ret: 

4445 if x["word"] != word: 

4446 if word.startswith("Unsupported titles/"): 4446 ↛ 4452line 4446 didn't jump to line 4452 because the condition on line 4446 was always true

4447 wxr.wtp.debug( 

4448 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4449 sortid="20231101/3578page.py", 

4450 ) 

4451 else: 

4452 wxr.wtp.debug( 

4453 f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'", 

4454 sortid="20231101/3582page.py", 

4455 ) 

4456 x["original_title"] = word 

4457 # validate tag data 

4458 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4459 return ret 

4460 

4461 

4462def recursively_separate_raw_tags( 

4463 wxr: WiktextractContext, data: dict[str, Any] 

4464) -> None: 

4465 if not isinstance(data, dict): 4465 ↛ 4466line 4465 didn't jump to line 4466 because the condition on line 4465 was never true

4466 wxr.wtp.error( 

4467 "'data' is not dict; most probably " 

4468 "data has a list that contains at least one dict and " 

4469 "at least one non-dict item", 

4470 sortid="en/page-4016/20240419", 

4471 ) 

4472 return 

4473 new_tags: list[str] = [] 

4474 raw_tags: list[str] = data.get("raw_tags", []) 

4475 for field, val in data.items(): 

4476 if field == "tags": 

4477 for tag in val: 

4478 if tag not in valid_tags: 4478 ↛ 4479line 4478 didn't jump to line 4479 because the condition on line 4478 was never true

4479 raw_tags.append(tag) 

4480 else: 

4481 new_tags.append(tag) 

4482 if isinstance(val, list): 

4483 if len(val) > 0 and isinstance(val[0], dict): 

4484 for d in val: 

4485 recursively_separate_raw_tags(wxr, d) 

4486 if "tags" in data and not new_tags: 4486 ↛ 4487line 4486 didn't jump to line 4487 because the condition on line 4486 was never true

4487 del data["tags"] 

4488 elif new_tags: 

4489 data["tags"] = new_tags 

4490 if raw_tags: 4490 ↛ 4491line 4490 didn't jump to line 4491 because the condition on line 4490 was never true

4491 data["raw_tags"] = raw_tags 

4492 
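A minimal sketch of the in-place tag separation above, assuming wxr is an existing WiktextractContext, "plural" and "colloquial" are in valid_tags, and "Nonstandard-Dialect-Label" is not (all values invented):

    word = {
        "tags": ["plural", "Nonstandard-Dialect-Label"],
        "senses": [{"tags": ["colloquial"]}],
    }
    recursively_separate_raw_tags(wxr, word)
    # word == {
    #     "tags": ["plural"],
    #     "raw_tags": ["Nonstandard-Dialect-Label"],
    #     "senses": [{"tags": ["colloquial"]}],
    # }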

4493 

4494def process_soft_redirect_template( 

4495 wxr: WiktextractContext, 

4496 template_node: TemplateNode, 

4497 redirect_pages: list[str], 

4498) -> bool: 

4499 # return `True` if the template is soft redirect template 

4500 if template_node.template_name == "zh-see": 

4501 # https://en.wiktionary.org/wiki/Template:zh-see 

4502 title = clean_node( 

4503 wxr, None, template_node.template_parameters.get(1, "") 

4504 ) 

4505 if title != "": 4505 ↛ 4507line 4505 didn't jump to line 4507 because the condition on line 4505 was always true

4506 redirect_pages.append(title) 

4507 return True 

4508 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4509 # https://en.wiktionary.org/wiki/Template:ja-see 

4510 for key, value in template_node.template_parameters.items(): 

4511 if isinstance(key, int): 4511 ↛ 4510line 4511 didn't jump to line 4510 because the condition on line 4511 was always true

4512 title = clean_node(wxr, None, value) 

4513 if title != "": 4513 ↛ 4510line 4513 didn't jump to line 4510 because the condition on line 4513 was always true

4514 redirect_pages.append(title) 

4515 return True 

4516 return False 

4517 
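A short sketch of the contract, with invented titles: process_soft_redirect_template() returns True only for the soft-redirect templates and collects their target pages.

    # {{zh-see|大家}}       -> redirect_pages gains "大家", returns True
    # {{ja-see|彼処|彼所}}   -> redirect_pages gains both titles, returns True
    # any other template    -> redirect_pages unchanged, returns False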

4518 

4519def process_zh_forms_templates( 

4520 wxr: WiktextractContext, 

4521 template_node: TemplateNode, 

4522 base_data: WordData, 

4523) -> None: 

4524 # https://en.wiktionary.org/wiki/Template:zh-forms 

4525 if "forms" not in base_data: 

4526 base_data["forms"] = [] 

4527 for p_name, p_value in template_node.template_parameters.items(): 

4528 if not isinstance(p_name, str): 

4529 continue 

4530 if re.fullmatch(r"s\d*", p_name): 

4531 form_data: FormData = { 

4532 "form": clean_node(wxr, None, p_value), 

4533 "tags": ["Simplified Chinese"], 

4534 } 

4535 if len(form_data["form"]) > 0: 

4536 base_data["forms"].append(form_data) 

4537 elif re.fullmatch(r"t\d+", p_name): 

4538 form_data = { 

4539 "form": clean_node(wxr, None, p_value), 

4540 "tags": ["Traditional Chinese"], 

4541 } 

4542 if len(form_data["form"]) > 0: 

4543 base_data["forms"].append(form_data) 

4544 elif p_name == "alt": 

4545 for form_text in clean_node(wxr, None, p_value).split(","): 

4546 texts = form_text.split("-") 

4547 form_data = {"form": texts[0]} 

4548 if len(texts) > 1: 

4549 # pronunciation data could be added after "-" 

4550 # see https://en.wiktionary.org/wiki/新婦 

4551 form_data["raw_tags"] = texts[1:] 

4552 if len(form_data["form"]) > 0: 

4553 base_data["forms"].append(form_data) 

4554 elif p_name == "lit": 

4555 lit = clean_node(wxr, None, p_value) 

4556 if lit != "": 

4557 base_data["literal_meaning"] = lit 

4558 if len(base_data["forms"]) == 0: 

4559 del base_data["forms"]
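Finally, a sketch (invented parameter values) of what the {{zh-forms}} handling above adds to base_data:

    # {{zh-forms|s=<simp>|t1=<trad>|alt=<alt>-dialectal|lit=<literal>}} would yield:
    # base_data["forms"] == [
    #     {"form": "<simp>", "tags": ["Simplified Chinese"]},
    #     {"form": "<trad>", "tags": ["Traditional Chinese"]},
    #     {"form": "<alt>", "raw_tags": ["dialectal"]},
    # ]
    # base_data["literal_meaning"] == "<literal>"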