Coverage for src/wiktextract/extractor/en/page.py: 76%

1938 statements  

coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8from collections import defaultdict 

9from functools import partial 

10from typing import ( 

11 TYPE_CHECKING, 

12 Any, 

13 Iterable, 

14 Optional, 

15 Set, 

16 Union, 

17 cast, 

18) 

19 

20from mediawiki_langcodes import get_all_names, name_to_code 

21from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

22from wikitextprocessor.parser import ( 

23 LEVEL_KIND_FLAGS, 

24 GeneralNode, 

25 HTMLNode, 

26 LevelNode, 

27 NodeKind, 

28 TemplateNode, 

29 WikiNode, 

30) 

31 

32from ...clean import clean_template_args, clean_value 

33from ...datautils import ( 

34 data_append, 

35 data_extend, 

36 ns_title_prefix_tuple, 

37) 

38from ...page import ( 

39 LEVEL_KINDS, 

40 clean_node, 

41 is_panel_template, 

42 recursively_extract, 

43) 

44from ...tags import valid_tags 

45from ...wxr_context import WiktextractContext 

46from ...wxr_logging import logger 

47from ..ruby import extract_ruby, parse_ruby 

48from ..share import strip_nodes 

49from .descendant import extract_descendant_section 

50from .example import extract_example_list_item, extract_template_zh_x 

51from .form_descriptions import ( 

52 classify_desc, 

53 decode_tags, 

54 distw, 

55 parse_alt_or_inflection_of, 

56 parse_sense_qualifier, 

57 parse_word_head, 

58) 

59from .inflection import TableContext, parse_inflection_section 

60from .info_templates import ( 

61 INFO_TEMPLATE_FUNCS, 

62 parse_info_template_arguments, 

63 parse_info_template_node, 

64) 

65from .linkages import ( 

66 extract_alt_form_section, 

67 extract_zh_dial_template, 

68 parse_linkage_item_text, 

69) 

70from .parts_of_speech import PARTS_OF_SPEECH 

71from .section_titles import ( 

72 COMPOUNDS_TITLE, 

73 DESCENDANTS_TITLE, 

74 ETYMOLOGY_TITLES, 

75 IGNORED_TITLES, 

76 INFLECTION_TITLES, 

77 LINKAGE_TITLES, 

78 POS_TITLES, 

79 PRONUNCIATION_TITLE, 

80 PROTO_ROOT_DERIVED_TITLES, 

81 TRANSLATIONS_TITLE, 

82) 

83from .translations import parse_translation_item_text 

84from .type_utils import ( 

85 AttestationData, 

86 ExampleData, 

87 LinkageData, 

88 ReferenceData, 

89 SenseData, 

90 SoundData, 

91 TemplateData, 

92 WordData, 

93) 

94from .unsupported_titles import unsupported_title_map 

95 

96# When determining whether a string is 'english', classify_desc 

97# might return 'taxonomic' which is English text 99% of the time. 

98ENGLISH_TEXTS = ("english", "taxonomic") 
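# Typical membership check (added by the editor, not part of the original
# source; `desc` is a hypothetical string being classified):
#   if classify_desc(desc) in ENGLISH_TEXTS:
#       ...  # treat desc as English prose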

99 

100# Matches the names of head templates

101HEAD_TAG_RE = re.compile( 

102 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

103 r"hangul-symbol|syllable-hangul)$|" 

104 + r"^(latin|" 

105 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

106 + r")-(" 

107 + "|".join( 

108 [ 

109 "abbr", 

110 "adj", 

111 "adjective", 

112 "adjective form", 

113 "adjective-form", 

114 "adv", 

115 "adverb", 

116 "affix", 

117 "animal command", 

118 "art", 

119 "article", 

120 "aux", 

121 "bound pronoun", 

122 "bound-pronoun", 

123 "Buyla", 

124 "card num", 

125 "card-num", 

126 "cardinal", 

127 "chunom", 

128 "classifier", 

129 "clitic", 

130 "cls", 

131 "cmene", 

132 "cmavo", 

133 "colloq-verb", 

134 "colverbform", 

135 "combining form", 

136 "combining-form", 

137 "comparative", 

138 "con", 

139 "concord", 

140 "conj", 

141 "conjunction", 

142 "conjug", 

143 "cont", 

144 "contr", 

145 "converb", 

146 "daybox", 

147 "decl", 

148 "decl noun", 

149 "def", 

150 "dem", 

151 "det", 

152 "determ", 

153 "Deva", 

154 "ending", 

155 "entry", 

156 "form", 

157 "fuhivla", 

158 "gerund", 

159 "gismu", 

160 "hanja", 

161 "hantu", 

162 "hanzi", 

163 "head", 

164 "ideophone", 

165 "idiom", 

166 "inf", 

167 "indef", 

168 "infixed pronoun", 

169 "infixed-pronoun", 

170 "infl", 

171 "inflection", 

172 "initialism", 

173 "int", 

174 "interfix", 

175 "interj", 

176 "interjection", 

177 "jyut", 

178 "latin", 

179 "letter", 

180 "locative", 

181 "lujvo", 

182 "monthbox", 

183 "mutverb", 

184 "name", 

185 "nisba", 

186 "nom", 

187 "noun", 

188 "noun form", 

189 "noun-form", 

190 "noun plural", 

191 "noun-plural", 

192 "nounprefix", 

193 "num", 

194 "number", 

195 "numeral", 

196 "ord", 

197 "ordinal", 

198 "par", 

199 "part", 

200 "part form", 

201 "part-form", 

202 "participle", 

203 "particle", 

204 "past", 

205 "past neg", 

206 "past-neg", 

207 "past participle", 

208 "past-participle", 

209 "perfect participle", 

210 "perfect-participle", 

211 "personal pronoun", 

212 "personal-pronoun", 

213 "pref", 

214 "prefix", 

215 "phrase", 

216 "pinyin", 

217 "plural noun", 

218 "plural-noun", 

219 "pos", 

220 "poss-noun", 

221 "post", 

222 "postp", 

223 "postposition", 

224 "PP", 

225 "pp", 

226 "ppron", 

227 "pred", 

228 "predicative", 

229 "prep", 

230 "prep phrase", 

231 "prep-phrase", 

232 "preposition", 

233 "present participle", 

234 "present-participle", 

235 "pron", 

236 "prondem", 

237 "pronindef", 

238 "pronoun", 

239 "prop", 

240 "proper noun", 

241 "proper-noun", 

242 "proper noun form", 

243 "proper-noun form", 

244 "proper noun-form", 

245 "proper-noun-form", 

246 "prov", 

247 "proverb", 

248 "prpn", 

249 "prpr", 

250 "punctuation mark", 

251 "punctuation-mark", 

252 "regnoun", 

253 "rel", 

254 "rom", 

255 "romanji", 

256 "root", 

257 "sign", 

258 "suff", 

259 "suffix", 

260 "syllable", 

261 "symbol", 

262 "verb", 

263 "verb form", 

264 "verb-form", 

265 "verbal noun", 

266 "verbal-noun", 

267 "verbnec", 

268 "vform", 

269 ] 

270 ) 

271 + r")(-|/|\+|$)" 

272) 
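# Illustrative examples (added by the editor, not part of the original source;
# they assume get_all_names("en") yields language codes such as "en" and "fi"
# as the first tuple element):
#   HEAD_TAG_RE.search("head")          # matches the first alternative
#   HEAD_TAG_RE.search("en-noun")       # matches: code "en" + "-" + "noun" + end
#   HEAD_TAG_RE.search("fi-verb-form")  # matches: "verb" is followed by "-"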

273 

274# Head-templates causing problems (like newlines) that can be squashed into 

275# an empty string in the template handler while saving their template 

276# data for later. 

277WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

278 

279FLOATING_TABLE_TEMPLATES: set[str] = { 

280 # az-suffix-form creates a style=floatright div that is otherwise 

281 # deleted; if it is not pre-expanded, we can intercept the template 

282 # so we add this set into do_not_pre_expand, and intercept the 

283 # templates in parse_part_of_speech 

284 "az-suffix-forms", 

285 "az-inf-p", 

286 "kk-suffix-forms", 

287 "ky-suffix-forms", 

288 "tr-inf-p", 

289 "tr-suffix-forms", 

290 "tt-suffix-forms", 

291 "uz-suffix-forms", 

292} 

293# These two should contain template names that should always be 

294# pre-expanded when *first* processing the tree, or not pre-expanded 

295# so that the template are left in place with their identifying 

296# name intact for later filtering. 

297 

298DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

299DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

300 

301# Additional templates to be expanded in the pre-expand phase 

302ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

303 "multitrans", 

304 "multitrans-nowiki", 

305 "trans-top", 

306 "trans-top-also", 

307 "trans-bottom", 

308 "checktrans-top", 

309 "checktrans-bottom", 

310 "col1", 

311 "col2", 

312 "col3", 

313 "col4", 

314 "col5", 

315 "col1-u", 

316 "col2-u", 

317 "col3-u", 

318 "col4-u", 

319 "col5-u", 

320 "check deprecated lang param usage", 

321 "deprecated code", 

322 "ru-verb-alt-ё", 

323 "ru-noun-alt-ё", 

324 "ru-adj-alt-ё", 

325 "ru-proper noun-alt-ё", 

326 "ru-pos-alt-ё", 

327 "ru-alt-ё", 

328 "inflection of", 

329 "no deprecated lang param usage", 

330 "transclude", # these produce sense entries (or other lists) 

331 "tcl", 

332} 

333 

334# Inverse linkage for those that have them 

335linkage_inverses: dict[str, str] = { 

336 # XXX this is not currently used, move to post-processing 

337 "synonyms": "synonyms", 

338 "hypernyms": "hyponyms", 

339 "hyponyms": "hypernyms", 

340 "holonyms": "meronyms", 

341 "meronyms": "holonyms", 

342 "derived": "derived_from", 

343 "coordinate_terms": "coordinate_terms", 

344 "troponyms": "hypernyms", 

345 "antonyms": "antonyms", 

346 "instances": "instance_of", 

347 "related": "related", 

348} 

349 

350# Templates that are used to form panels on pages and that 

351# should be ignored in various positions 

352PANEL_TEMPLATES: set[str] = { 

353 "Character info", 

354 "CJKV", 

355 "French personal pronouns", 

356 "French possessive adjectives", 

357 "French possessive pronouns", 

358 "Han etym", 

359 "Japanese demonstratives", 

360 "Latn-script", 

361 "LDL", 

362 "MW1913Abbr", 

363 "Number-encoding", 

364 "Nuttall", 

365 "Spanish possessive adjectives", 

366 "Spanish possessive pronouns", 

367 "USRegionDisputed", 

368 "Webster 1913", 

369 "ase-rfr", 

370 "attention", 

371 "attn", 

372 "beer", 

373 "broken ref", 

374 "ca-compass", 

375 "character info", 

376 "character info/var", 

377 "checksense", 

378 "compass-fi", 

379 "copyvio suspected", 

380 "delete", 

381 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

382 "etystub", 

383 "examples", 

384 "hu-corr", 

385 "hu-suff-pron", 

386 "interwiktionary", 

387 "ja-kanjitab", 

388 "ko-hanja-search", 

389 "look", 

390 "maintenance box", 

391 "maintenance line", 

392 "mediagenic terms", 

393 "merge", 

394 "missing template", 

395 "morse links", 

396 "move", 

397 "multiple images", 

398 "no inline", 

399 "picdic", 

400 "picdicimg", 

401 "picdiclabel", 

402 "polyominoes", 

403 "predidential nomics", 

404 "punctuation", # This actually gets pre-expanded 

405 "reconstructed", 

406 "request box", 

407 "rf-sound example", 

408 "rfaccents", 

409 "rfap", 

410 "rfaspect", 

411 "rfc", 

412 "rfc-auto", 

413 "rfc-header", 

414 "rfc-level", 

415 "rfc-pron-n", 

416 "rfc-sense", 

417 "rfclarify", 

418 "rfd", 

419 "rfd-redundant", 

420 "rfd-sense", 

421 "rfdate", 

422 "rfdatek", 

423 "rfdef", 

424 "rfe", 

425 "rfe/dowork", 

426 "rfex", 

427 "rfexp", 

428 "rfform", 

429 "rfgender", 

430 "rfi", 

431 "rfinfl", 

432 "rfm", 

433 "rfm-sense", 

434 "rfp", 

435 "rfp-old", 

436 "rfquote", 

437 "rfquote-sense", 

438 "rfquotek", 

439 "rfref", 

440 "rfscript", 

441 "rft2", 

442 "rftaxon", 

443 "rftone", 

444 "rftranslit", 

445 "rfv", 

446 "rfv-etym", 

447 "rfv-pron", 

448 "rfv-quote", 

449 "rfv-sense", 

450 "selfref", 

451 "split", 

452 "stroke order", # XXX consider capturing this? 

453 "stub entry", 

454 "t-needed", 

455 "tbot entry", 

456 "tea room", 

457 "tea room sense", 

458 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

459 "unblock", 

460 "unsupportedpage", 

461 "video frames", 

462 "was wotd", 

463 "wrongtitle", 

464 "zh-forms", 

465 "zh-hanzi-box", 

466 "no entry", 

467} 

468 

469# Template name prefixes used for language-specific panel templates (i.e., 

470# templates that create side boxes or notice boxes or that should generally 

471# be ignored). 

472PANEL_PREFIXES: set[str] = { 

473 "list:compass points/", 

474 "list:Gregorian calendar months/", 

475 "RQ:", 

476} 

477 

478# Templates used for wikipedia links. 

479wikipedia_templates: set[str] = { 

480 "wikipedia", 

481 "slim-wikipedia", 

482 "w", 

483 "W", 

484 "swp", 

485 "wiki", 

486 "Wikipedia", 

487 "wtorw", 

488} 

489for x in PANEL_PREFIXES & wikipedia_templates:

490 print( 

491 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

492 x 

493 ) 

494 ) 

495 

496# Mapping from a template name (without language prefix) for the main word 

497# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

498# it could validly occur. This is used as just a sanity check to give 

499# warnings about probably incorrect coding in Wiktionary. 

500template_allowed_pos_map: dict[str, list[str]] = { 

501 "abbr": ["abbrev"], 

502 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

503 "plural noun": ["noun", "name"], 

504 "plural-noun": ["noun", "name"], 

505 "proper noun": ["noun", "name"], 

506 "proper-noun": ["name", "noun"], 

507 "prop": ["name", "noun"], 

508 "verb": ["verb", "phrase"], 

509 "gerund": ["verb"], 

510 "particle": ["adv", "particle"], 

511 "adj": ["adj", "adj_noun"], 

512 "pron": ["pron", "noun"], 

513 "name": ["name", "noun"], 

514 "adv": ["adv", "intj", "conj", "particle"], 

515 "phrase": ["phrase", "prep_phrase"], 

516 "noun phrase": ["phrase"], 

517 "ordinal": ["num"], 

518 "number": ["num"], 

519 "pos": ["affix", "name", "num"], 

520 "suffix": ["suffix", "affix"], 

521 "character": ["character"], 

522 "letter": ["character"], 

523 "kanji": ["character"], 

524 "cont": ["abbrev"], 

525 "interj": ["intj"], 

526 "con": ["conj"], 

527 "part": ["particle"], 

528 "prep": ["prep", "postp"], 

529 "postp": ["postp"], 

530 "misspelling": ["noun", "adj", "verb", "adv"], 

531 "part-form": ["verb"], 

532} 

533for k, v in template_allowed_pos_map.items(): 

534 for x in v: 

535 if x not in PARTS_OF_SPEECH:

536 print( 

537 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

538 "".format(x, k, v) 

539 ) 

540 assert False 

541 

542 

543# Templates ignored during etymology extraction, i.e., these will not be listed 

544# in the extracted etymology templates. 

545ignored_etymology_templates: list[str] = [ 

546 "...", 

547 "IPAchar", 

548 "ipachar", 

549 "ISBN", 

550 "isValidPageName", 

551 "redlink category", 

552 "deprecated code", 

553 "check deprecated lang param usage", 

554 "para", 

555 "p", 

556 "cite", 

557 "Cite news", 

558 "Cite newsgroup", 

559 "cite paper", 

560 "cite MLLM 1976", 

561 "cite journal", 

562 "cite news/documentation", 

563 "cite paper/documentation", 

564 "cite video game", 

565 "cite video game/documentation", 

566 "cite newsgroup", 

567 "cite newsgroup/documentation", 

568 "cite web/documentation", 

569 "cite news", 

570 "Cite book", 

571 "Cite-book", 

572 "cite book", 

573 "cite web", 

574 "cite-usenet", 

575 "cite-video/documentation", 

576 "Cite-journal", 

577 "rfe", 

578 "catlangname", 

579 "cln", 

580 "langname-lite", 

581 "no deprecated lang param usage", 

582 "mention", 

583 "m", 

584 "m-self", 

585 "link", 

586 "l", 

587 "ll", 

588 "l-self", 

589] 

590# Regexp for matching ignored etymology template names. This adds certain 

591# prefixes to the names listed above. 

592ignored_etymology_templates_re = re.compile( 

593 r"^((cite-|R:|RQ:).*|" 

594 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

595 + r")$" 

596) 
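# Illustrative behaviour (added by the editor, not part of the original source):
#   ignored_etymology_templates_re.match("R:OED")  # matches via the "R:" prefix
#   ignored_etymology_templates_re.match("m")      # matches: "m" is listed above
#   ignored_etymology_templates_re.match("bor")    # None: not an ignored template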

597 

598# Regexp for matching ignored descendants template names. Right now we just 

599# copy the ignored etymology templates 

600ignored_descendants_templates_re = ignored_etymology_templates_re 

601 

602# Set of template names that are used to define usage examples. If the usage 

603# example contains one of these templates, then its type is set to

604# "example" 

605usex_templates: set[str] = { 

606 "afex", 

607 "affixusex", 

608 "co", # {{collocation}} acts like a example template, specifically for 

609 # pairs of combinations of words that are more common than you'd 

610 # except would be randomly; hlavní#Czech 

611 "coi", 

612 "collocation", 

613 "el-example", 

614 "el-x", 

615 "example", 

616 "examples", 

617 "he-usex", 

618 "he-x", 

619 "hi-usex", 

620 "hi-x", 

621 "ja-usex-inline", 

622 "ja-usex", 

623 "ja-x", 

624 "jbo-example", 

625 "jbo-x", 

626 "km-usex", 

627 "km-x", 

628 "ko-usex", 

629 "ko-x", 

630 "lo-usex", 

631 "lo-x", 

632 "ne-x", 

633 "ne-usex", 

634 "prefixusex", 

635 "ryu-usex", 

636 "ryu-x", 

637 "shn-usex", 

638 "shn-x", 

639 "suffixusex", 

640 "th-usex", 

641 "th-x", 

642 "ur-usex", 

643 "ur-x", 

644 "usex", 

645 "usex-suffix", 

646 "ux", 

647 "uxi", 

648} 

649 

650stop_head_at_these_templates: set[str] = { 

651 "category", 

652 "cat", 

653 "topics", 

654 "catlangname", 

655 "c", 

656 "C", 

657 "top", 

658 "cln", 

659} 

660 

661# Set of template names that are used to define quotation examples. If the 

662# usage example contains one of these templates, then its type is set to 

663# "quotation". 

664quotation_templates: set[str] = { 

665 "collapse-quote", 

666 "quote-av", 

667 "quote-book", 

668 "quote-GYLD", 

669 "quote-hansard", 

670 "quotei", 

671 "quote-journal", 

672 "quotelite", 

673 "quote-mailing list", 

674 "quote-meta", 

675 "quote-newsgroup", 

676 "quote-song", 

677 "quote-text", 

678 "quote", 

679 "quote-us-patent", 

680 "quote-video game", 

681 "quote-web", 

682 "quote-wikipedia", 

683 "wikiquote", 

684 "Wikiquote", 

685} 

686 

687taxonomy_templates = { 

688 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

689 "taxfmt", 

690 "taxlink", 

691 "taxlink2", 

692 "taxlinknew", 

693 "taxlook", 

694} 

695 

696# Template name component to linkage section listing. Integer section means 

697# default section, starting at that argument. 

698# XXX not used anymore, except for the first elements: moved to 

699# template_linkages 

700# template_linkage_mappings: list[list[Union[str, int]]] = [ 

701# ["syn", "synonyms"], 

702# ["synonyms", "synonyms"], 

703# ["ant", "antonyms"], 

704# ["antonyms", "antonyms"], 

705# ["hyp", "hyponyms"], 

706# ["hyponyms", "hyponyms"], 

707# ["der", "derived"], 

708# ["derived terms", "derived"], 

709# ["coordinate terms", "coordinate_terms"], 

710# ["rel", "related"], 

711# ["col", 2], 

712# ] 

713 

714# Template names; this was extracted from template_linkage_mappings,

715# because the code using template_linkage_mappings was actually not used 

716# (but not removed). 

717template_linkages_to_ignore_in_examples: set[str] = { 

718 "syn", 

719 "synonyms", 

720 "ant", 

721 "antonyms", 

722 "hyp", 

723 "hyponyms", 

724 "der", 

725 "derived terms", 

726 "coordinate terms", 

727 "cot", 

728 "rel", 

729 "col", 

730 "inline alt forms", 

731 "alti", 

732 "comeronyms", 

733 "holonyms", 

734 "holo", 

735 "hypernyms", 

736 "hyper", 

737 "meronyms", 

738 "mero", 

739 "troponyms", 

740 "perfectives", 

741 "pf", 

742 "imperfectives", 

743 "impf", 

744 "syndiff", 

745 "synsee", 

746 # not linkage nor example templates 

747 "sense", 

748 "s", 

749 "color panel", 

750 "colour panel", 

751} 

752 

753# Maps template name used in a word sense to a linkage field that it adds. 

754sense_linkage_templates: dict[str, str] = { 

755 "syn": "synonyms", 

756 "synonyms": "synonyms", 

757 "synsee": "synonyms", 

758 "syndiff": "synonyms", 

759 "hyp": "hyponyms", 

760 "hyponyms": "hyponyms", 

761 "ant": "antonyms", 

762 "antonyms": "antonyms", 

763 "alti": "related", 

764 "inline alt forms": "related", 

765 "coordinate terms": "coordinate_terms", 

766 "cot": "coordinate_terms", 

767 "comeronyms": "related", 

768 "holonyms": "holonyms", 

769 "holo": "holonyms", 

770 "hypernyms": "hypernyms", 

771 "hyper": "hypernyms", 

772 "meronyms": "meronyms", 

773 "mero": "meronyms", 

774 "troponyms": "troponyms", 

775 "perfectives": "related", 

776 "pf": "related", 

777 "imperfectives": "related", 

778 "impf": "related", 

779} 

780 

781sense_linkage_templates_tags: dict[str, list[str]] = { 

782 "alti": ["alternative"], 

783 "inline alt forms": ["alternative"], 

784 "comeronyms": ["comeronym"], 

785 "perfectives": ["perfective"], 

786 "pf": ["perfective"], 

787 "imperfectives": ["imperfective"], 

788 "impf": ["imperfective"], 

789} 

790 

791 

792def decode_html_entities(v: Union[str, int]) -> str: 

793 """Decodes HTML entities from a value, converting them to the respective 

794 Unicode characters/strings.""" 

795 if isinstance(v, int): 

796 # I changed this to return str(v) instead of v = str(v), 

797 # but there might have been the intention to have more logic 

798 # here. html.unescape would not do anything special with an integer, 

799 # it needs html escape symbols (&xx;). 

800 return str(v) 

801 return html.unescape(v) 
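# Example usage (added by the editor, not part of the original source):
#   decode_html_entities("Tom &amp; Jerry")  # -> "Tom & Jerry"
#   decode_html_entities(42)                 # -> "42"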

802 

803 

804def parse_sense_linkage( 

805 wxr: WiktextractContext, 

806 data: SenseData, 

807 name: str, 

808 ht: TemplateArgs, 

809 pos: str, 

810) -> None: 

811 """Parses a linkage (synonym, etc) specified in a word sense.""" 

812 assert isinstance(wxr, WiktextractContext) 

813 assert isinstance(data, dict) 

814 assert isinstance(name, str) 

815 assert isinstance(ht, dict) 

816 field = sense_linkage_templates[name] 

817 field_tags = sense_linkage_templates_tags.get(name, []) 

818 for i in range(2, 20): 

819 w = ht.get(i) or "" 

820 w = clean_node(wxr, data, w) 

821 is_thesaurus = False 

822 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

823 if w.startswith(alias):

824 is_thesaurus = True 

825 w = w[len(alias) :] 

826 if w != wxr.wtp.title: 

827 from ...thesaurus import search_thesaurus 

828 

829 lang_code = clean_node(wxr, None, ht.get(1, "")) 

830 for t_data in search_thesaurus( 

831 wxr.thesaurus_db_conn, w, lang_code, pos, field 

832 ): 

833 l_data = { 

834 "word": t_data.term, 

835 "source": "Thesaurus:" + w, 

836 } 

837 if len(t_data.tags) > 0: 

838 l_data["tags"] = t_data.tags 

839 if len(t_data.raw_tags) > 0: 

840 l_data["raw_tags"] = t_data.raw_tags 

841 data_append(data, field, l_data) 

842 break 

843 if not w: 

844 break 

845 if is_thesaurus:

846 continue 

847 tags: list[str] = [] 

848 topics: list[str] = [] 

849 english: Optional[str] = None 

850 # Try to find qualifiers for this synonym 

851 q = ht.get("q{}".format(i - 1)) 

852 if q: 

853 cls = classify_desc(q) 

854 if cls == "tags": 

855 tagsets1, topics1 = decode_tags(q) 

856 for ts in tagsets1: 

857 tags.extend(ts) 

858 topics.extend(topics1) 

859 elif cls == "english": 859 ↛ 865line 859 didn't jump to line 865 because the condition on line 859 was always true

860 if english:

861 english += "; " + q 

862 else: 

863 english = q 

864 # Try to find English translation for this synonym 

865 t = ht.get("t{}".format(i - 1)) 

866 if t:

867 if english: 

868 english += "; " + t 

869 else: 

870 english = t 

871 

872 # See if the linkage contains a parenthesized alt 

873 alt = None 

874 m = re.search(r"\(([^)]+)\)$", w) 

875 if m:

876 w = w[: m.start()].strip() 

877 alt = m.group(1) 

878 

879 dt = {"word": w} 

880 if field_tags:

881 data_extend(dt, "tags", field_tags) 

882 if tags: 

883 data_extend(dt, "tags", tags) 

884 if topics:

885 data_extend(dt, "topics", topics) 

886 if english: 

887 dt["english"] = english # DEPRECATED for "translation" 

888 dt["translation"] = english 

889 if alt:

890 dt["alt"] = alt 

891 data_append(data, field, dt) 

892 

893 

894EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

895example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

896captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 
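# Illustrative difference between the two regexes (added by the editor, not
# part of the original source); the capturing variant keeps the separators,
# which synch_splits_with_args() below relies on when re-joining parts:
#   example_splitter_re.split("oma talo ― own house")
#   # -> ["oma talo", "own house"]
#   captured_splitters_re.split("oma talo ― own house")
#   # -> ["oma talo", " ― ", "own house"]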

897 

898 

899def synch_splits_with_args( 

900 line: str, targs: TemplateArgs 

901) -> Optional[list[str]]: 

902 """If it looks like there's something weird with how a line of example 

903 text has been split, this function will do the splitting after counting 

904 occurrences of the splitting regex inside the two main template arguments

905 containing the string data for the original language example and the 

906 English translations. 

907 """ 

908 # Previously, we split without capturing groups, but here we want to 

909 # keep the original splitting hyphen regex intact. 

910 fparts = captured_splitters_re.split(line) 

911 new_parts = [] 

912 # ["First", " – ", "second", " – ", "third..."] from OL argument 

913 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 

914 new_parts.append("".join(fparts[:first])) 

915 # Translation argument 

916 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

917 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

918 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

919 new_parts.append("".join(fparts[first + 1 : second])) 

920 

921 if all(new_parts): # no empty strings from the above spaghetti 

922 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

923 return new_parts 

924 else: 

925 return None 
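# Minimal sketch of the intended behaviour (added by the editor, not part of
# the original source; the argument layout, original text in arg 2 and the
# translation in arg 3 or "t"/"translation", follows the .get() calls above):
#   line  = "iso talo ― a big house ― literally: big house"
#   targs = {2: "iso talo", 3: "a big house ― literally: big house"}
#   synch_splits_with_args(line, targs)
#   # -> ["iso talo", "a big house ― literally: big house"]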

926 

927 

928QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

929QUALIFIERS_RE = re.compile(QUALIFIERS) 

930# (...): ... or (...(...)...): ... 
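# Illustrative matches (added by the editor, not part of the original source):
#   QUALIFIERS_RE.match("(colloquial, dated): rest")  # group(1) == "colloquial, dated"
#   QUALIFIERS_RE.match("(slang (rare)) rest")        # one level of nesting is allowed
#   QUALIFIERS_RE.match("no qualifier here")          # None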

931 

932 

933def parse_language( 

934 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

935) -> list[WordData]: 

936 """Iterates over the text of the page, returning words (parts-of-speech) 

937 defined on the page one at a time. (Individual word senses for the 

938 same part-of-speech are typically encoded in the same entry.)""" 

939 # imported here to avoid circular import 

940 from .pronunciation import parse_pronunciation 

941 

942 assert isinstance(wxr, WiktextractContext) 

943 assert isinstance(langnode, WikiNode) 

944 assert isinstance(language, str) 

945 assert isinstance(lang_code, str) 

946 # print("parse_language", language) 

947 

948 is_reconstruction = False 

949 word: str = wxr.wtp.title # type: ignore[assignment] 

950 unsupported_prefix = "Unsupported titles/" 

951 if word.startswith(unsupported_prefix): 

952 w = word[len(unsupported_prefix) :] 

953 if w in unsupported_title_map:

954 word = unsupported_title_map[w] 

955 else: 

956 wxr.wtp.error( 

957 "Unimplemented unsupported title: {}".format(word), 

958 sortid="page/870", 

959 ) 

960 word = w 

961 elif word.startswith("Reconstruction:"): 

962 word = word[word.find("/") + 1 :] 

963 is_reconstruction = True 

964 

965 base_data: WordData = { 

966 "word": word, 

967 "lang": language, 

968 "lang_code": lang_code, 

969 } 

970 if is_reconstruction: 

971 data_append(base_data, "tags", "reconstruction") 

972 sense_data: SenseData = {} 

973 pos_data: WordData = {} # For a current part-of-speech 

974 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

975 etym_data: WordData = {} # For one etymology 

976 sense_datas: list[SenseData] = [] 

977 sense_ordinal = 0 # The recursive sense parsing messes up the ordering 

978 # Never reset, do not use as data 

979 level_four_datas: list[WordData] = [] 

980 etym_datas: list[WordData] = [] 

981 page_datas: list[WordData] = [] 

982 have_etym = False 

983 inside_level_four = False # This is for checking if the etymology section 

984 # or article has a Pronunciation section, for Chinese mostly; because 

985 # Chinese articles can have three level three sections (two etymology 

986 # sections and pronunciation sections) one after another, we need a kludge 

987 # to better keep track of whether we're in a normal "etym" or inside a 

988 # "level four" (which is what we've turned the level three Pron sections 

989 # into in fix_subtitle_hierarchy(); all other sections are demoted by

990 # a step).

991 stack: list[str] = [] # names of items on the "stack" 

992 

993 def merge_base(data: WordData, base: WordData) -> None: 

994 for k, v in base.items(): 

995 # Copy the value to ensure that we don't share lists or 

996 # dicts between structures (even nested ones). 

997 v = copy.deepcopy(v) 

998 if k not in data: 

999 # The list was copied above, so this will not create shared ref 

1000 data[k] = v # type: ignore[literal-required] 

1001 continue 

1002 if data[k] == v: # type: ignore[literal-required] 

1003 continue 

1004 if (

1005 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

1006 or isinstance( 

1007 v, 

1008 (list, tuple), # Should this be "and"? 

1009 ) 

1010 ): 

1011 data[k] = list(data[k]) + list(v) # type: ignore 

1012 elif data[k] != v: # type: ignore[literal-required] 

1013 wxr.wtp.warning( 

1014 "conflicting values for {} in merge_base: " 

1015 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1016 sortid="page/904", 

1017 ) 

1018 
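 # Sketch of merge_base behaviour (added by the editor, not part of the
 # original source): copied values are deep-copied and list-valued fields are
 # concatenated rather than overwritten:
 #   data = {"lang": "Finnish", "tags": ["rare"]}
 #   base = {"lang": "Finnish", "tags": ["dialectal"], "lang_code": "fi"}
 #   merge_base(data, base)
 #   # data == {"lang": "Finnish", "tags": ["rare", "dialectal"],
 #   #          "lang_code": "fi"}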

1019 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1020 """Remove unnecessary keys from dict values 

1021 in a list comprehension...""" 

1022 if key in pron: 

1023 pron.pop(key) # type: ignore 

1024 return pron 

1025 
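 # Example (added by the editor, not part of the original source):
 #   complementary_pop({"ipa": "/talo/", "pos": "noun"}, "pos")
 #   # -> {"ipa": "/talo/"}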

1026 # If the result has sounds, eliminate sounds whose "form" does not

1027 # match "word" or one of the "forms"

1028 if "sounds" in data and "word" in data: 

1029 accepted = [data["word"]] 

1030 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1031 data["sounds"] = list( 

1032 s 

1033 for s in data["sounds"] 

1034 if "form" not in s or s["form"] in accepted 

1035 ) 

1036 # If the result has sounds, eliminate sounds that have a pos that 

1037 # does not match "pos" 

1038 if "sounds" in data and "pos" in data: 

1039 data["sounds"] = list( 

1040 complementary_pop(s, "pos") 

1041 for s in data["sounds"] 

1042 # "pos" is not a field of SoundData, correctly, so we're 

1043 # removing it here. It's a kludge on a kludge on a kludge. 

1044 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

1045 ) 

1046 

1047 def push_sense(sorting_ordinal: int | None = None) -> bool: 

1048 """Starts collecting data for a new word sense. This returns True 

1049 if a sense was added.""" 

1050 nonlocal sense_data 

1051 if sorting_ordinal is None: 

1052 sorting_ordinal = sense_ordinal 

1053 tags = sense_data.get("tags", ()) 

1054 if ( 

1055 not sense_data.get("glosses") 

1056 and "translation-hub" not in tags 

1057 and "no-gloss" not in tags 

1058 ): 

1059 return False 

1060 

1061 if (

1062 ( 

1063 "participle" in sense_data.get("tags", ()) 

1064 or "infinitive" in sense_data.get("tags", ()) 

1065 ) 

1066 and "alt_of" not in sense_data 

1067 and "form_of" not in sense_data 

1068 and "etymology_text" in etym_data 

1069 and etym_data["etymology_text"] != "" 

1070 ): 

1071 etym = etym_data["etymology_text"] 

1072 etym = etym.split(". ")[0] 

1073 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1074 if ret is not None: 

1075 tags, lst = ret 

1076 assert isinstance(lst, (list, tuple)) 

1077 if "form-of" in tags: 

1078 data_extend(sense_data, "form_of", lst) 

1079 data_extend(sense_data, "tags", tags) 

1080 elif "alt-of" in tags: 

1081 data_extend(sense_data, "alt_of", lst) 

1082 data_extend(sense_data, "tags", tags) 

1083 

1084 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(

1085 "tags", () 

1086 ): 

1087 data_append(sense_data, "tags", "no-gloss") 

1088 

1089 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal 

1090 sense_datas.append(sense_data) 

1091 sense_data = {} 

1092 return True 

1093 

1094 def push_pos(sorting_ordinal: int | None = None) -> None: 

1095 """Starts collecting data for a new part-of-speech.""" 

1096 nonlocal pos_data 

1097 nonlocal sense_datas 

1098 push_sense(sorting_ordinal) 

1099 if wxr.wtp.subsection: 

1100 data: WordData = {"senses": sense_datas} 

1101 merge_base(data, pos_data) 

1102 level_four_datas.append(data) 

1103 pos_data = {} 

1104 sense_datas = [] 

1105 wxr.wtp.start_subsection(None) 

1106 

1107 def push_level_four_section(clear_sound_data: bool) -> None: 

1108 """Starts collecting data for a new level four sections, which 

1109 is usually virtual and empty, unless the article has Chinese 

1110 'Pronunciation' sections that are etymology-section-like but 

1111 under etymology, and at the same level in the source. We modify 

1112 the source to demote Pronunciation sections like that to level 

1113 4, and other sections one step lower.""" 

1114 nonlocal level_four_data 

1115 nonlocal level_four_datas 

1116 nonlocal etym_datas 

1117 push_pos() 

1118 # print(f"======\n{etym_data=}") 

1119 # print(f"======\n{etym_datas=}") 

1120 # print(f"======\n{level_four_data=}") 

1121 # print(f"======\n{level_four_datas=}") 

1122 for data in level_four_datas: 

1123 merge_base(data, level_four_data) 

1124 etym_datas.append(data) 

1125 for data in etym_datas: 

1126 merge_base(data, etym_data) 

1127 page_datas.append(data) 

1128 if clear_sound_data: 

1129 level_four_data = {} 

1130 level_four_datas = [] 

1131 etym_datas = [] 

1132 

1133 def push_etym() -> None: 

1134 """Starts collecting data for a new etymology.""" 

1135 nonlocal etym_data 

1136 nonlocal etym_datas 

1137 nonlocal have_etym 

1138 nonlocal inside_level_four 

1139 have_etym = True 

1140 push_level_four_section(False) 

1141 inside_level_four = False 

1142 # the etymology section could be under a pronunciation section

1143 etym_data = ( 

1144 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {} 

1145 ) 

1146 

1147 def select_data() -> WordData: 

1148 """Selects where to store data (pos or etym) based on whether we 

1149 are inside a pos (part-of-speech).""" 

1150 # print(f"{wxr.wtp.subsection=}") 

1151 # print(f"{stack=}") 

1152 if wxr.wtp.subsection is not None: 

1153 return pos_data 

1154 if inside_level_four: 

1155 return level_four_data 

1156 if stack[-1] == language: 

1157 return base_data 

1158 return etym_data 

1159 

1160 term_label_templates: list[TemplateData] = [] 

1161 

1162 def head_post_template_fn( 

1163 name: str, ht: TemplateArgs, expansion: str 

1164 ) -> Optional[str]: 

1165 """Handles special templates in the head section of a word. Head 

1166 section is the text after part-of-speech subtitle and before word 

1167 sense list. Typically it generates the bold line for the word, but 

1168 may also contain other useful information that often ends in 

1169 side boxes. We want to capture some of that additional information.""" 

1170 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1171 if is_panel_template(wxr, name):

1172 # Completely ignore these templates (not even recorded in 

1173 # head_templates) 

1174 return "" 

1175 if name == "head": 

1176 # XXX are these also captured in forms? Should this special case 

1177 # be removed? 

1178 t = ht.get(2, "") 

1179 if t == "pinyin": 1179 ↛ 1180line 1179 didn't jump to line 1180 because the condition on line 1179 was never true

1180 data_append(pos_data, "tags", "Pinyin") 

1181 elif t == "romanization": 1181 ↛ 1182line 1181 didn't jump to line 1182 because the condition on line 1181 was never true

1182 data_append(pos_data, "tags", "romanization") 

1183 if ( 

1184 HEAD_TAG_RE.search(name) is not None 

1185 or name in WORD_LEVEL_HEAD_TEMPLATES 

1186 ): 

1187 args_ht = clean_template_args(wxr, ht) 

1188 cleaned_expansion = clean_node(wxr, None, expansion) 

1189 dt: TemplateData = { 

1190 "name": name, 

1191 "args": args_ht, 

1192 "expansion": cleaned_expansion, 

1193 } 

1194 data_append(pos_data, "head_templates", dt) 

1195 if name in WORD_LEVEL_HEAD_TEMPLATES: 

1196 term_label_templates.append(dt) 

1197 # Squash these, their tags are applied to the whole word, 

1198 # and some cause problems like "term-label" 

1199 return "" 

1200 

1201 # The following are both captured in head_templates and parsed 

1202 # separately 

1203 

1204 if name in wikipedia_templates: 

1205 # Note: various places expect to have content from wikipedia 

1206 # templates, so cannot convert this to empty 

1207 parse_wikipedia_template(wxr, pos_data, ht) 

1208 return None 

1209 

1210 if name == "number box": 1210 ↛ 1212line 1210 didn't jump to line 1212 because the condition on line 1210 was never true

1211 # XXX extract numeric value? 

1212 return "" 

1213 if name == "enum": 

1214 # XXX extract? 

1215 return "" 

1216 if name == "cardinalbox": 1216 ↛ 1219line 1216 didn't jump to line 1219 because the condition on line 1216 was never true

1217 # XXX extract similar to enum? 

1218 # XXX this can also occur in top-level under language 

1219 return "" 

1220 if name == "Han simplified forms": 1220 ↛ 1222line 1220 didn't jump to line 1222 because the condition on line 1220 was never true

1221 # XXX extract? 

1222 return "" 

1223 # if name == "ja-kanji forms": 

1224 # # XXX extract? 

1225 # return "" 

1226 # if name == "vi-readings": 

1227 # # XXX extract? 

1228 # return "" 

1229 # if name == "ja-kanji": 

1230 # # XXX extract? 

1231 # return "" 

1232 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 1232 ↛ 1234line 1232 didn't jump to line 1234 because the condition on line 1232 was never true

1233 # XXX extract? 

1234 return "" 

1235 

1236 return None 

1237 

1238 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1239 """Parses the subsection for a part-of-speech under a language on 

1240 a page.""" 

1241 assert isinstance(posnode, WikiNode) 

1242 assert isinstance(pos, str) 

1243 # print("parse_part_of_speech", pos) 

1244 pos_data["pos"] = pos 

1245 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1246 lists: list[list[WikiNode]] = [[]] # list of lists 

1247 first_para = True 

1248 first_head_tmplt = True 

1249 collecting_head = True 

1250 start_of_paragraph = True 

1251 

1252 # XXX extract templates from posnode with recursively_extract 

1253 # that break stuff, like ja-kanji or az-suffix-form. 

1254 # Do the extraction with a list of template names, combined from 

1255 # different lists, then separate out them into different lists 

1256 # that are handled at different points of the POS section. 

1257 # First, extract az-suffix-form, put it in `inflection`, 

1258 # and parse `inflection`'s content when appropriate later. 

1259 # The contents of az-suffix-form (and ja-kanji) that generate 

1260 # divs with "floatright" in their style gets deleted by 

1261 # clean_value, so templates that slip through from here won't 

1262 # break anything. 

1263 # XXX bookmark 

1264 # print("===================") 

1265 # print(posnode.children) 

1266 

1267 floaters, poschildren = recursively_extract( 

1268 posnode.children, 

1269 lambda x: ( 

1270 isinstance(x, WikiNode) 

1271 and ( 

1272 ( 

1273 x.kind == NodeKind.TEMPLATE 

1274 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES 

1275 ) 

1276 or ( 

1277 x.kind == NodeKind.LINK 

1278 # Need to check for stringiness because some links are 

1279 # broken; for example, if a template is missing an 

1280 # argument, a link might look like `[[{{{1}}}...]]` 

1281 and isinstance(x.largs[0][0], str) 

1282 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] 

1283 ) 

1284 ) 

1285 ), 

1286 ) 

1287 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1288 tempnode.largs = [["Inflection"]] 

1289 tempnode.children = floaters 

1290 parse_inflection(tempnode, "Floating Div", pos) 

1291 # print(poschildren) 

1292 # XXX new above 

1293 

1294 if not poschildren:

1295 if not floaters: 

1296 wxr.wtp.debug( 

1297 "PoS section without contents", 

1298 sortid="en/page/1051/20230612", 

1299 ) 

1300 else: 

1301 wxr.wtp.debug( 

1302 "PoS section without contents except for a floating table", 

1303 sortid="en/page/1056/20230612", 

1304 ) 

1305 return 

1306 

1307 for node in poschildren: 

1308 if isinstance(node, str): 

1309 for m in re.finditer(r"\n+|[^\n]+", node): 

1310 p = m.group(0) 

1311 if p.startswith("\n\n") and pre: 

1312 first_para = False 

1313 start_of_paragraph = True 

1314 break 

1315 if p and collecting_head: 

1316 pre[-1].append(p) 

1317 continue 

1318 assert isinstance(node, WikiNode) 

1319 kind = node.kind 

1320 if kind == NodeKind.LIST: 

1321 lists[-1].append(node) 

1322 collecting_head = False 

1323 start_of_paragraph = True 

1324 continue 

1325 elif kind in LEVEL_KINDS: 

1326 # Stop parsing section if encountering any kind of 

1327 # level header (like ===Noun=== or ====Further Reading====). 

1328 # At a quick glance, this should be the default behavior, 

1329 # but if some kinds of source articles have sub-sub-sections 

1330 # that should be parsed XXX it should be handled by changing 

1331 # this break. 

1332 break 

1333 elif collecting_head and kind == NodeKind.LINK: 

1334 # We might collect relevant links as they are often pictures 

1335 # relating to the word 

1336 if len(node.largs[0]) >= 1 and isinstance(

1337 node.largs[0][0], str 

1338 ): 

1339 if node.largs[0][0].startswith(

1340 ns_title_prefix_tuple(wxr, "Category") 

1341 ): 

1342 # [[Category:...]] 

1343 # We're at the end of the file, probably, so stop 

1344 # here. Otherwise the head will get garbage. 

1345 break 

1346 if node.largs[0][0].startswith(

1347 ns_title_prefix_tuple(wxr, "File") 

1348 ): 

1349 # Skips file links 

1350 continue 

1351 start_of_paragraph = False 

1352 pre[-1].extend(node.largs[-1]) 

1353 elif kind == NodeKind.HTML: 

1354 if node.sarg == "br": 

1355 if pre[-1]:

1356 pre.append([]) # Switch to next head 

1357 lists.append([]) # Lists parallels pre 

1358 collecting_head = True 

1359 start_of_paragraph = True 

1360 elif collecting_head and node.sarg not in (

1361 "gallery", 

1362 "ref", 

1363 "cite", 

1364 "caption", 

1365 ): 

1366 start_of_paragraph = False 

1367 pre[-1].append(node) 

1368 else: 

1369 start_of_paragraph = False 

1370 elif isinstance(node, TemplateNode): 

1371 # XXX Insert code here that disambiguates between 

1372 # templates that generate word heads and templates 

1373 # that don't. 

1374 # There's head_tag_re that seems like a regex meant 

1375 # to identify head templates. Too bad it's None. 

1376 

1377 # ignore {{category}}, {{cat}}... etc. 

1378 if node.template_name in stop_head_at_these_templates: 

1379 # we've reached a template that should be at the end, 

1380 continue 

1381 

1382 # skip these templates; panel_templates is already used 

1383 # to skip certain templates else, but it also applies to 

1384 # head parsing quite well. 

1385 # node.largs[0][0] should always be str, but can't type-check 

1386 # that. 

1387 if is_panel_template(wxr, node.template_name): 

1388 continue 

1389 # skip these templates 

1390 # if node.largs[0][0] in skip_these_templates_in_head: 

1391 # first_head_tmplt = False # no first_head_tmplt at all 

1392 # start_of_paragraph = False 

1393 # continue 

1394 

1395 if first_head_tmplt and pre[-1]: 

1396 first_head_tmplt = False 

1397 start_of_paragraph = False 

1398 pre[-1].append(node) 

1399 elif pre[-1] and start_of_paragraph: 

1400 pre.append([]) # Switch to the next head 

1401 lists.append([]) # lists parallel pre 

1402 collecting_head = True 

1403 start_of_paragraph = False 

1404 pre[-1].append(node) 

1405 else: 

1406 pre[-1].append(node) 

1407 elif first_para: 

1408 start_of_paragraph = False 

1409 if collecting_head:

1410 pre[-1].append(node) 

1411 # XXX use template_fn in clean_node to check that the head macro 

1412 # is compatible with the current part-of-speech and generate warning 

1413 # if not. Use template_allowed_pos_map. 

1414 

1415 # Clean up empty pairs, and fix messes with extra newlines that 

1416 # separate templates that are followed by lists wiktextract issue #314 

1417 

1418 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1419 cleaned_lists: list[list[WikiNode]] = [] 

1420 pairless_pre_index = None 

1421 

1422 for pre1, ls in zip(pre, lists): 

1423 if pre1 and not ls: 

1424 pairless_pre_index = len(cleaned_pre) 

1425 if not pre1 and not ls:

1426 # skip [] + [] 

1427 continue 

1428 if not ls and all( 

1429 (isinstance(x, str) and not x.strip()) for x in pre1 

1430 ): 

1431 # skip ["\n", " "] + [] 

1432 continue 

1433 if ls and not pre1: 

1434 if pairless_pre_index is not None:

1435 cleaned_lists[pairless_pre_index] = ls 

1436 pairless_pre_index = None 

1437 continue 

1438 cleaned_pre.append(pre1) 

1439 cleaned_lists.append(ls) 

1440 

1441 pre = cleaned_pre 

1442 lists = cleaned_lists 

1443 

1444 there_are_many_heads = len(pre) > 1 

1445 header_tags: list[str] = [] 

1446 header_topics: list[str] = [] 

1447 previous_head_had_list = False 

1448 

1449 if not any(g for g in lists): 

1450 process_gloss_without_list( 

1451 poschildren, pos, pos_data, header_tags, header_topics 

1452 ) 

1453 else: 

1454 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1455 # if len(ls) == 0: 

1456 # # don't have gloss list 

1457 # # XXX add code here to filter out 'garbage', like text 

1458 # # that isn't a head template or head. 

1459 # continue 

1460 

1461 if all(not sl for sl in lists[i:]): 

1462 if i == 0:

1463 if isinstance(node, str): 

1464 wxr.wtp.debug( 

1465 "first head without list of senses," 

1466 "string: '{}[...]', {}/{}".format( 

1467 node[:20], word, language 

1468 ), 

1469 sortid="page/1689/20221215", 

1470 ) 

1471 if isinstance(node, WikiNode): 

1472 if node.largs and node.largs[0][0] in [ 

1473 "Han char", 

1474 ]: 

1475 # just ignore these templates 

1476 pass 

1477 else: 

1478 wxr.wtp.debug( 

1479 "first head without " 

1480 "list of senses, " 

1481 "template node " 

1482 "{}, {}/{}".format( 

1483 node.largs, word, language 

1484 ), 

1485 sortid="page/1694/20221215", 

1486 ) 

1487 else: 

1488 wxr.wtp.debug( 

1489 "first head without list of senses, " 

1490 "{}/{}".format(word, language), 

1491 sortid="page/1700/20221215", 

1492 ) 

1493 # no break here so that the first head always 

1494 # gets processed. 

1495 else: 

1496 if isinstance(node, str):

1497 wxr.wtp.debug( 

1498 "later head without list of senses," 

1499 "string: '{}[...]', {}/{}".format( 

1500 node[:20], word, language 

1501 ), 

1502 sortid="page/1708/20221215", 

1503 ) 

1504 if isinstance(node, WikiNode):

1505 wxr.wtp.debug( 

1506 "later head without list of senses," 

1507 "template node " 

1508 "{}, {}/{}".format( 

1509 node.sarg if node.sarg else node.largs, 

1510 word, 

1511 language, 

1512 ), 

1513 sortid="page/1713/20221215", 

1514 ) 

1515 else: 

1516 wxr.wtp.debug( 

1517 "later head without list of senses, " 

1518 "{}/{}".format(word, language), 

1519 sortid="page/1719/20221215", 

1520 ) 

1521 break 

1522 head_group = i + 1 if there_are_many_heads else None 

1523 # print("parse_part_of_speech: {}: {}: pre={}" 

1524 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1525 

1526 if previous_head_had_list: 

1527 # We use a boolean flag here because we want to be able to

1528 # let the header_tags data pass through after the loop 

1529 # is over without accidentally emptying it, if there are 

1530 # no pos_datas and we need a dummy data. 

1531 header_tags.clear() 

1532 header_topics.clear() 

1533 

1534 process_gloss_header( 

1535 pre1, pos, head_group, pos_data, header_tags, header_topics 

1536 ) 

1537 for ln in ls: 

1538 # Parse each list associated with this head. 

1539 for node in ln.children: 

1540 # Parse nodes in l.children recursively. 

1541 # The recursion function uses push_sense() to 

1542 # add stuff into sense_datas, and returns True or 

1543 # False if something is added, which bubbles upward. 

1544 # If the bubble is "True", then higher levels of 

1545 # the recursion will not push_sense(), because 

1546 # the data is already pushed into a sub-gloss 

1547 # downstream, unless the higher level has examples 

1548 # that need to be put somewhere. 

1549 common_data: SenseData = { 

1550 "tags": list(header_tags), 

1551 "topics": list(header_topics), 

1552 } 

1553 if head_group: 

1554 common_data["head_nr"] = head_group 

1555 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1556 

1557 if len(ls) > 0: 

1558 previous_head_had_list = True 

1559 else: 

1560 previous_head_had_list = False 

1561 

1562 # If there are no senses extracted, add a dummy sense. We want to 

1563 # keep tags extracted from the head for the dummy sense. 

1564 push_sense() # Make sure unfinished data pushed, and start clean sense 

1565 if len(sense_datas) == 0: 

1566 data_extend(sense_data, "tags", header_tags) 

1567 data_extend(sense_data, "topics", header_topics) 

1568 data_append(sense_data, "tags", "no-gloss") 

1569 push_sense() 

1570 

1571 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) 

1572 

1573 for sd in sense_datas: 

1574 if "__temp_sense_sorting_ordinal" in sd: 1574 ↛ 1573line 1574 didn't jump to line 1573 because the condition on line 1574 was always true

1575 del sd["__temp_sense_sorting_ordinal"] 

1576 

1577 def process_gloss_header( 

1578 header_nodes: list[Union[WikiNode, str]], 

1579 pos_type: str, 

1580 header_group: Optional[int], 

1581 pos_data: WordData, 

1582 header_tags: list[str], 

1583 header_topics: list[str], 

1584 ) -> None: 

1585 ruby = [] 

1586 links: list[str] = [] 

1587 

1588 # process template parse nodes here 

1589 new_nodes = [] 

1590 info_template_data = [] 

1591 for node in header_nodes: 

1592 # print(f"{node=}") 

1593 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1594 if info_data or info_out: 

1595 if info_data:

1596 info_template_data.append(info_data) 

1597 if info_out: # including just the original node

1598 new_nodes.append(info_out) 

1599 else: 

1600 new_nodes.append(node) 

1601 header_nodes = new_nodes 

1602 

1603 if info_template_data: 

1604 if "info_templates" not in pos_data: 1604 ↛ 1607line 1604 didn't jump to line 1607 because the condition on line 1604 was always true

1605 pos_data["info_templates"] = info_template_data 

1606 else: 

1607 pos_data["info_templates"].extend(info_template_data) 

1608 

1609 if not word.isalnum(): 

1610 # `-` is kosher, add more of these if needed. 

1611 if word.replace("-", "").isalnum(): 

1612 pass 

1613 else: 

1614 # if the word contains non-letter or -number characters, it 

1615 # might have something that messes with split-at-semi-comma; we 

1616 # collect links so that we can skip splitting them. 

1617 exp = wxr.wtp.parse( 

1618 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1619 ) 

1620 link_nodes, _ = recursively_extract( 

1621 exp.children, 

1622 lambda x: isinstance(x, WikiNode) 

1623 and x.kind == NodeKind.LINK, 

1624 ) 

1625 for ln in link_nodes: 

1626 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1627 if not ltext.isalnum(): 

1628 links.append(ltext) 

1629 if word not in links:

1630 links.append(word) 

1631 

1632 if lang_code == "ja": 

1633 exp = wxr.wtp.parse( 

1634 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1635 ) 

1636 rub, _ = recursively_extract( 

1637 exp.children, 

1638 lambda x: isinstance(x, WikiNode) 

1639 and x.kind == NodeKind.HTML 

1640 and x.sarg == "ruby", 

1641 ) 

1642 if rub is not None:

1643 for r in rub: 

1644 if TYPE_CHECKING: 

1645 # we know the lambda above in recursively_extract 

1646 # returns only WikiNodes in rub 

1647 assert isinstance(r, WikiNode) 

1648 rt = parse_ruby(wxr, r) 

1649 if rt is not None: 

1650 ruby.append(rt) 

1651 elif lang_code == "vi": 

1652 # Handle vi-readings templates that have a weird structure for

1653 # Chu Nom Vietnamese character heads

1654 # https://en.wiktionary.org/wiki/Template:vi-readings 

1655 new_header_nodes = [] 

1656 related_readings: list[LinkageData] = [] 

1657 for node in header_nodes: 

1658 if (

1659 isinstance(node, TemplateNode) 

1660 and node.template_name == "vi-readings" 

1661 ): 

1662 print(node.template_parameters) 

1663 for parameter, tag in ( 

1664 ("hanviet", "han-viet-reading"), 

1665 ("nom", "nom-reading"), 

1666 # we ignore the fanqie parameter "phienthiet" 

1667 ): 

1668 arg = node.template_parameters.get(parameter) 

1669 if arg is not None:

1670 text = clean_node(wxr, None, arg) 

1671 for w in text.split(","): 

1672 # ignore - separated references 

1673 if "-" in w: 

1674 w = w[: w.index("-")] 

1675 w = w.strip() 

1676 related_readings.append( 

1677 LinkageData(word=w, tags=[tag]) 

1678 ) 

1679 continue 

1680 

1681 # Skip the vi-reading template for the rest of the head parsing 

1682 new_header_nodes.append(node) 

1683 if len(related_readings) > 0:

1684 data_extend(pos_data, "related", related_readings) 

1685 header_nodes = new_header_nodes 

1686 

1687 header_text = clean_node( 

1688 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1689 ) 

1690 

1691 if not header_text.strip(): 

1692 return 

1693 

1694 term_label_tags: list[str] = [] 

1695 term_label_topics: list[str] = [] 

1696 if len(term_label_templates) > 0: 

1697 # parse term label templates; if there are other similar kinds 

1698 # of templates in headers that you want to squash and apply as 

1699 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES 

1700 for templ_data in term_label_templates: 

1701 # print(templ_data) 

1702 expan = templ_data.get("expansion", "").strip("().,; ") 

1703 if not expan:

1704 continue 

1705 tlb_tagsets, tlb_topics = decode_tags(expan) 

1706 for tlb_tags in tlb_tagsets: 

1707 if len(tlb_tags) > 0 and not any( 

1708 t.startswith("error-") for t in tlb_tags 

1709 ): 

1710 term_label_tags.extend(tlb_tags) 

1711 term_label_topics.extend(tlb_topics) 

1712 # print(f"{tlb_tagsets=}, {tlb_topicsets=}") 

1713 

1714 header_text = re.sub(r"\s+", " ", header_text) 

1715 # print(f"{header_text=}") 

1716 parse_word_head( 

1717 wxr, 

1718 pos_type, 

1719 header_text, 

1720 pos_data, 

1721 is_reconstruction, 

1722 header_group, 

1723 ruby=ruby, 

1724 links=links, 

1725 ) 

1726 if "tags" in pos_data: 

1727 # pos_data can get "tags" data from some source; type-checkers 

1728 # don't like it, so let's ignore it.

1729 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1730 del pos_data["tags"] # type: ignore[typeddict-item] 

1731 if len(term_label_tags) > 0: 

1732 header_tags.extend(term_label_tags) 

1733 if len(term_label_topics) > 0: 

1734 header_topics.extend(term_label_topics) 

1735 

1736 def process_gloss_without_list( 

1737 nodes: list[Union[WikiNode, str]], 

1738 pos_type: str, 

1739 pos_data: WordData, 

1740 header_tags: list[str], 

1741 header_topics: list[str], 

1742 ) -> None: 

1743        # gloss text might not be inside a list 

1744 header_nodes: list[Union[str, WikiNode]] = [] 

1745 gloss_nodes: list[Union[str, WikiNode]] = [] 

1746 for node in strip_nodes(nodes): 

1747 if isinstance(node, WikiNode): 

1748 if isinstance(node, TemplateNode): 

1749 if node.template_name in ( 

1750 "zh-see", 

1751 "ja-see", 

1752 "ja-see-kango", 

1753 ): 

1754 continue # soft redirect 

1755 elif ( 

1756 node.template_name == "head" 

1757 or node.template_name.startswith(f"{lang_code}-") 

1758 ): 

1759 header_nodes.append(node) 

1760 continue 

1761 elif node.kind in LEVEL_KINDS: # following nodes are not gloss 1761 ↛ 1763line 1761 didn't jump to line 1763 because the condition on line 1761 was always true

1762 break 

1763 gloss_nodes.append(node) 

1764 

1765 if len(header_nodes) > 0: 

1766 process_gloss_header( 

1767 header_nodes, 

1768 pos_type, 

1769 None, 

1770 pos_data, 

1771 header_tags, 

1772 header_topics, 

1773 ) 

1774 if len(gloss_nodes) > 0: 

1775 process_gloss_contents( 

1776 gloss_nodes, 

1777 pos_type, 

1778 {"tags": list(header_tags), "topics": list(header_topics)}, 

1779 ) 

1780 

1781 def parse_sense_node( 

1782 node: Union[str, WikiNode], # never receives str 

1783 sense_base: SenseData, 

1784 pos: str, 

1785 ) -> bool: 

1786 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1787 Uses push_sense() to attempt adding data to pos_data in the scope 

1788 of parse_language() when it reaches deep in the recursion. push_sense() 

1789 returns True if it succeeds, and that is bubbled up the stack; if 

1790 a sense was added downstream, the higher levels (whose shared data 

1791        was already added by a subsense) do not push_sense(), unless they 

1792        have examples that need to be put somewhere. 

1793 """ 

1794 assert isinstance(sense_base, dict) # Added to every sense deeper in 

1795 

1796 nonlocal sense_ordinal 

1797 my_ordinal = sense_ordinal # copies, not a reference 

1798        sense_ordinal += 1  # only used for sorting 

1799 

1800 if not isinstance(node, WikiNode): 1800 ↛ 1802line 1800 didn't jump to line 1802 because the condition on line 1800 was never true

1801 # This doesn't seem to ever happen in practice. 

1802 wxr.wtp.debug( 

1803 "{}: parse_sense_node called with" 

1804 "something that isn't a WikiNode".format(pos), 

1805 sortid="page/1287/20230119", 

1806 ) 

1807 return False 

1808 

1809 if node.kind != NodeKind.LIST_ITEM: 1809 ↛ 1810line 1809 didn't jump to line 1810 because the condition on line 1809 was never true

1810 wxr.wtp.debug( 

1811 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1812 ) 

1813 return False 

1814 

1815 if node.sarg == ":": 

1816 # Skip example entries at the highest level, ones without 

1817 # a sense ("...#") above them. 

1818 # If node.sarg is exactly and only ":", then it's at 

1819 # the highest level; lower levels would have more 

1820 # "indentation", like "#:" or "##:" 

1821 return False 

1822 

1823 # If a recursion call succeeds in push_sense(), bubble it up with 

1824 # `added`. 

1825 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1826 added = False 

1827 

1828 gloss_template_args: set[str] = set() 

1829 

1830 # For LISTs and LIST_ITEMS, their argument is something like 

1831 # "##" or "##:", and using that we can rudimentally determine 

1832 # list 'depth' if need be, and also what kind of list or 

1833 # entry it is; # is for normal glosses, : for examples (indent) 

1834        # and * is used for quotations on Wiktionary. 
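        # For example, a second-level gloss item "## foo" has sarg "##";
        # its example sub-list uses sarg "##:", its quotation sub-list "##*",
        # and deeper subglosses use "###". (Illustrative wikitext only.)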

1835 current_depth = node.sarg 

1836 

1837 children = node.children 

1838 

1839 # subentries, (presumably) a list 

1840 # of subglosses below this. The list's 

1841 # argument ends with #, and its depth should 

1842 # be bigger than parent node. 

1843 subentries = [ 

1844 x 

1845 for x in children 

1846 if isinstance(x, WikiNode) 

1847 and x.kind == NodeKind.LIST 

1848 and x.sarg == current_depth + "#" 

1849 ] 

1850 

1851 # sublists of examples and quotations. .sarg 

1852 # does not end with "#". 

1853 others = [ 

1854 x 

1855 for x in children 

1856 if isinstance(x, WikiNode) 

1857 and x.kind == NodeKind.LIST 

1858 and x.sarg != current_depth + "#" 

1859 ] 

1860 

1861 # the actual contents of this particular node. 

1862 # can be a gloss (or a template that expands into 

1863 # many glosses which we can't easily pre-expand) 

1864 # or could be an "outer gloss" with more specific 

1865        # subglosses, or could be a qualifier for the subglosses. 

1866 contents = [ 

1867 x 

1868 for x in children 

1869 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1870 ] 

1871 # If this entry has sublists of entries, we should combine 

1872 # gloss information from both the "outer" and sublist content. 

1873 # Sometimes the outer gloss 

1874        # is more of a non-gloss description or tags, sometimes it is a coarse sense 

1875 # and the inner glosses are more specific. The outer one 

1876 # does not seem to have qualifiers. 

1877 

1878 # If we have one sublist with one element, treat it 

1879 # specially as it may be a Wiktionary error; raise 

1880 # that nested element to the same level. 

1881 # XXX If need be, this block can be easily removed in 

1882        # the current recursive logic and the result is one sense entry 

1883 # with both glosses in the glosses list, as you would 

1884 # expect. If the higher entry has examples, there will 

1885 # be a higher entry with some duplicated data. 

1886 if len(subentries) == 1: 

1887 slc = subentries[0].children 

1888 if len(slc) == 1: 

1889 # copy current node and modify it so it doesn't 

1890 # loop infinitely. 

1891 cropped_node = copy.copy(node) 

1892 cropped_node.children = [ 

1893 x 

1894 for x in children 

1895 if not ( 

1896 isinstance(x, WikiNode) 

1897 and x.kind == NodeKind.LIST 

1898 and x.sarg == current_depth + "#" 

1899 ) 

1900 ] 

1901 added |= parse_sense_node(cropped_node, sense_base, pos) 

1902                nonlocal sense_data  # without this kludge, raw_glosses 

1903                # data would get duplicated; 

1904 # if the top-level (cropped_node) 

1905 # does not push_sense() properly or 

1906 # parse_sense_node() returns early, 

1907 # sense_data is not reset. This happens 

1908 # for example when you have a no-gloss 

1909 # string like "(intransitive)": 

1910 # no gloss, push_sense() returns early 

1911 # and sense_data has duplicate data with 

1912 # sense_base 

1913 sense_data = {} 

1914 added |= parse_sense_node(slc[0], sense_base, pos) 

1915 return added 

1916 

1917 return process_gloss_contents( 

1918 contents, 

1919 pos, 

1920 sense_base, 

1921 subentries, 

1922 others, 

1923 gloss_template_args, 

1924 added, 

1925 my_ordinal, 

1926 ) 

1927 

1928 def process_gloss_contents( 

1929 contents: list[Union[str, WikiNode]], 

1930 pos: str, 

1931 sense_base: SenseData, 

1932 subentries: list[WikiNode] = [], 

1933 others: list[WikiNode] = [], 

1934 gloss_template_args: Set[str] = set(), 

1935 added: bool = False, 

1936 sorting_ordinal: int | None = None, 

1937 ) -> bool: 

1938 def sense_template_fn( 

1939 name: str, ht: TemplateArgs, is_gloss: bool = False 

1940 ) -> Optional[str]: 

1941 # print(f"sense_template_fn: {name}, {ht}") 

1942 if name in wikipedia_templates: 

1943 # parse_wikipedia_template(wxr, pos_data, ht) 

1944 return None 

1945 if is_panel_template(wxr, name): 

1946 return "" 

1947 if name in INFO_TEMPLATE_FUNCS: 

1948 info_data, info_exp = parse_info_template_arguments( 

1949 wxr, name, ht, "sense" 

1950 ) 

1951 if info_data or info_exp: 1951 ↛ 1957line 1951 didn't jump to line 1957 because the condition on line 1951 was always true

1952 if info_data: 1952 ↛ 1954line 1952 didn't jump to line 1954 because the condition on line 1952 was always true

1953 data_append(sense_base, "info_templates", info_data) 

1954 if info_exp and isinstance(info_exp, str): 1954 ↛ 1956line 1954 didn't jump to line 1956 because the condition on line 1954 was always true

1955 return info_exp 

1956 return "" 

1957 if name in ("defdate",): 

1958 date = clean_node(wxr, None, ht.get(1, ())) 

1959 if part_two := ht.get(2): 1959 ↛ 1961line 1959 didn't jump to line 1961 because the condition on line 1959 was never true

1960 # Unicode mdash, not '-' 

1961 date += "–" + clean_node(wxr, None, part_two) 

1962 refs: dict[str, ReferenceData] = {} 

1963 # ref, refn, ref2, ref2n, ref3, ref3n 

1964 # ref1 not valid 
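                # For example, arguments named ref=..., ref2=..., refn=...,
                # ref2n=... are grouped by the regex below: group(1) is the
                # (possibly empty) digit identifying the reference, and
                # group(2) is "n" for the variants stored under "refn"
                # instead of "text".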

1965 for k, v in sorted( 

1966 (k, v) for k, v in ht.items() if isinstance(k, str) 

1967 ): 

1968 if m := re.match(r"ref(\d?)(n?)", k): 1968 ↛ 1965line 1968 didn't jump to line 1965 because the condition on line 1968 was always true

1969 ref_v = clean_node(wxr, None, v) 

1970 if m.group(1) not in refs: # empty string or digit 

1971 refs[m.group(1)] = ReferenceData() 

1972 if m.group(2): 

1973 refs[m.group(1)]["refn"] = ref_v 

1974 else: 

1975 refs[m.group(1)]["text"] = ref_v 

1976 data_append( 

1977 sense_base, 

1978 "attestations", 

1979 AttestationData(date=date, references=list(refs.values())), 

1980 ) 

1981 return "" 

1982 if name == "senseid": 

1983 langid = clean_node(wxr, None, ht.get(1, ())) 

1984 arg = clean_node(wxr, sense_base, ht.get(2, ())) 

1985 if re.match(r"Q\d+$", arg): 

1986 data_append(sense_base, "wikidata", arg) 

1987 data_append(sense_base, "senseid", langid + ":" + arg) 

1988 if name in sense_linkage_templates: 

1989 # print(f"SENSE_TEMPLATE_FN: {name}") 

1990 parse_sense_linkage(wxr, sense_base, name, ht, pos) 

1991 return "" 

1992 if name == "†" or name == "zh-obsolete": 

1993 data_append(sense_base, "tags", "obsolete") 

1994 return "" 

1995 if name in { 

1996 "ux", 

1997 "uxi", 

1998 "usex", 

1999 "afex", 

2000 "prefixusex", 

2001 "ko-usex", 

2002 "ko-x", 

2003 "hi-x", 

2004 "ja-usex-inline", 

2005 "ja-x", 

2006 "quotei", 

2007 "he-x", 

2008 "hi-x", 

2009 "km-x", 

2010 "ne-x", 

2011 "shn-x", 

2012 "th-x", 

2013 "ur-x", 

2014 }: 

2015 # Usage examples are captured separately below. We don't 

2016 # want to expand them into glosses even when unusual coding 

2017 # is used in the entry. 

2018 # These templates may slip through inside another item, but 

2019 # currently we're separating out example entries (..#:) 

2020            # well enough that there seems to be very little contamination. 

2021 if is_gloss: 

2022 wxr.wtp.warning( 

2023 "Example template is used for gloss text", 

2024 sortid="extractor.en.page.sense_template_fn/1415", 

2025 ) 

2026 else: 

2027 return "" 

2028 if name == "w": 2028 ↛ 2029line 2028 didn't jump to line 2029 because the condition on line 2028 was never true

2029 if ht.get(2) == "Wp": 

2030 return "" 

2031 for k, v in ht.items(): 

2032 v = v.strip() 

2033 if v and "<" not in v: 

2034 gloss_template_args.add(v) 

2035 return None 

2036 

2037 def extract_link_texts(item: GeneralNode) -> None: 

2038 """Recursively extracts link texts from the gloss source. This 

2039 information is used to select whether to remove final "." from 

2040 form_of/alt_of (e.g., ihm/Hunsrik).""" 

2041 if isinstance(item, (list, tuple)): 

2042 for x in item: 

2043 extract_link_texts(x) 

2044 return 

2045 if isinstance(item, str): 

2046            # There seem to be HTML sections that may further contain 

2047 # unparsed links. 
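            # For example, a string containing "[[foo|bar]]" contributes
            # "bar" (the part after the last "|") to gloss_template_args.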

2048 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the loop on line 2048 never started

2049 print("ITER:", m.group(0)) 

2050 v = m.group(1).split("|")[-1].strip() 

2051 if v: 

2052 gloss_template_args.add(v) 

2053 return 

2054 if not isinstance(item, WikiNode): 2054 ↛ 2055line 2054 didn't jump to line 2055 because the condition on line 2054 was never true

2055 return 

2056 if item.kind == NodeKind.LINK: 

2057 v = item.largs[-1] 

2058 if ( 2058 ↛ 2064line 2058 didn't jump to line 2064 because the condition on line 2058 was always true

2059 isinstance(v, list) 

2060 and len(v) == 1 

2061 and isinstance(v[0], str) 

2062 ): 

2063 gloss_template_args.add(v[0].strip()) 

2064 for x in item.children: 

2065 extract_link_texts(x) 

2066 

2067 extract_link_texts(contents) 

2068 

2069 # get the raw text of non-list contents of this node, and other stuff 

2070 # like tag and category data added to sense_base 

2071 # cast = no-op type-setter for the type-checker 

2072 partial_template_fn = cast( 

2073 TemplateFnCallable, 

2074 partial(sense_template_fn, is_gloss=True), 

2075 ) 

2076 rawgloss = clean_node( 

2077 wxr, 

2078 sense_base, 

2079 contents, 

2080 template_fn=partial_template_fn, 

2081 collect_links=True, 

2082 ) 

2083 

2084 if not rawgloss: 2084 ↛ 2085line 2084 didn't jump to line 2085 because the condition on line 2084 was never true

2085 return False 

2086 

2087        # remove manually typed ordered list text at the start ("1. ") 

2088 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 

2089 

2090 # get stuff like synonyms and categories from "others", 

2091 # maybe examples and quotations 

2092 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

2093 

2094 # The gloss could contain templates that produce more list items. 

2095 # This happens commonly with, e.g., {{inflection of|...}}. Split 

2096 # to parts. However, e.g. Interlingua generates multiple glosses 

2097 # in HTML directly without Wikitext markup, so we must also split 

2098 # by just newlines. 

2099 subglosses = rawgloss.splitlines() 

2100 

2101 if len(subglosses) == 0: 2101 ↛ 2102line 2101 didn't jump to line 2102 because the condition on line 2101 was never true

2102 return False 

2103 

2104 if any(s.startswith("#") for s in subglosses): 

2105 subtree = wxr.wtp.parse(rawgloss) 

2106 # from wikitextprocessor.parser import print_tree 

2107 # print("SUBTREE GENERATED BY TEMPLATE:") 

2108 # print_tree(subtree) 

2109 new_subentries = [ 

2110 x 

2111 for x in subtree.children 

2112 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2113 ] 

2114 

2115 new_others = [ 

2116 x 

2117 for x in subtree.children 

2118 if isinstance(x, WikiNode) 

2119 and x.kind == NodeKind.LIST 

2120 and not x.sarg.endswith("#") 

2121 ] 

2122 

2123 new_contents = [ 

2124 clean_node(wxr, [], x) 

2125 for x in subtree.children 

2126 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2127 ] 

2128 

2129 subentries = subentries or new_subentries 

2130 others = others or new_others 

2131 subglosses = new_contents 

2132 rawgloss = "".join(subglosses) 

2133 # Generate no gloss for translation hub pages, but add the 

2134 # "translation-hub" tag for them 

2135 if rawgloss == "(This entry is a translation hub.)": 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true

2136 data_append(sense_data, "tags", "translation-hub") 

2137 return push_sense(sorting_ordinal) 

2138 

2139 # Remove certain substrings specific to outer glosses 

2140 strip_ends = [", particularly:"] 

2141 for x in strip_ends: 

2142 if rawgloss.endswith(x): 

2143 rawgloss = rawgloss[: -len(x)].strip() 

2144 break 

2145 

2146 # A single gloss, or possibly an outer gloss. 

2147 # Check if the possible outer gloss starts with 

2148 # parenthesized tags/topics 

2149 

2150 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 

2151 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

2152 m = QUALIFIERS_RE.match(rawgloss) 

2153 # (...): ... or (...(...)...): ... 
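        # For example, a raw gloss like "(obsolete, dialectal) a small axe"
        # would have "obsolete, dialectal" split off here and parsed into
        # sense qualifiers (tags/topics), leaving "a small axe" as the
        # gloss text. (Illustrative gloss only.)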

2154 if m: 

2155 q = m.group(1) 

2156 rawgloss = rawgloss[m.end() :].strip() 

2157 parse_sense_qualifier(wxr, q, sense_base) 

2158 if rawgloss == "A pejorative:": 2158 ↛ 2159line 2158 didn't jump to line 2159 because the condition on line 2158 was never true

2159 data_append(sense_base, "tags", "pejorative") 

2160 rawgloss = "" 

2161 elif rawgloss == "Short forms.": 2161 ↛ 2162line 2161 didn't jump to line 2162 because the condition on line 2161 was never true

2162 data_append(sense_base, "tags", "abbreviation") 

2163 rawgloss = "" 

2164 elif rawgloss == "Technical or specialized senses.": 2164 ↛ 2165line 2164 didn't jump to line 2165 because the condition on line 2164 was never true

2165 rawgloss = "" 

2166 elif rawgloss.startswith("inflection of "): 

2167 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

2168 if parsed is not None: 2168 ↛ 2177line 2168 didn't jump to line 2177 because the condition on line 2168 was always true

2169 tags, origins = parsed 

2170 if origins is not None: 2170 ↛ 2172line 2170 didn't jump to line 2172 because the condition on line 2170 was always true

2171 data_extend(sense_base, "form_of", origins) 

2172 if tags is not None: 2172 ↛ 2175line 2172 didn't jump to line 2175 because the condition on line 2172 was always true

2173 data_extend(sense_base, "tags", tags) 

2174 else: 

2175 data_append(sense_base, "tags", "form-of") 

2176 else: 

2177 data_append(sense_base, "tags", "form-of") 

2178 if rawgloss: 2178 ↛ 2209line 2178 didn't jump to line 2209 because the condition on line 2178 was always true

2179 # Code duplicating a lot of clean-up operations from later in 

2180 # this block. We want to clean up the "supergloss" as much as 

2181 # possible, in almost the same way as a normal gloss. 

2182 supergloss = rawgloss 

2183 

2184 if supergloss.startswith("; "): 2184 ↛ 2185line 2184 didn't jump to line 2185 because the condition on line 2184 was never true

2185 supergloss = supergloss[1:].strip() 

2186 

2187 if supergloss.startswith(("^†", "†")): 

2188 data_append(sense_base, "tags", "obsolete") 

2189 supergloss = supergloss[2:].strip() 

2190 elif supergloss.startswith("^‡"): 2190 ↛ 2191line 2190 didn't jump to line 2191 because the condition on line 2190 was never true

2191 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

2192 supergloss = supergloss[2:].strip() 

2193 

2194 # remove [14th century...] style brackets at the end 

2195 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 

2196 

2197 if supergloss.startswith((",", ":")): 

2198 supergloss = supergloss[1:] 

2199 supergloss = supergloss.strip() 

2200 if supergloss.startswith("N. of "): 2200 ↛ 2201line 2200 didn't jump to line 2201 because the condition on line 2200 was never true

2201 supergloss = "Name of " + supergloss[6:] 

2202 supergloss = supergloss[2:] 

2203 data_append(sense_base, "glosses", supergloss) 

2204 if supergloss in ("A person:",): 

2205 data_append(sense_base, "tags", "g-person") 

2206 

2207 # The main recursive call (except for the exceptions at the 

2208 # start of this function). 

2209 for sublist in subentries: 

2210 if not ( 2210 ↛ 2213line 2210 didn't jump to line 2213 because the condition on line 2210 was never true

2211 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

2212 ): 

2213 wxr.wtp.debug( 

2214 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

2215 f"with items that are not LISTs", 

2216 sortid="page/1511/20230119", 

2217 ) 

2218 continue 

2219 for item in sublist.children: 

2220 if not ( 2220 ↛ 2224line 2220 didn't jump to line 2224 because the condition on line 2220 was never true

2221 isinstance(item, WikiNode) 

2222 and item.kind == NodeKind.LIST_ITEM 

2223 ): 

2224 continue 

2225            # copy sense_base to prevent cross-contamination between 

2226            # sibling subglosses and between subglosses and superglosses 

2227 sense_base2 = copy.deepcopy(sense_base) 

2228 if parse_sense_node(item, sense_base2, pos): 2228 ↛ 2219line 2228 didn't jump to line 2219 because the condition on line 2228 was always true

2229 added = True 

2230 

2231 # Capture examples. 

2232 # This is called after the recursive calls above so that 

2233 # sense_base is not contaminated with meta-data from 

2234 # example entries for *this* gloss. 

2235 examples = [] 

2236 if wxr.config.capture_examples: 2236 ↛ 2240line 2236 didn't jump to line 2240 because the condition on line 2236 was always true

2237 examples = extract_examples(others, sense_base) 

2238 

2239 # push_sense() succeeded somewhere down-river, so skip this level 

2240 if added: 

2241 if examples: 

2242 # this higher-up gloss has examples that we do not want to skip 

2243 wxr.wtp.debug( 

2244 "'{}[...]' gloss has examples we want to keep, " 

2245 "but there are subglosses.".format(repr(rawgloss[:30])), 

2246 sortid="page/1498/20230118", 

2247 ) 

2248 else: 

2249 return True 

2250 

2251 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2252 # after the gloss, but these sentences don't seem to be intended 

2253 # as glosses. Skip them. 

2254 indexed_subglosses = list( 

2255 (i, gl) 

2256 for i, gl in enumerate(subglosses) 

2257 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2258 ) 

2259 

2260 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2260 ↛ 2261line 2260 didn't jump to line 2261 because the condition on line 2260 was never true

2261 gl = indexed_subglosses[0][1].strip() 

2262 if gl.endswith(":"): 

2263 gl = gl[:-1].strip() 

2264 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2265 if parsed is not None: 

2266 infl_tags, infl_dts = parsed 

2267 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2268 # Interpret others as a particular form under 

2269 # "inflection of" 

2270 data_extend(sense_base, "tags", infl_tags) 

2271 data_extend(sense_base, "form_of", infl_dts) 

2272 indexed_subglosses = indexed_subglosses[1:] 

2273 elif not infl_dts: 

2274 data_extend(sense_base, "tags", infl_tags) 

2275 indexed_subglosses = indexed_subglosses[1:] 

2276 

2277 # Create senses for remaining subglosses 

2278 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2279 gloss = gloss.strip() 

2280 if not gloss and len(indexed_subglosses) > 1: 2280 ↛ 2281line 2280 didn't jump to line 2281 because the condition on line 2280 was never true

2281 continue 

2282 # Push a new sense (if the last one is not empty) 

2283 if push_sense(sorting_ordinal): 2283 ↛ 2284line 2283 didn't jump to line 2284 because the condition on line 2283 was never true

2284 added = True 

2285 # if gloss not in sense_data.get("raw_glosses", ()): 

2286 # data_append(sense_data, "raw_glosses", gloss) 

2287 if i == 0 and examples: 

2288 # In a multi-line gloss, associate examples 

2289 # with only one of them. 

2290 # XXX or you could use gloss_i == len(indexed_subglosses) 

2291 # to associate examples with the *last* one. 

2292 data_extend(sense_data, "examples", examples) 

2293 if gloss.startswith("; ") and gloss_i > 0: 2293 ↛ 2294line 2293 didn't jump to line 2294 because the condition on line 2293 was never true

2294 gloss = gloss[1:].strip() 

2295 # If the gloss starts with †, mark as obsolete 

2296 if gloss.startswith("^†"): 2296 ↛ 2297line 2296 didn't jump to line 2297 because the condition on line 2296 was never true

2297 data_append(sense_data, "tags", "obsolete") 

2298 gloss = gloss[2:].strip() 

2299 elif gloss.startswith("^‡"): 2299 ↛ 2300line 2299 didn't jump to line 2300 because the condition on line 2299 was never true

2300 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2301 gloss = gloss[2:].strip() 

2302 # Copy data for all senses to this sense 

2303 for k, v in sense_base.items(): 

2304 if isinstance(v, (list, tuple)): 

2305 if k != "tags": 

2306 # Tags handled below (countable/uncountable special) 

2307 data_extend(sense_data, k, v) 

2308 else: 

2309 assert k not in ("tags", "categories", "topics") 

2310 sense_data[k] = v # type:ignore[literal-required] 

2311 # Parse the gloss for this particular sense 

2312 m = QUALIFIERS_RE.match(gloss) 

2313 # (...): ... or (...(...)...): ... 

2314 if m: 

2315 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2316 gloss = gloss[m.end() :].strip() 

2317 

2318 # Remove common suffix "[from 14th c.]" and similar 

2319 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2320 

2321 # Check to make sure we don't have unhandled list items in gloss 

2322 ofs = max(gloss.find("#"), gloss.find("* ")) 

2323 if ofs > 10 and "(#)" not in gloss: 

2324 wxr.wtp.debug( 

2325 "gloss may contain unhandled list items: {}".format(gloss), 

2326 sortid="page/1412", 

2327 ) 

2328 elif "\n" in gloss: 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true

2329 wxr.wtp.debug( 

2330 "gloss contains newline: {}".format(gloss), 

2331 sortid="page/1416", 

2332 ) 

2333 

2334 # Kludge, some glosses have a comma after initial qualifiers in 

2335 # parentheses 

2336 if gloss.startswith((",", ":")): 

2337 gloss = gloss[1:] 

2338 gloss = gloss.strip() 

2339 if gloss.endswith(":"): 

2340 gloss = gloss[:-1].strip() 

2341 if gloss.startswith("N. of "): 2341 ↛ 2342line 2341 didn't jump to line 2342 because the condition on line 2341 was never true

2342 gloss = "Name of " + gloss[6:] 

2343 if gloss.startswith("†"): 2343 ↛ 2344line 2343 didn't jump to line 2344 because the condition on line 2343 was never true

2344 data_append(sense_data, "tags", "obsolete") 

2345 gloss = gloss[1:] 

2346 elif gloss.startswith("^†"): 2346 ↛ 2347line 2346 didn't jump to line 2347 because the condition on line 2346 was never true

2347 data_append(sense_data, "tags", "obsolete") 

2348 gloss = gloss[2:] 

2349 

2350 # Copy tags from sense_base if any. This will not copy 

2351 # countable/uncountable if either was specified in the sense, 

2352 # as sometimes both are specified in word head but only one 

2353 # in individual senses. 

2354 countability_tags = [] 

2355 base_tags = sense_base.get("tags", ()) 

2356 sense_tags = sense_data.get("tags", ()) 

2357 for tag in base_tags: 

2358 if tag in ("countable", "uncountable"): 

2359 if tag not in countability_tags: 2359 ↛ 2361line 2359 didn't jump to line 2361 because the condition on line 2359 was always true

2360 countability_tags.append(tag) 

2361 continue 

2362 if tag not in sense_tags: 

2363 data_append(sense_data, "tags", tag) 

2364 if countability_tags: 

2365 if ( 2365 ↛ 2374line 2365 didn't jump to line 2374 because the condition on line 2365 was always true

2366 "countable" not in sense_tags 

2367 and "uncountable" not in sense_tags 

2368 ): 

2369 data_extend(sense_data, "tags", countability_tags) 

2370 

2371 # If outer gloss specifies a form-of ("inflection of", see 

2372 # aquamarine/German), try to parse the inner glosses as 

2373 # tags for an inflected form. 

2374 if "form-of" in sense_base.get("tags", ()): 

2375 parsed = parse_alt_or_inflection_of( 

2376 wxr, gloss, gloss_template_args 

2377 ) 

2378 if parsed is not None: 2378 ↛ 2384line 2378 didn't jump to line 2384 because the condition on line 2378 was always true

2379 infl_tags, infl_dts = parsed 

2380 if not infl_dts and infl_tags: 2380 ↛ 2384line 2380 didn't jump to line 2384 because the condition on line 2380 was always true

2381 # Interpret as a particular form under "inflection of" 

2382 data_extend(sense_data, "tags", infl_tags) 

2383 

2384 if not gloss: 2384 ↛ 2385line 2384 didn't jump to line 2385 because the condition on line 2384 was never true

2385 data_append(sense_data, "tags", "empty-gloss") 

2386 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 

2387 if ( 2387 ↛ 2398line 2387 didn't jump to line 2398 because the condition on line 2387 was always true

2388 gloss_i == 0 

2389 and len(sense_data.get("glosses", tuple())) >= 1 

2390 ): 

2391 # If we added a "high-level gloss" from rawgloss, but this 

2392 # is that same gloss_i, add this instead of the raw_gloss 

2393 # from before if they're different: the rawgloss was not 

2394 # cleaned exactly the same as this later gloss 

2395 sense_data["glosses"][-1] = gloss 

2396 else: 

2397 # Add the gloss for the sense. 

2398 data_append(sense_data, "glosses", gloss) 

2399 

2400 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2401 # two abbreviations in the same sense, both generated by the 

2402 # {{abbreviation of|...}} template. Handle these with some magic. 
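            # For example, a gloss like "Abbreviation of foo. Abbreviation
            # of bar." is split below at each "Abbreviation of" so that each
            # piece can be parsed as its own alt-of/form-of entry.
            # (Illustrative gloss only.)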

2403 position = 0 

2404 split_glosses = [] 

2405 for m in re.finditer(r"Abbreviation of ", gloss): 

2406 if m.start() != position: 2406 ↛ 2405line 2406 didn't jump to line 2405 because the condition on line 2406 was always true

2407 split_glosses.append(gloss[position : m.start()]) 

2408 position = m.start() 

2409 split_glosses.append(gloss[position:]) 

2410 for gloss in split_glosses: 

2411 # Check if this gloss describes an alt-of or inflection-of 

2412 if ( 

2413 lang_code != "en" 

2414 and " " not in gloss 

2415 and distw([word], gloss) < 0.3 

2416 ): 

2417 # Don't try to parse gloss if it is one word 

2418 # that is close to the word itself for non-English words 

2419 # (probable translations of a tag/form name) 

2420 continue 

2421 parsed = parse_alt_or_inflection_of( 

2422 wxr, gloss, gloss_template_args 

2423 ) 

2424 if parsed is None: 

2425 continue 

2426 tags, dts = parsed 

2427 if not dts and tags: 

2428 data_extend(sense_data, "tags", tags) 

2429 continue 

2430 for dt in dts: # type:ignore[union-attr] 

2431 ftags = list(tag for tag in tags if tag != "form-of") 

2432 if "alt-of" in tags: 

2433 data_extend(sense_data, "tags", ftags) 

2434 data_append(sense_data, "alt_of", dt) 

2435 elif "compound-of" in tags: 2435 ↛ 2436line 2435 didn't jump to line 2436 because the condition on line 2435 was never true

2436 data_extend(sense_data, "tags", ftags) 

2437 data_append(sense_data, "compound_of", dt) 

2438 elif "synonym-of" in tags: 2438 ↛ 2439line 2438 didn't jump to line 2439 because the condition on line 2438 was never true

2439 data_extend(dt, "tags", ftags) 

2440 data_append(sense_data, "synonyms", dt) 

2441 elif tags and dt.get("word", "").startswith("of "): 2441 ↛ 2442line 2441 didn't jump to line 2442 because the condition on line 2441 was never true

2442 dt["word"] = dt["word"][3:] 

2443 data_append(sense_data, "tags", "form-of") 

2444 data_extend(sense_data, "tags", ftags) 

2445 data_append(sense_data, "form_of", dt) 

2446 elif "form-of" in tags: 2446 ↛ 2430line 2446 didn't jump to line 2430 because the condition on line 2446 was always true

2447 data_extend(sense_data, "tags", tags) 

2448 data_append(sense_data, "form_of", dt) 

2449 

2450 if len(sense_data) == 0: 

2451 if len(sense_base.get("tags", [])) == 0: 2451 ↛ 2453line 2451 didn't jump to line 2453 because the condition on line 2451 was always true

2452 del sense_base["tags"] 

2453 sense_data.update(sense_base) 

2454 if push_sense(sorting_ordinal): 2454 ↛ 2458line 2454 didn't jump to line 2458 because the condition on line 2454 was always true

2455            # push_sense succeeded in adding a sense to pos_data 

2456 added = True 

2457 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2458 return added 

2459 

2460 def parse_inflection( 

2461 node: WikiNode, section: str, pos: Optional[str] 

2462 ) -> None: 

2463 """Parses inflection data (declension, conjugation) from the given 

2464 page. This retrieves the actual inflection template 

2465 parameters, which are very useful for applications that need 

2466 to learn the inflection classes and generate inflected 

2467 forms.""" 

2468 assert isinstance(node, WikiNode) 

2469 assert isinstance(section, str) 

2470 assert pos is None or isinstance(pos, str) 

2471 # print("parse_inflection:", node) 

2472 

2473 if pos is None: 2473 ↛ 2474line 2473 didn't jump to line 2474 because the condition on line 2473 was never true

2474 wxr.wtp.debug( 

2475 "inflection table outside part-of-speech", sortid="page/1812" 

2476 ) 

2477 return 

2478 

2479 def inflection_template_fn( 

2480 name: str, ht: TemplateArgs 

2481 ) -> Optional[str]: 

2482 # print("decl_conj_template_fn", name, ht) 

2483 if is_panel_template(wxr, name): 2483 ↛ 2484line 2483 didn't jump to line 2484 because the condition on line 2483 was never true

2484 return "" 

2485 if name in ("is-u-mutation",): 2485 ↛ 2488line 2485 didn't jump to line 2488 because the condition on line 2485 was never true

2486 # These are not to be captured as an exception to the 

2487 # generic code below 

2488 return None 

2489 m = re.search( 

2490 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2491 r"declension|inflection|mut|mutation)($|-)", 

2492 name, 

2493 ) 

2494 if m: 

2495 args_ht = clean_template_args(wxr, ht) 

2496 dt = {"name": name, "args": args_ht} 

2497 data_append(pos_data, "inflection_templates", dt) 

2498 

2499 return None 

2500 

2501 # Convert the subtree back to Wikitext, then expand all and parse, 

2502 # capturing templates in the process 

2503 text = wxr.wtp.node_to_wikitext(node.children) 

2504 

2505        # Split text into separate sections for each top-level template 

2506 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text) 

2507 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"] 

2508 # The (?:...) creates a non-capturing regex group; if it was capturing, 

2509 # like the group around it, it would create elements in brace_matches, 

2510 # including None if it doesn't match. 

2511 # 20250114: Added {| and |} into the regex because tables were being 

2512 # cut into pieces by this code. Issue #973, introduction of two-part 

2513        # book-end templates similar to trans-top and trans-bottom. 

2514 template_sections = [] 

2515 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2516 # Because there is the possibility of triple curly braces 

2517 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2518 # count nesting depth using pairs of two brackets, but 

2519 # instead use singular braces ("{ }"). 

2520 # Because template delimiters should be balanced, regardless 

2521 # of whether {{ or {{{ is used, and because we only care 

2522 # about the outer-most delimiters (the highest level template) 

2523 # we can just count the single braces when those single 

2524 # braces are part of a group. 

2525 table_nesting = 0 

2526 # However, if we have a stray table ({| ... |}) that should always 

2527 # be its own section, and should prevent templates from cutting it 

2528 # into sections. 
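        # For example, a section "{{xx-conj|a}}\nnotes\n{{xx-table|b}}"
        # (hypothetical template names) would be split into two pieces,
        # "{{xx-conj|a}}\nnotes\n" and "{{xx-table|b}}": text between
        # templates stays with the preceding template, while text before the
        # first template is kept with that first template.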

2529 

2530 # print(f"Parse inflection: {text=}") 

2531 # print(f"Brace matches: {repr('///'.join(brace_matches))}") 

2532 if len(brace_matches) > 1: 

2533 tsection: list[str] = [] 

2534 after_templates = False # kludge to keep any text 

2535 # before first template 

2536 # with the first template; 

2537 # otherwise, text 

2538 # goes with preceding template 

2539 for m in brace_matches: 

2540 if m.startswith("\n; ") and after_templates: 2540 ↛ 2541line 2540 didn't jump to line 2541 because the condition on line 2540 was never true

2541 after_templates = False 

2542 template_sections.append(tsection) 

2543 tsection = [] 

2544 tsection.append(m) 

2545 elif m.startswith("{{") or m.endswith("{|"): 

2546 if ( 

2547 template_nesting == 0 

2548 and after_templates 

2549 and table_nesting == 0 

2550 ): 

2551 template_sections.append(tsection) 

2552 tsection = [] 

2553 # start new section 

2554 after_templates = True 

2555 if m.startswith("{{"): 

2556 template_nesting += 1 

2557 else: 

2558 # m.endswith("{|") 

2559 table_nesting += 1 

2560 tsection.append(m) 

2561 elif m.startswith("}}") or m.endswith("|}"): 

2562 if m.startswith("}}"): 

2563 template_nesting -= 1 

2564 if template_nesting < 0: 2564 ↛ 2565line 2564 didn't jump to line 2565 because the condition on line 2564 was never true

2565 wxr.wtp.error( 

2566 "Negatively nested braces, " 

2567 "couldn't split inflection templates, " 

2568 "{}/{} section {}".format( 

2569 word, language, section 

2570 ), 

2571 sortid="page/1871", 

2572 ) 

2573 template_sections = [] # use whole text 

2574 break 

2575 else: 

2576 table_nesting -= 1 

2577 if table_nesting < 0: 2577 ↛ 2578line 2577 didn't jump to line 2578 because the condition on line 2577 was never true

2578 wxr.wtp.error( 

2579 "Negatively nested table braces, " 

2580 "couldn't split inflection section, " 

2581 "{}/{} section {}".format( 

2582 word, language, section 

2583 ), 

2584 sortid="page/20250114", 

2585 ) 

2586 template_sections = [] # use whole text 

2587 break 

2588 tsection.append(m) 

2589 else: 

2590 tsection.append(m) 

2591 if tsection: # dangling tsection 2591 ↛ 2599line 2591 didn't jump to line 2599 because the condition on line 2591 was always true

2592 template_sections.append(tsection) 

2593 # Why do it this way around? The parser has a preference 

2594 # to associate bits outside of tables with the preceding 

2595 # table (`after`-variable), so a new tsection begins 

2596 # at {{ and everything before it belongs to the previous 

2597 # template. 

2598 

2599 texts = [] 

2600 if not template_sections: 

2601 texts = [text] 

2602 else: 

2603 for tsection in template_sections: 

2604 texts.append("".join(tsection)) 

2605 if template_nesting != 0: 2605 ↛ 2606line 2605 didn't jump to line 2606 because the condition on line 2605 was never true

2606 wxr.wtp.error( 

2607 "Template nesting error: " 

2608 "template_nesting = {} " 

2609 "couldn't split inflection templates, " 

2610 "{}/{} section {}".format( 

2611 template_nesting, word, language, section 

2612 ), 

2613 sortid="page/1896", 

2614 ) 

2615 texts = [text] 

2616 for text in texts: 

2617 tree = wxr.wtp.parse( 

2618 text, expand_all=True, template_fn=inflection_template_fn 

2619 ) 

2620 

2621 if not text.strip(): 

2622 continue 

2623 

2624 # Parse inflection tables from the section. The data is stored 

2625 # under "forms". 

2626 if wxr.config.capture_inflections: 2626 ↛ 2616line 2626 didn't jump to line 2616 because the condition on line 2626 was always true

2627 tablecontext = None 

2628 m = re.search(r"{{([^}{|]+)\|?", text) 

2629 if m: 

2630 template_name = m.group(1) 

2631 tablecontext = TableContext(template_name) 
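                # For example, if the section text begins with a hypothetical
                # "{{xx-decl-table|...}}", the name "xx-decl-table" captured
                # above is passed to parse_inflection_section() as its
                # TableContext.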

2632 

2633 parse_inflection_section( 

2634 wxr, 

2635 pos_data, 

2636 word, 

2637 language, 

2638 pos, 

2639 section, 

2640 tree, 

2641 tablecontext=tablecontext, 

2642 ) 

2643 

2644 def get_subpage_section( 

2645 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]] 

2646 ) -> Optional[Union[WikiNode, str]]: 

2647 """Loads a subpage of the given page, and finds the section 

2648 for the given language, part-of-speech, and section title. This 

2649 is used for finding translations and other sections on subpages.""" 
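        # For example, a seq might (hypothetically) be
        # ["Finnish", "Etymology 2", "Noun", "Translations"]: a chain of
        # section titles that is followed downwards on the subpage to find
        # the wanted section.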

2650 assert isinstance(language, str) 

2651 assert isinstance(title, str) 

2652 assert isinstance(subtitle, str) 

2653 assert isinstance(seqs, (list, tuple)) 

2654 for seq in seqs: 

2655 for x in seq: 

2656 assert isinstance(x, str) 

2657 subpage_title = word + "/" + subtitle 

2658 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2659 if subpage_content is None: 

2660 wxr.wtp.error( 

2661 "/translations not found despite " 

2662 "{{see translation subpage|...}}", 

2663 sortid="page/1934", 

2664 ) 

2665 return None 

2666 

2667 def recurse( 

2668 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2669 ) -> Optional[Union[str, WikiNode]]: 

2670 # print(f"seq: {seq}") 

2671 if not seq: 

2672 return node 

2673 if not isinstance(node, WikiNode): 

2674 return None 

2675 # print(f"node.kind: {node.kind}") 

2676 if node.kind in LEVEL_KINDS: 

2677 t = clean_node(wxr, None, node.largs[0]) 

2678 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2679 if t.lower() == seq[0].lower(): 

2680 seq = seq[1:] 

2681 if not seq: 

2682 return node 

2683 for n in node.children: 

2684 ret = recurse(n, seq) 

2685 if ret is not None: 

2686 return ret 

2687 return None 

2688 

2689 tree = wxr.wtp.parse( 

2690 subpage_content, 

2691 pre_expand=True, 

2692 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2693 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2694 ) 

2695 assert tree.kind == NodeKind.ROOT 

2696 for seq in seqs: 

2697 ret = recurse(tree, seq) 

2698 if ret is None: 

2699 wxr.wtp.debug( 

2700 "Failed to find subpage section {}/{} seq {}".format( 

2701 title, subtitle, seq 

2702 ), 

2703 sortid="page/1963", 

2704 ) 

2705 return ret 

2706 

2707 def parse_linkage( 

2708 data: WordData, field: str, linkagenode: LevelNode 

2709 ) -> None: 

2710 assert isinstance(data, dict) 

2711 assert isinstance(field, str) 

2712 assert isinstance(linkagenode, WikiNode) 

2713 # if field == "synonyms": 

2714 # print("field", field) 

2715 # print("data", data) 

2716 # print("children:") 

2717 # print(linkagenode.children) 

2718 if not wxr.config.capture_linkages: 2718 ↛ 2719line 2718 didn't jump to line 2719 because the condition on line 2718 was never true

2719 return 

2720 have_panel_template = False 

2721 toplevel_text = [] 

2722 next_navframe_sense = None # Used for "(sense):" before NavFrame 

2723 

2724 def parse_linkage_item( 

2725 contents: list[Union[str, WikiNode]], 

2726 field: str, 

2727 sense: Optional[str] = None, 

2728 ): 

2729 assert isinstance(contents, (list, tuple)) 

2730 assert isinstance(field, str) 

2731 assert sense is None or isinstance(sense, str) 

2732 

2733 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

2734 # .format(field, sense, contents)) 

2735 

2736 parts: list[str] = [] 

2737 ruby: list[tuple[str, str]] = [] 

2738 urls: list[str] = [] 

2739 # data about link text; this is used to skip splitting on 

2740 # linkage text items that contain stuff like commas; for 

2741 # example "Hunde, die bellen, beißen nicht" in article 

2742 # beißen is split into "Hunde", "die bellen" etc. 

2743 # We take that link text and use it, eventually, 

2744 # in split_at_comma_semi to skip splitting on those 

2745 # commas. 

2746 links_that_should_not_be_split: list[str] = [] 

2747 

2748 def item_recurse( 

2749 contents: list[Union[str, WikiNode]], italic=False 

2750 ) -> None: 

2751 assert isinstance(contents, (list, tuple)) 

2752 nonlocal sense 

2753 nonlocal ruby 

2754 nonlocal parts 

2755 # print("ITEM_RECURSE:", contents) 

2756 for node in contents: 

2757 if isinstance(node, str): 

2758 parts.append(node) 

2759 continue 

2760 kind = node.kind 

2761 # print("ITEM_RECURSE KIND:", kind, 

2762 # node.sarg if node.sarg else node.largs) 

2763 if kind == NodeKind.LIST: 

2764 if parts: 2764 ↛ 2779line 2764 didn't jump to line 2779 because the condition on line 2764 was always true

2765 sense1: Optional[str] 

2766 sense1 = clean_node(wxr, None, parts) 

2767 if sense1.endswith(":"): 

2768 sense1 = sense1[:-1].strip() 

2769 if sense1.startswith("(") and sense1.endswith(")"): 2769 ↛ 2770line 2769 didn't jump to line 2770 because the condition on line 2769 was never true

2770 sense1 = sense1[1:-1].strip() 

2771 if sense1.lower() == TRANSLATIONS_TITLE: 2771 ↛ 2772line 2771 didn't jump to line 2772 because the condition on line 2771 was never true

2772 sense1 = None 

2773 # print("linkage item_recurse LIST sense1:", sense1) 

2774 parse_linkage_recurse( 

2775 node.children, field, sense=sense1 or sense 

2776 ) 

2777 parts = [] 

2778 else: 

2779 parse_linkage_recurse(node.children, field, sense) 

2780 elif kind in ( 2780 ↛ 2785line 2780 didn't jump to line 2785 because the condition on line 2780 was never true

2781 NodeKind.TABLE, 

2782 NodeKind.TABLE_ROW, 

2783 NodeKind.TABLE_CELL, 

2784 ): 

2785 parse_linkage_recurse(node.children, field, sense) 

2786 elif kind in ( 2786 ↛ 2790line 2786 didn't jump to line 2790 because the condition on line 2786 was never true

2787 NodeKind.TABLE_HEADER_CELL, 

2788 NodeKind.TABLE_CAPTION, 

2789 ): 

2790 continue 

2791 elif kind == NodeKind.HTML: 2791 ↛ 2792line 2791 didn't jump to line 2792 because the condition on line 2791 was never true

2792 classes = (node.attrs.get("class") or "").split() 

2793 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2794 continue 

2795 elif node.sarg == "ruby": 

2796 rb = parse_ruby(wxr, node) 

2797 if rb: 

2798 ruby.append(rb) 

2799 parts.append(rb[0]) 

2800 continue 

2801 elif node.sarg == "math": 

2802 parts.append(clean_node(wxr, None, node)) 

2803 continue 

2804 elif "interProject" in classes: 

2805 continue # These do not seem to be displayed 

2806 if "NavFrame" in classes: 

2807 parse_linkage_recurse(node.children, field, sense) 

2808 else: 

2809 item_recurse(node.children, italic=italic) 

2810 elif kind == NodeKind.ITALIC: 

2811 item_recurse(node.children, italic=True) 

2812 elif kind == NodeKind.LINK: 

2813 ignore = False 

2814 if isinstance(node.largs[0][0], str): 2814 ↛ 2756line 2814 didn't jump to line 2756 because the condition on line 2814 was always true

2815 v1 = node.largs[0][0].strip().lower() 

2816 if v1.startswith( 2816 ↛ 2820line 2816 didn't jump to line 2820 because the condition on line 2816 was never true

2817 ns_title_prefix_tuple(wxr, "Category", True) 

2818 + ns_title_prefix_tuple(wxr, "File", True) 

2819 ): 

2820 ignore = True 

2821 if not ignore: 2821 ↛ 2756line 2821 didn't jump to line 2756 because the condition on line 2821 was always true

2822 v = node.largs[-1] 

2823 if ( 

2824 len(node.largs) == 1 

2825 and len(v) > 0 

2826 and isinstance(v[0], str) 

2827 and v[0][0] == ":" 

2828 ): 

2829 v = [v[0][1:]] + list(v[1:]) # type:ignore 

2830 if isinstance(v[0], str) and not v[0].isalnum(): 

2831 links_that_should_not_be_split.append( 

2832 "".join(v[0]) 

2833 ) # type: ignore 

2834 item_recurse(v, italic=italic) 

2835 elif kind == NodeKind.URL: 

2836 if len(node.largs) < 2 and node.largs: 

2837 # Naked url captured 

2838 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

2839 continue 

2840 if len(node.largs) == 2: 2840 ↛ 2845line 2840 didn't jump to line 2845 because the condition on line 2840 was always true

2841 # Url from link with text 

2842 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

2843 # print(f"{node.largs=!r}") 

2844 # print("linkage recurse URL {}".format(node)) 

2845 item_recurse(node.largs[-1], italic=italic) 

2846 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 2846 ↛ 2849line 2846 didn't jump to line 2849 because the condition on line 2846 was always true

2847 item_recurse(node.children, italic=italic) 

2848 else: 

2849 wxr.wtp.debug( 

2850 "linkage item_recurse unhandled {}: {}".format( 

2851 node.kind, node 

2852 ), 

2853 sortid="page/2073", 

2854 ) 

2855 

2856 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

2857 # .format(contents)) 

2858 

2859 item_recurse(contents) 

2860 item = clean_node(wxr, None, parts) 

2861 # print("LINKAGE ITEM CONTENTS:", parts) 

2862 # print("CLEANED ITEM: {!r}".format(item)) 

2863 # print(f"URLS {urls=!r}") 

2864 

2865 return parse_linkage_item_text( 

2866 wxr, 

2867 word, 

2868 data, 

2869 field, 

2870 item, 

2871 sense, 

2872 ruby, 

2873 sense_datas, 

2874 is_reconstruction, 

2875 urls or None, 

2876 links_that_should_not_be_split or None, 

2877 ) 

2878 

2879 def parse_linkage_recurse( 

2880 contents: list[Union[WikiNode, str]], 

2881 field: str, 

2882 sense: Optional[str], 

2883 ) -> None: 

2884 assert isinstance(contents, (list, tuple)) 

2885 assert sense is None or isinstance(sense, str) 

2886 nonlocal next_navframe_sense 

2887 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

2888 for node in contents: 

2889 if isinstance(node, str): 

2890 # Ignore top-level text, generally comments before the 

2891 # linkages list. However, if no linkages are found, then 

2892 # use this for linkages (not all words use bullet points 

2893 # for linkages). 

2894 toplevel_text.append(node) 

2895 continue 

2896 assert isinstance(node, WikiNode) 

2897 kind = node.kind 

2898 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

2899 if kind == NodeKind.LIST: 

2900 parse_linkage_recurse(node.children, field, sense) 

2901 elif kind == NodeKind.LIST_ITEM: 

2902 v = parse_linkage_item(node.children, field, sense) 

2903 if v: 2903 ↛ 2907line 2903 didn't jump to line 2907 because the condition on line 2903 was never true

2904 # parse_linkage_item() can return a value that should 

2905 # be used as the sense for the follow-on linkages, 

2906 # which are typically provided in a table (see 滿) 

2907 next_navframe_sense = v 

2908 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

2909 parse_linkage_recurse(node.children, field, sense) 

2910 elif kind == NodeKind.TABLE_CELL: 

2911 parse_linkage_item(node.children, field, sense) 

2912 elif kind in ( 

2913 NodeKind.TABLE_CAPTION, 

2914 NodeKind.TABLE_HEADER_CELL, 

2915 NodeKind.PREFORMATTED, 

2916 NodeKind.BOLD, 

2917 ): 

2918 continue 

2919 elif kind == NodeKind.HTML: 2919 ↛ 2921line 2919 didn't jump to line 2921 because the condition on line 2919 was never true

2920 # Recurse to process inside the HTML for most tags 

2921 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2922 continue 

2923 classes = (node.attrs.get("class") or "").split() 

2924 if node.sarg == "li": 

2925 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑ 

2926 v = parse_linkage_item(node.children, field, sense) 

2927 if v: 

2928 next_navframe_sense = v 

2929 elif "qualifier-content" in classes: 

2930 sense1 = clean_node(wxr, None, node.children) 

2931 if sense1.endswith(":"): 

2932 sense1 = sense1[:-1].strip() 

2933 if sense and sense1: 

2934 wxr.wtp.debug( 

2935 "linkage qualifier-content on multiple " 

2936 "levels: {!r} and {!r}".format(sense, sense1), 

2937 sortid="page/2170", 

2938 ) 

2939 parse_linkage_recurse(node.children, field, sense1) 

2940 elif "NavFrame" in classes: 

2941 # NavFrame uses previously assigned next_navframe_sense 

2942 # (from a "(sense):" item) and clears it afterwards 

2943 parse_linkage_recurse( 

2944 node.children, field, sense or next_navframe_sense 

2945 ) 

2946 next_navframe_sense = None 

2947 else: 

2948 parse_linkage_recurse(node.children, field, sense) 

2949 elif kind in LEVEL_KINDS: 2949 ↛ 2951line 2949 didn't jump to line 2951 because the condition on line 2949 was never true

2950 # Just recurse to any possible subsections 

2951 parse_linkage_recurse(node.children, field, sense) 

2952 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

2953 # Skip these on top level; at least sometimes bold is 

2954 # used for indicating a subtitle 

2955 continue 

2956 elif kind == NodeKind.LINK: 2956 ↛ 2962line 2956 didn't jump to line 2962 because the condition on line 2956 was always true

2957 # Recurse into the last argument 

2958 # Apparently ":/" is used as a link to "/", so strip 

2959 # initial value 

2960 parse_linkage_recurse(node.largs[-1], field, sense) 

2961 else: 

2962 wxr.wtp.debug( 

2963 "parse_linkage_recurse unhandled {}: {}".format( 

2964 kind, node 

2965 ), 

2966 sortid="page/2196", 

2967 ) 

2968 

2969 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

2970 nonlocal have_panel_template 

2971 if is_panel_template(wxr, name): 

2972 have_panel_template = True 

2973 return "" 

2974 return None 

2975 

2976 # Main body of parse_linkage() 

2977 l_nodes = [] 

2978 l_sense = "" 

2979 for node in linkagenode.children: 

2980 if ( 

2981 isinstance(node, TemplateNode) 

2982 and node.template_name == "zh-dial" 

2983 ): 

2984 extract_zh_dial_template(wxr, data, node, l_sense) 

2985 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

2986 for list_item in node.find_child(NodeKind.LIST_ITEM): 

2987 for t_node in list_item.find_child(NodeKind.TEMPLATE): 

2988 if t_node.template_name in ["s", "sense"]: 

2989 l_sense = clean_node(wxr, None, t_node).strip( 

2990 "(): " 

2991 ) 

2992 l_nodes.append(node) 

2993 else: 

2994 l_nodes.append(node) 

2995 text = wxr.wtp.node_to_wikitext(l_nodes) 

2996 parsed = wxr.wtp.parse( 

2997 text, expand_all=True, template_fn=linkage_template_fn1 

2998 ) 

2999 parse_linkage_recurse(parsed.children, field, None) 
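        # Fallback: if no linkages were collected from the lists above and
        # the top-level text looks like a plain comma-separated enumeration
        # (several commas, no newlines, not a "See ..." pointer), parse that
        # text as a single linkage item.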

3000 if not data.get(field) and not have_panel_template: 

3001 text = "".join(toplevel_text).strip() 

3002 if "\n" not in text and "," in text and text.count(",") > 3: 

3003 if not text.startswith("See "): 3003 ↛ exitline 3003 didn't return from function 'parse_linkage' because the condition on line 3003 was always true

3004 parse_linkage_item([text], field, None) 

3005 

3006 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

3007 """Parses translations for a word. This may also pull in translations 

3008 from separate translation subpages.""" 

3009 assert isinstance(data, dict) 

3010 assert isinstance(xlatnode, WikiNode) 

3011 # print("===== PARSE_TRANSLATIONS {} {} {}" 

3012 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

3013 # print("parse_translations xlatnode={}".format(xlatnode)) 

3014 if not wxr.config.capture_translations: 3014 ↛ 3015line 3014 didn't jump to line 3015 because the condition on line 3014 was never true

3015 return 

3016 sense_parts: list[Union[WikiNode, str]] = [] 

3017 sense: Optional[str] = None 

3018 

3019 def parse_translation_item( 

3020 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

3021 ) -> None: 

3022 nonlocal sense 

3023 assert isinstance(contents, list) 

3024 assert lang is None or isinstance(lang, str) 

3025 # print("PARSE_TRANSLATION_ITEM:", contents) 

3026 

3027 langcode: Optional[str] = None 

3028 if sense is None: 

3029 sense = clean_node(wxr, data, sense_parts).strip() 

3030 # print("sense <- clean_node: ", sense) 

3031 idx = sense.find("See also translations at") 

3032 if idx > 0: 3032 ↛ 3033line 3032 didn't jump to line 3033 because the condition on line 3032 was never true

3033 wxr.wtp.debug( 

3034 "Skipping translation see also: {}".format(sense), 

3035 sortid="page/2361", 

3036 ) 

3037 sense = sense[:idx].strip() 

3038 if sense.endswith(":"): 3038 ↛ 3039line 3038 didn't jump to line 3039 because the condition on line 3038 was never true

3039 sense = sense[:-1].strip() 

3040 if sense.endswith("—"): 3040 ↛ 3041line 3040 didn't jump to line 3041 because the condition on line 3040 was never true

3041 sense = sense[:-1].strip() 

3042 translations_from_template: list[str] = [] 

3043 

3044 def translation_item_template_fn( 

3045 name: str, ht: TemplateArgs 

3046 ) -> Optional[str]: 

3047 nonlocal langcode 

3048 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

3049 if is_panel_template(wxr, name): 

3050 return "" 

3051 if name in ("t+check", "t-check", "t-needed"): 

3052 # We ignore these templates. They seem to have outright 

3053                    # garbage in some entries, and widely varying formatting in 

3054 # others. These should be transitory and unreliable 

3055 # anyway. 

3056 return "__IGNORE__" 

3057 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

3058 code = ht.get(1) 

3059 if code: 3059 ↛ 3069line 3059 didn't jump to line 3069 because the condition on line 3059 was always true

3060 if langcode and code != langcode: 

3061 wxr.wtp.debug( 

3062 "inconsistent language codes {} vs " 

3063 "{} in translation item: {!r} {}".format( 

3064 langcode, code, name, ht 

3065 ), 

3066 sortid="page/2386", 

3067 ) 

3068 langcode = code 

3069 tr = ht.get(2) 

3070 if tr: 

3071 tr = clean_node(wxr, None, [tr]) 

3072 translations_from_template.append(tr) 

3073 return None 

3074 if name == "t-egy": 

3075 langcode = "egy" 

3076 return None 

3077 if name == "ttbc": 

3078 code = ht.get(1) 

3079 if code: 3079 ↛ 3081line 3079 didn't jump to line 3081 because the condition on line 3079 was always true

3080 langcode = code 

3081 return None 

3082 if name == "trans-see": 3082 ↛ 3083line 3082 didn't jump to line 3083 because the condition on line 3082 was never true

3083 wxr.wtp.error( 

3084 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

3085 ) 

3086 return "" 

3087 if name.endswith("-top"): 3087 ↛ 3088line 3087 didn't jump to line 3088 because the condition on line 3087 was never true

3088 return "" 

3089 if name.endswith("-bottom"): 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true

3090 return "" 

3091 if name.endswith("-mid"): 3091 ↛ 3092line 3091 didn't jump to line 3092 because the condition on line 3091 was never true

3092 return "" 

3093 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

3094 # .format(name), 

3095 # sortid="page/2414") 

3096 return None 

3097 

3098 sublists = list( 

3099 x 

3100 for x in contents 

3101 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

3102 ) 

3103 contents = list( 

3104 x 

3105 for x in contents 

3106 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

3107 ) 

3108 

3109 item = clean_node( 

3110 wxr, data, contents, template_fn=translation_item_template_fn 

3111 ) 

3112 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

3113 

3114 # Parse the translation item. 

3115 if item: 3115 ↛ exitline 3115 didn't return from function 'parse_translation_item' because the condition on line 3115 was always true

3116 lang = parse_translation_item_text( 

3117 wxr, 

3118 word, 

3119 data, 

3120 item, 

3121 sense, 

3122 lang, 

3123 langcode, 

3124 translations_from_template, 

3125 is_reconstruction, 

3126 ) 

3127 

3128 # Handle sublists. They are frequently used for different 

3129 # scripts for the language and different variants of the 

3130 # language. We will include the lower-level header as a 

3131 # tag in those cases. 

3132 for listnode in sublists: 

3133 assert listnode.kind == NodeKind.LIST 

3134 for node in listnode.children: 

3135 if not isinstance(node, WikiNode): 3135 ↛ 3136line 3135 didn't jump to line 3136 because the condition on line 3135 was never true

3136 continue 

3137 if node.kind == NodeKind.LIST_ITEM: 3137 ↛ 3134line 3137 didn't jump to line 3134 because the condition on line 3137 was always true

3138 parse_translation_item(node.children, lang=lang) 

3139 

3140 def parse_translation_template(node: WikiNode) -> None: 

3141 assert isinstance(node, WikiNode) 

3142 

3143 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3144 nonlocal sense_parts 

3145 nonlocal sense 

3146 if is_panel_template(wxr, name): 

3147 return "" 

3148 if name == "see also": 

3149 # XXX capture 

3150 # XXX for example, "/" has top-level list containing 

3151 # see also items. We should also parse those. 

3152 return "" 

3153 if name == "trans-see": 

3154 # XXX capture 

3155 return "" 

3156 if name == "see translation subpage": 3156 ↛ 3157line 3156 didn't jump to line 3157 because the condition on line 3156 was never true

3157 sense_parts = [] 

3158 sense = None 

3159 sub = ht.get(1, "") 

3160 if sub: 

3161 m = re.match( 

3162 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

3163 ) 

3164 else: 

3165 m = None 

3166 etym = "" 

3167 etym_numbered = "" 

3168 pos = "" 

3169 if m: 

3170 etym_numbered = m.group(1) 

3171 etym = m.group(2) 

3172 pos = m.group(3) 

3173 if not sub: 

3174 wxr.wtp.debug( 

3175 "no part-of-speech in " 

3176 "{{see translation subpage|...}}, " 

3177 "defaulting to just wxr.wtp.section " 

3178 "(= language)", 

3179 sortid="page/2468", 

3180 ) 

3181 # seq sent to get_subpage_section without sub and pos 

3182 seq = [ 

3183 language, 

3184 TRANSLATIONS_TITLE, 

3185 ] 

3186 elif ( 

3187 m 

3188 and etym.lower().strip() in ETYMOLOGY_TITLES 

3189 and pos.lower() in POS_TITLES 

3190 ): 

3191 seq = [ 

3192 language, 

3193 etym_numbered, 

3194 pos, 

3195 TRANSLATIONS_TITLE, 

3196 ] 

3197 elif sub.lower() in POS_TITLES: 

3198 # seq with sub but not pos 

3199 seq = [ 

3200 language, 

3201 sub, 

3202 TRANSLATIONS_TITLE, 

3203 ] 

3204 else: 

3205 # seq with sub and pos 

3206 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3207 if pos.lower() not in POS_TITLES: 

3208 wxr.wtp.debug( 

3209 "unhandled see translation subpage: " 

3210 "language={} sub={} " 

3211 "wxr.wtp.subsection={}".format( 

3212 language, sub, wxr.wtp.subsection 

3213 ), 

3214 sortid="page/2478", 

3215 ) 

3216 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

3217 subnode = get_subpage_section( 

3218 wxr.wtp.title or "MISSING_TITLE", 

3219 TRANSLATIONS_TITLE, 

3220 [seq], 

3221 ) 

3222 if subnode is None or not isinstance(subnode, WikiNode): 

3223 # Failed to find the normal subpage section 

3224 # seq with sub and pos 

3225 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3226 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}") 

3227 seqs: list[list[str] | tuple[str, ...]] = [ 

3228 [TRANSLATIONS_TITLE], 

3229 [language, pos], 

3230 ] 

3231 subnode = get_subpage_section( 

3232 wxr.wtp.title or "MISSING_TITLE", 

3233 TRANSLATIONS_TITLE, 

3234 seqs, 

3235 ) 

3236 if subnode is not None and isinstance(subnode, WikiNode): 

3237 parse_translations(data, subnode) 

3238 return "" 

3239 if name in ( 

3240 "c", 

3241 "C", 

3242 "categorize", 

3243 "cat", 

3244 "catlangname", 

3245 "topics", 

3246 "top", 

3247 "qualifier", 

3248 "cln", 

3249 ): 

3250 # These are expanded in the default way 

3251 return None 

3252 if name in ( 

3253 "trans-top", 

3254 "trans-top-see", 

3255 ): 

3256 # XXX capture id from trans-top? Capture sense here 

3257 # instead of trying to parse it from expanded content? 

3258 if ht.get(1): 

3259 sense_parts = [] 

3260 sense = ht.get(1) 

3261 else: 

3262 sense_parts = [] 

3263 sense = None 

3264 return None 

3265 if name in ( 

3266 "trans-bottom", 

3267 "trans-mid", 

3268 "checktrans-mid", 

3269 "checktrans-bottom", 

3270 ): 

3271 return None 

3272 if name == "checktrans-top": 

3273 sense_parts = [] 

3274 sense = None 

3275 return "" 

3276 if name == "trans-top-also": 

3277 # XXX capture? 

3278 sense_parts = [] 

3279 sense = None 

3280 return "" 

3281 wxr.wtp.error( 

3282 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

3283 name, ht 

3284 ), 

3285 sortid="page/2517", 

3286 ) 

3287 return "" 

3288 

3289 wxr.wtp.expand( 

3290 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

3291 ) 

3292 

3293 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

3294 nonlocal sense 

3295 nonlocal sense_parts 

3296 for node in xlatnode.children: 

3297 # print(node) 

3298 if isinstance(node, str): 

3299 if sense: 

3300 if not node.isspace(): 

3301 wxr.wtp.debug( 

3302 "skipping string in the middle of " 

3303 "translations: {}".format(node), 

3304 sortid="page/2530", 

3305 ) 

3306 continue 

3307 # Add a part to the sense 

3308 sense_parts.append(node) 

3309 sense = None 

3310 continue 

3311 assert isinstance(node, WikiNode) 

3312 kind = node.kind 

3313 if kind == NodeKind.LIST: 

3314 for item in node.children: 

3315 if not isinstance(item, WikiNode): 3315 ↛ 3316line 3315 didn't jump to line 3316 because the condition on line 3315 was never true

3316 continue 

3317 if item.kind != NodeKind.LIST_ITEM: 3317 ↛ 3318line 3317 didn't jump to line 3318 because the condition on line 3317 was never true

3318 continue 

3319 if item.sarg == ":": 3319 ↛ 3320line 3319 didn't jump to line 3320 because the condition on line 3319 was never true

3320 continue 

3321 parse_translation_item(item.children) 

3322 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3322 ↛ 3326line 3322 didn't jump to line 3326 because the condition on line 3322 was never true

3323 # Silently skip list items that are just indented; these 

3324 # are used for text between translations, such as indicating 

3325 # translations that need to be checked. 

3326 pass 

3327 elif kind == NodeKind.TEMPLATE: 

3328 parse_translation_template(node) 

3329 elif kind in ( 3329 ↛ 3334line 3329 didn't jump to line 3334 because the condition on line 3329 was never true

3330 NodeKind.TABLE, 

3331 NodeKind.TABLE_ROW, 

3332 NodeKind.TABLE_CELL, 

3333 ): 

3334 parse_translation_recurse(node) 

3335 elif kind == NodeKind.HTML: 

3336 if node.attrs.get("class") == "NavFrame": 3336 ↛ 3342line 3336 didn't jump to line 3342 because the condition on line 3336 was never true

3337 # Reset ``sense_parts`` (and force recomputing 

3338 # by clearing ``sense``) as each NavFrame specifies 

3339 # its own sense. This helps eliminate garbage coming 

3340 # from text at the beginning of the translations 

3341 # section. 

3342 sense_parts = [] 

3343 sense = None 

3344 # for item in node.children: 

3345 # if not isinstance(item, WikiNode): 

3346 # continue 

3347 # parse_translation_recurse(item) 

3348 parse_translation_recurse(node) 

3349 elif kind in LEVEL_KINDS: 3349 ↛ 3351line 3349 didn't jump to line 3351 because the condition on line 3349 was never true

3350 # Sub-levels will be recursed elsewhere 

3351 pass 

3352 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 

3353 parse_translation_recurse(node) 

3354 elif kind == NodeKind.PREFORMATTED: 3354 ↛ 3355line 3354 didn't jump to line 3355 because the condition on line 3354 was never true

3355 print("parse_translation_recurse: PREFORMATTED:", node) 

3356 elif kind == NodeKind.LINK: 3356 ↛ 3410line 3356 didn't jump to line 3410 because the condition on line 3356 was always true

3357 arg0 = node.largs[0] 

3358 # Kludge: I've seen occasional normal links to translation 

3359 # subpages from main pages (e.g., language/English/Noun 

3360 # in July 2021) instead of the normal 

3361 # {{see translation subpage|...}} template. This should 

3362 # handle them. Note: must be careful not to read other 

3363 # links, particularly things like in "human being": 

3364 # "a human being -- see [[man/translations]]" (group title) 

3365 if ( 3365 ↛ 3373line 3365 didn't jump to line 3373 because the condition on line 3365 was never true

3366 isinstance(arg0, (list, tuple)) 

3367 and arg0 

3368 and isinstance(arg0[0], str) 

3369 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3370 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3371 == wxr.wtp.title 

3372 ): 

3373 wxr.wtp.debug( 

3374 "translations subpage link found on main " 

3375 "page instead " 

3376 "of normal {{see translation subpage|...}}", 

3377 sortid="page/2595", 

3378 ) 

3379 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3380 if sub.lower() in POS_TITLES: 

3381 seq = [ 

3382 language, 

3383 sub, 

3384 TRANSLATIONS_TITLE, 

3385 ] 

3386 subnode = get_subpage_section( 

3387 wxr.wtp.title, 

3388 TRANSLATIONS_TITLE, 

3389 [seq], 

3390 ) 

3391 if subnode is not None and isinstance( 

3392 subnode, WikiNode 

3393 ): 

3394 parse_translations(data, subnode) 

3395 else: 

3396 wxr.wtp.error( 

3397 "/translations link outside part-of-speech" 

3398 ) 

3399 

3400 if ( 

3401 len(arg0) >= 1 

3402 and isinstance(arg0[0], str) 

3403 and not arg0[0].lower().startswith("category:") 

3404 ): 

3405 for x in node.largs[-1]: 

3406 if isinstance(x, str): 3406 ↛ 3409line 3406 didn't jump to line 3409 because the condition on line 3406 was always true

3407 sense_parts.append(x) 

3408 else: 

3409 parse_translation_recurse(x) 

3410 elif not sense: 

3411 sense_parts.append(node) 

3412 else: 

3413 wxr.wtp.debug( 

3414 "skipping text between translation items/senses: " 

3415 "{}".format(node), 

3416 sortid="page/2621", 

3417 ) 

3418 

3419 # Main code of parse_translations(). We want ``sense`` to be assigned 

3420 # regardless of recursion levels, and thus the code is structured 

3421 # to define it at this level and recurse in parse_translation_recurse(). 

3422 parse_translation_recurse(xlatnode) 

3423 

3424 def parse_etymology(data: WordData, node: WikiNode) -> None: 

3425 """Parses an etymology section.""" 

3426 assert isinstance(data, dict) 

3427 assert isinstance(node, WikiNode) 

3428 

3429 templates: list[TemplateData] = [] 

3430 

3431 # Counter for preventing the capture of etymology templates 

3432 # when we are inside templates that we want to ignore (i.e., 

3433 # not capture). 

3434 ignore_count = 0 

3435 

3436 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3437 nonlocal ignore_count 

3438 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3439 return "" 

3440 if re.match(ignored_etymology_templates_re, name): 

3441 ignore_count += 1 

3442 return None 

3443 

3444 # CONTINUE_HERE 

3445 

3446 def etym_post_template_fn( 

3447 name: str, ht: TemplateArgs, expansion: str 

3448 ) -> None: 

3449 nonlocal ignore_count 

3450 if name in wikipedia_templates: 

3451 parse_wikipedia_template(wxr, data, ht) 

3452 return None 

3453 if re.match(ignored_etymology_templates_re, name): 

3454 ignore_count -= 1 

3455 return None 

3456 if ignore_count == 0: 3456 ↛ 3462line 3456 didn't jump to line 3462 because the condition on line 3456 was always true

3457 ht = clean_template_args(wxr, ht) 

3458 expansion = clean_node(wxr, None, expansion) 

3459 templates.append( 

3460 {"name": name, "args": ht, "expansion": expansion} 

3461 ) 

3462 return None 

3463 

3464 # Remove any subsections 

3465 contents = list( 

3466 x 

3467 for x in node.children 

3468 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3469 ) 

3470 # Convert to text, also capturing templates using post_template_fn 

3471 text = clean_node( 

3472 wxr, 

3473 None, 

3474 contents, 

3475 template_fn=etym_template_fn, 

3476 post_template_fn=etym_post_template_fn, 

3477 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3478 # Save the collected information. 

3479 if len(text) > 0: 

3480 data["etymology_text"] = text 

3481 if len(templates) > 0: 

3482 # Some etymology templates, like Template:root do not generate 

3483 # text, so they should be added here. Elsewhere, we check 

3484 # for Template:root and add some text to the expansion to please 

3485 # the validation. 

3486 data["etymology_templates"] = templates 

3487 

3488 for child_node in node.find_child_recursively( 3488 ↛ exitline 3488 didn't return from function 'parse_etymology' because the loop on line 3488 didn't complete

3489 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3490 ): 

3491 if child_node.kind in LEVEL_KIND_FLAGS: 

3492 break 

3493 elif isinstance( 3493 ↛ 3496line 3493 didn't jump to line 3496 because the condition on line 3493 was never true

3494 child_node, TemplateNode 

3495 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3496 if "etymology_examples" not in data: 

3497 data["etymology_examples"] = [] 

3498 data["etymology_examples"].extend( 

3499 extract_template_zh_x( 

3500 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3501 ) 

3502 ) 

3503 
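# Illustrative sketch (not part of the original source): for a typical entry the
# fields filled in by parse_etymology() above end up roughly as
#   data["etymology_text"] = "From Middle English ..."
#   data["etymology_templates"] = [{"name": ..., "args": ..., "expansion": ...}, ...]
#   data["etymology_examples"] = [...]   # only when zh-x / zh-q templates occur
# The sample values are hypothetical; only the field names come from the code above.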

3504 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3505 """This recurses into a subtree in the parse tree for a page.""" 

3506 nonlocal etym_data 

3507 nonlocal pos_data 

3508 nonlocal inside_level_four 

3509 

3510 redirect_list: list[str] = [] # for `zh-see` template 

3511 

3512 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3513 """This is called for otherwise unprocessed parts of the page. 

3514 We still expand them so that e.g. Category links get captured.""" 

3515 if name in wikipedia_templates: 

3516 data = select_data() 

3517 parse_wikipedia_template(wxr, data, ht) 

3518 return None 

3519 if is_panel_template(wxr, name): 

3520 return "" 

3521 return None 

3522 

3523 for node in treenode.children: 

3524 if not isinstance(node, WikiNode): 

3525 # print(" X{}".format(repr(node)[:40])) 

3526 continue 

3527 if isinstance(node, TemplateNode): 

3528 if process_soft_redirect_template(wxr, node, redirect_list): 

3529 continue 

3530 elif node.template_name == "zh-forms": 

3531 extract_zh_forms_template(wxr, node, select_data()) 

3532 

3533 if node.kind not in LEVEL_KINDS: 

3534 # XXX handle e.g. wikipedia links at the top of a language 

3535 # XXX should at least capture "also" at top of page 

3536 if node.kind in ( 

3537 NodeKind.HLINE, 

3538 NodeKind.LIST, 

3539 NodeKind.LIST_ITEM, 

3540 ): 

3541 continue 

3542 # print(" UNEXPECTED: {}".format(node)) 

3543 # Clean the node to collect category links 

3544 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3545 continue 

3546 t = clean_node( 

3547 wxr, etym_data, node.sarg if node.sarg else node.largs 

3548 ) 

3549 t = t.lower() 

3550 # XXX these counts were never implemented fully, and even this 

3551 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3552 wxr.config.section_counts[t] += 1 

3553 # print("PROCESS_CHILDREN: T:", repr(t)) 

3554 if t in IGNORED_TITLES: 

3555 pass 

3556 elif t.startswith(PRONUNCIATION_TITLE): 

3557 # Chinese Pronunciation section kludge; we demote these to 

3558 # be level 4 instead of 3 so that they're part of a larger 

3559 # etymology hierarchy; usually the data here is empty and 

3560 # acts as an in-between layer between POS and Etymology data 

3561 inside_level_four = True 

3562 if t.startswith(PRONUNCIATION_TITLE + " "): 

3563 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3564 # and each of them may have senses under Definition 

3565 push_level_four_section(True) 

3566 wxr.wtp.start_subsection(None) 

3567 if wxr.config.capture_pronunciation: 3567 ↛ 3649line 3567 didn't jump to line 3649 because the condition on line 3567 was always true

3568 data = select_data() 

3569 parse_pronunciation( 

3570 wxr, 

3571 node, 

3572 data, 

3573 etym_data, 

3574 have_etym, 

3575 base_data, 

3576 lang_code, 

3577 ) 

3578 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3579 push_etym() 

3580 wxr.wtp.start_subsection(None) 

3581 if wxr.config.capture_etymologies: 3581 ↛ 3649line 3581 didn't jump to line 3649 because the condition on line 3581 was always true

3582 m = re.search(r"\s(\d+)$", t) 

3583 if m: 

3584 etym_data["etymology_number"] = int(m.group(1)) 

3585 parse_etymology(etym_data, node) 

3586 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3587 data = select_data() 

3588 extract_descendant_section(wxr, data, node, False) 

3589 elif ( 

3590 t in PROTO_ROOT_DERIVED_TITLES 

3591 and pos == "root" 

3592 and is_reconstruction 

3593 and wxr.config.capture_descendants 

3594 ): 

3595 data = select_data() 

3596 extract_descendant_section(wxr, data, node, True) 

3597 elif t == TRANSLATIONS_TITLE: 

3598 data = select_data() 

3599 parse_translations(data, node) 

3600 elif t in INFLECTION_TITLES: 

3601 parse_inflection(node, t, pos) 

3602 elif t == "alternative forms": 

3603 extract_alt_form_section(wxr, select_data(), node) 

3604 else: 

3605 lst = t.split() 

3606 while len(lst) > 1 and lst[-1].isdigit(): 3606 ↛ 3607line 3606 didn't jump to line 3607 because the condition on line 3606 was never true

3607 lst = lst[:-1] 

3608 t_no_number = " ".join(lst).lower() 

3609 if t_no_number in POS_TITLES: 

3610 push_pos() 

3611 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3612 pos = dt["pos"] or "MISSING_POS" 

3613 wxr.wtp.start_subsection(t) 

3614 if "debug" in dt: 

3615 wxr.wtp.debug( 

3616 "{} in section {}".format(dt["debug"], t), 

3617 sortid="page/2755", 

3618 ) 

3619 if "warning" in dt: 3619 ↛ 3620line 3619 didn't jump to line 3620 because the condition on line 3619 was never true

3620 wxr.wtp.warning( 

3621 "{} in section {}".format(dt["warning"], t), 

3622 sortid="page/2759", 

3623 ) 

3624 if "error" in dt: 3624 ↛ 3625line 3624 didn't jump to line 3625 because the condition on line 3624 was never true

3625 wxr.wtp.error( 

3626 "{} in section {}".format(dt["error"], t), 

3627 sortid="page/2763", 

3628 ) 

3629 # Parse word senses for the part-of-speech 

3630 parse_part_of_speech(node, pos) 

3631 if "tags" in dt: 

3632 for pdata in sense_datas: 

3633 data_extend(pdata, "tags", dt["tags"]) 

3634 elif t_no_number in LINKAGE_TITLES: 

3635 # print(f"LINKAGE_TITLES NODE {node=}") 

3636 rel = LINKAGE_TITLES[t_no_number] 

3637 data = select_data() 

3638 parse_linkage(data, rel, node) 

3639 elif t_no_number == COMPOUNDS_TITLE: 

3640 data = select_data() 

3641 if wxr.config.capture_compounds: 3641 ↛ 3649line 3641 didn't jump to line 3649 because the condition on line 3641 was always true

3642 parse_linkage(data, "derived", node) 

3643 

3644 # XXX parse interesting templates also from other sections. E.g., 

3645 # {{Letter|...}} in ===See also=== 

3646 # Also <gallery> 

3647 

3648 # Recurse to children of this node, processing subtitles therein 

3649 stack.append(t) 

3650 process_children(node, pos) 

3651 stack.pop() 

3652 

3653 if len(redirect_list) > 0: 

3654 if len(pos_data) > 0: 

3655 pos_data["redirects"] = redirect_list 

3656 if "pos" not in pos_data: 3656 ↛ 3657line 3656 didn't jump to line 3657 because the condition on line 3656 was never true

3657 pos_data["pos"] = "soft-redirect" 

3658 else: 

3659 new_page_data = copy.deepcopy(base_data) 

3660 new_page_data["redirects"] = redirect_list 

3661 if "pos" not in new_page_data: 3661 ↛ 3663line 3661 didn't jump to line 3663 because the condition on line 3661 was always true

3662 new_page_data["pos"] = "soft-redirect" 

3663 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3664 page_datas.append(new_page_data) 

3665 

3666 def extract_examples( 

3667 others: list[WikiNode], sense_base: SenseData 

3668 ) -> list[ExampleData]: 

3669 """Parses through a list of definitions and quotes to find examples. 

3670 Returns a list of example dicts to be added to sense data. Adds 

3671 meta-data, mostly categories, into sense_base.""" 

3672 assert isinstance(others, list) 

3673 examples: list[ExampleData] = [] 

3674 

3675 for sub in others: 

3676 if not sub.sarg.endswith((":", "*")): 3676 ↛ 3677line 3676 didn't jump to line 3677 because the condition on line 3676 was never true

3677 continue 

3678 for item in sub.children: 

3679 if not isinstance(item, WikiNode): 3679 ↛ 3680line 3679 didn't jump to line 3680 because the condition on line 3679 was never true

3680 continue 

3681 if item.kind != NodeKind.LIST_ITEM: 3681 ↛ 3682line 3681 didn't jump to line 3682 because the condition on line 3681 was never true

3682 continue 

3683 usex_type = None 

3684 example_template_args = [] 

3685 example_template_names = [] 

3686 taxons = set() 

3687 

3688 # Bypass this function when parsing Chinese, Japanese and 

3689 # quotation templates. 

3690 new_example_lists = extract_example_list_item( 

3691 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3692 ) 

3693 if len(new_example_lists) > 0: 

3694 examples.extend(new_example_lists) 

3695 continue 

3696 

3697 def usex_template_fn( 

3698 name: str, ht: TemplateArgs 

3699 ) -> Optional[str]: 

3700 nonlocal usex_type 

3701 if is_panel_template(wxr, name): 

3702 return "" 

3703 if name in usex_templates: 

3704 usex_type = "example" 

3705 example_template_args.append(ht) 

3706 example_template_names.append(name) 

3707 elif name in quotation_templates: 

3708 usex_type = "quotation" 

3709 elif name in taxonomy_templates: 3709 ↛ 3710line 3709 didn't jump to line 3710 because the condition on line 3709 was never true

3710 taxons.update(ht.get(1, "").split()) 

3711 for prefix in template_linkages_to_ignore_in_examples: 

3712 if re.search( 

3713 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3714 ): 

3715 return "" 

3716 return None 

3717 

3718 # bookmark 

3719 ruby: list[tuple[str, str]] = [] 

3720 contents = item.children 

3721 if lang_code == "ja": 

3722 # Capture ruby contents if this is a Japanese language 

3723 # example. 

3724 # print(contents) 

3725 if ( 3725 ↛ 3730line 3725 didn't jump to line 3730 because the condition on line 3725 was never true

3726 contents 

3727 and isinstance(contents[0], str) 

3728 and re.match(r"\s*$", contents[0]) 

3729 ): 

3730 contents = contents[1:] 

3731 exp = wxr.wtp.parse( 

3732 wxr.wtp.node_to_wikitext(contents), 

3733 # post_template_fn=head_post_template_fn, 

3734 expand_all=True, 

3735 ) 

3736 rub, rest = extract_ruby(wxr, exp.children) 

3737 if rub: 

3738 for rtup in rub: 

3739 ruby.append(rtup) 

3740 contents = rest 

3741 subtext = clean_node( 

3742 wxr, sense_base, contents, template_fn=usex_template_fn 

3743 ) 

3744 

3745 frozen_taxons = frozenset(taxons) 

3746 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3747 

3748 # print(f"{subtext=}") 

3749 subtext = re.sub( 

3750 r"\s*\(please add an English " 

3751 r"translation of this " 

3752 r"(example|usage example|quote)\)", 

3753 "", 

3754 subtext, 

3755 ).strip() 

3756 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3757 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3758 # print("subtext:", repr(subtext)) 

3759 

3760 lines = subtext.splitlines() 

3761 # print(lines) 

3762 

3763 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3764 lines = list( 

3765 x 

3766 for x in lines 

3767 if not re.match( 

3768 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3769 r"Synonym: |Antonym: |Hyponym: |" 

3770 r"Hypernyms: |Derived terms: |" 

3771 r"Related terms: |" 

3772 r"Hypernym: |Derived term: |" 

3773 r"Coordinate terms:|" 

3774 r"Related term: |" 

3775 r"For more quotations using )", 

3776 x, 

3777 ) 

3778 ) 

3779 tr = "" 

3780 ref = "" 

3781 roman = "" 

3782 # for line in lines: 

3783 # print("LINE:", repr(line)) 

3784 # print(classify_desc(line)) 

3785 if len(lines) == 1 and lang_code != "en": 

3786 parts = example_splitter_re.split(lines[0]) 

3787 if ( 3787 ↛ 3795line 3787 didn't jump to line 3795 because the condition on line 3787 was never true

3788 len(parts) > 2 

3789 and len(example_template_args) == 1 

3790 and any( 

3791 ("―" in s) or ("—" in s) 

3792 for s in example_template_args[0].values() 

3793 ) 

3794 ): 

3795 if nparts := synch_splits_with_args( 

3796 lines[0], example_template_args[0] 

3797 ): 

3798 parts = nparts 

3799 if ( 3799 ↛ 3804line 3799 didn't jump to line 3804 because the condition on line 3799 was never true

3800 len(example_template_args) == 1 

3801 and "lit" in example_template_args[0] 

3802 ): 

3803 # ugly brute-force kludge in case there's a lit= arg 

3804 literally = example_template_args[0].get("lit", "") 

3805 if literally: 

3806 literally = ( 

3807 " (literally, “" 

3808 + clean_value(wxr, literally) 

3809 + "”)" 

3810 ) 

3811 else: 

3812 literally = "" 

3813 if ( 3813 ↛ 3852line 3813 didn't jump to line 3852 because the condition on line 3813 was never true

3814 len(example_template_args) == 1 

3815 and len(parts) == 2 

3816 and len(example_template_args[0]) 

3817 - ( 

3818 # horrible kludge to ignore these arguments 

3819 # when calculating how many there are 

3820 sum( 

3821 s in example_template_args[0] 

3822 for s in ( 

3823 "lit", # generates text, but we handle it 

3824 "inline", 

3825 "noenum", 

3826 "nocat", 

3827 "sort", 

3828 ) 

3829 ) 

3830 ) 

3831 == 3 

3832 and clean_value( 

3833 wxr, example_template_args[0].get(2, "") 

3834 ) 

3835 == parts[0].strip() 

3836 and clean_value( 

3837 wxr, 

3838 ( 

3839 example_template_args[0].get(3) 

3840 or example_template_args[0].get("translation") 

3841 or example_template_args[0].get("t", "") 

3842 ) 

3843 + literally, # in case there's a lit= argument 

3844 ) 

3845 == parts[1].strip() 

3846 ): 

3847 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3848 # is a pretty reliable 'heuristic', so we use it here 

3849 # before the others. To be extra sure the template 

3850 # doesn't do anything weird, we compare the arguments 

3851 # and the output to each other. 

3852 lines = [parts[0].strip()] 

3853 tr = parts[1].strip() 

3854 elif ( 

3855 len(parts) == 2 

3856 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3857 ): 

3858 # These other branches just do some simple heuristics w/ 

3859 # the expanded output of the template (if applicable). 

3860 lines = [parts[0].strip()] 

3861 tr = parts[1].strip() 

3862 elif ( 3862 ↛ 3868line 3862 didn't jump to line 3868 because the condition on line 3862 was never true

3863 len(parts) == 3 

3864 and classify_desc2(parts[1]) 

3865 in ("romanization", "english") 

3866 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3867 ): 

3868 lines = [parts[0].strip()] 

3869 roman = parts[1].strip() 

3870 tr = parts[2].strip() 

3871 else: 

3872 parts = re.split(r"\s+-\s+", lines[0]) 

3873 if ( 3873 ↛ 3877line 3873 didn't jump to line 3877 because the condition on line 3873 was never true

3874 len(parts) == 2 

3875 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3876 ): 

3877 lines = [parts[0].strip()] 

3878 tr = parts[1].strip() 

3879 elif len(lines) > 1: 

3880 if any( 

3881 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3882 ) and not (len(example_template_names) == 1): 

3883 refs: list[str] = [] 

3884 for i in range(len(lines)): 3884 ↛ 3890line 3884 didn't jump to line 3890 because the loop on line 3884 didn't complete

3885 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3885 ↛ 3886line 3885 didn't jump to line 3886 because the condition on line 3885 was never true

3886 break 

3887 refs.append(lines[i].strip()) 

3888 if re.search(r"[]\d:)]\s*$", lines[i]): 

3889 break 

3890 ref = " ".join(refs) 

3891 lines = lines[i + 1 :] 

3892 if ( 

3893 lang_code != "en" 

3894 and len(lines) >= 2 

3895 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3896 ): 

3897 i = len(lines) - 1 

3898 while ( 3898 ↛ 3903line 3898 didn't jump to line 3903 because the condition on line 3898 was never true

3899 i > 1 

3900 and classify_desc2(lines[i - 1]) 

3901 in ENGLISH_TEXTS 

3902 ): 

3903 i -= 1 

3904 tr = "\n".join(lines[i:]) 

3905 lines = lines[:i] 

3906 if len(lines) >= 2: 

3907 if classify_desc2(lines[-1]) == "romanization": 

3908 roman = lines[-1].strip() 

3909 lines = lines[:-1] 

3910 

3911 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

3912 ref = lines[0] 

3913 lines = lines[1:] 

3914 elif lang_code != "en" and len(lines) == 2: 

3915 cls1 = classify_desc2(lines[0]) 

3916 cls2 = classify_desc2(lines[1]) 

3917 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3918 tr = lines[1] 

3919 lines = [lines[0]] 

3920 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3920 ↛ 3921line 3920 didn't jump to line 3921 because the condition on line 3920 was never true

3921 tr = lines[0] 

3922 lines = [lines[1]] 

3923 elif ( 3923 ↛ 3930line 3923 didn't jump to line 3930 because the condition on line 3923 was never true

3924 re.match(r"^[#*]*:+", lines[1]) 

3925 and classify_desc2( 

3926 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3927 ) 

3928 in ENGLISH_TEXTS 

3929 ): 

3930 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3931 lines = [lines[0]] 

3932 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3933 # Both were classified as English, but 

3934 # presumably one is not. Assume first is 

3935 # non-English, as that seems more common. 

3936 tr = lines[1] 

3937 lines = [lines[0]] 

3938 elif ( 

3939 usex_type != "quotation" 

3940 and lang_code != "en" 

3941 and len(lines) == 3 

3942 ): 

3943 cls1 = classify_desc2(lines[0]) 

3944 cls2 = classify_desc2(lines[1]) 

3945 cls3 = classify_desc2(lines[2]) 

3946 if ( 

3947 cls3 == "english" 

3948 and cls2 in ("english", "romanization") 

3949 and cls1 != "english" 

3950 ): 

3951 tr = lines[2].strip() 

3952 roman = lines[1].strip() 

3953 lines = [lines[0].strip()] 

3954 elif ( 3954 ↛ 3962line 3954 didn't jump to line 3962 because the condition on line 3954 was never true

3955 usex_type == "quotation" 

3956 and lang_code != "en" 

3957 and len(lines) > 2 

3958 ): 

3959 # for x in lines: 

3960 # print(" LINE: {}: {}" 

3961 # .format(classify_desc2(x), x)) 

3962 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3963 ref = lines[0] 

3964 lines = lines[2:] 

3965 cls1 = classify_desc2(lines[-1]) 

3966 if cls1 == "english": 

3967 i = len(lines) - 1 

3968 while ( 

3969 i > 1 

3970 and classify_desc2(lines[i - 1]) 

3971 in ENGLISH_TEXTS 

3972 ): 

3973 i -= 1 

3974 tr = "\n".join(lines[i:]) 

3975 lines = lines[:i] 

3976 

3977 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3978 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3979 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3980 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3981 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3982 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3983 ref = re.sub( 

3984 r", (volume |number |page )?“?" 

3985 r"\(please specify ([^)]|\(s\))*\)”?|" 

3986 ", text here$", 

3987 "", 

3988 ref, 

3989 ) 

3990 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3991 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3992 subtext = "\n".join(x for x in lines if x) 

3993 if not tr and lang_code != "en": 

3994 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3995 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3995 ↛ 3996line 3995 didn't jump to line 3996 because the condition on line 3995 was never true

3996 tr = m.group(2) 

3997 subtext = subtext[: m.start()] + m.group(1) 

3998 elif lines: 

3999 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

4000 if ( 4000 ↛ 4004line 4000 didn't jump to line 4004 because the condition on line 4000 was never true

4001 len(parts) == 2 

4002 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

4003 ): 

4004 subtext = parts[0].strip() 

4005 tr = parts[1].strip() 

4006 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

4007 subtext = re.sub( 

4008 r"(please add an English translation of " 

4009 r"this (quote|usage example))", 

4010 "", 

4011 subtext, 

4012 ) 

4013 subtext = re.sub( 

4014 r"\s*→New International Version " "translation$", 

4015 "", 

4016 subtext, 

4017 ) # e.g. pis/Tok Pisin (Bible) 

4018 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

4019 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

4020 note = None 

4021 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

4022 if ( 4022 ↛ 4030line 4022 didn't jump to line 4030 because the condition on line 4022 was never true

4023 m is not None 

4024 and lang_code != "en" 

4025 and ( 

4026 m.group(1).startswith("with ") 

4027 or classify_desc2(m.group(1)) == "english" 

4028 ) 

4029 ): 

4030 note = m.group(1) 

4031 subtext = subtext[m.end() :] 

4032 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

4033 ref = re.sub(r",\s*→ISBN", "", ref) 

4034 ref = ref.strip() 

4035 if ref.endswith(":") or ref.endswith(","): 

4036 ref = ref[:-1].strip() 

4037 ref = re.sub(r"\s+,\s+", ", ", ref) 

4038 ref = re.sub(r"\s+", " ", ref) 

4039 if ref and not subtext: 4039 ↛ 4040line 4039 didn't jump to line 4040 because the condition on line 4039 was never true

4040 subtext = ref 

4041 ref = "" 

4042 if subtext: 

4043 dt: ExampleData = {"text": subtext} 

4044 if ref: 

4045 dt["ref"] = ref 

4046 if tr: 

4047 dt["english"] = tr # DEPRECATED for "translation" 

4048 dt["translation"] = tr 

4049 if usex_type: 

4050 dt["type"] = usex_type 

4051 if note: 4051 ↛ 4052line 4051 didn't jump to line 4052 because the condition on line 4051 was never true

4052 dt["note"] = note 

4053 if roman: 

4054 dt["roman"] = roman 

4055 if ruby: 

4056 dt["ruby"] = ruby 

4057 examples.append(dt) 

4058 

4059 return examples 

4060 
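# Illustrative sketch (not part of the original source): each ExampleData dict
# appended above has roughly the shape
#   {"text": ..., "ref": ..., "english": ..., "translation": ...,
#    "type": "example" or "quotation", "note": ..., "roman": ..., "ruby": [...]}
# where every key except "text" is optional and only set when non-empty; the
# field names come from the code above, the combination shown is hypothetical.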

4061 # Main code of parse_language() 

4062 # Process the section 

4063 stack.append(language) 

4064 process_children(langnode, None) 

4065 stack.pop() 

4066 

4067 # Finalize word entries 

4068 push_etym() 

4069 ret = [] 

4070 for data in page_datas: 

4071 merge_base(data, base_data) 

4072 ret.append(data) 

4073 

4074 # Copy all tags to word senses 

4075 for data in ret: 

4076 if "senses" not in data: 4076 ↛ 4077line 4076 didn't jump to line 4077 because the condition on line 4076 was never true

4077 continue 

4078 # WordData should not have a 'tags' field, but if it does, it is 

4079 # deleted and its contents are copied into each sense; 

4080 # that's why the type-ignore comments below. 

4081 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

4082 if "tags" in data: 

4083 del data["tags"] # type: ignore[typeddict-item] 

4084 for sense in data["senses"]: 

4085 data_extend(sense, "tags", tags) 

4086 

4087 return ret 

4088 

4089 

4090def parse_wikipedia_template( 

4091 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

4092) -> None: 

4093 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

4094 assert isinstance(wxr, WiktextractContext) 

4095 assert isinstance(data, dict) 

4096 assert isinstance(ht, dict) 

4097 langid = clean_node(wxr, data, ht.get("lang", ())) 

4098 pagename = ( 

4099 clean_node(wxr, data, ht.get(1, ())) 

4100 or wxr.wtp.title 

4101 or "MISSING_PAGE_TITLE" 

4102 ) 

4103 if langid: 

4104 data_append(data, "wikipedia", langid + ":" + pagename) 

4105 else: 

4106 data_append(data, "wikipedia", pagename) 

4107 
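# Illustrative sketch (not part of the original source), assuming a page titled
# "Helsinki" that contains {{wikipedia|lang=fi}}: langid becomes "fi", pagename
# falls back to the page title, and "fi:Helsinki" is appended to
# data["wikipedia"]; without lang= only the bare page name would be appended.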

4108 

4109def parse_top_template( 

4110 wxr: WiktextractContext, node: WikiNode, data: WordData 

4111) -> None: 

4112 """Parses a template that occurs on the top-level in a page, before any 

4113 language subtitles.""" 

4114 assert isinstance(wxr, WiktextractContext) 

4115 assert isinstance(node, WikiNode) 

4116 assert isinstance(data, dict) 

4117 

4118 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

4119 if name in wikipedia_templates: 

4120 parse_wikipedia_template(wxr, data, ht) 

4121 return None 

4122 if is_panel_template(wxr, name): 

4123 return "" 

4124 if name in ("reconstruction",): 4124 ↛ 4125line 4124 didn't jump to line 4125 because the condition on line 4124 was never true

4125 return "" 

4126 if name.lower() == "also" or name.lower().startswith("also/"): 

4127 # XXX shows related words that might really have been the intended 

4128 # word, capture them 

4129 return "" 

4130 if name == "see also": 4130 ↛ 4132line 4130 didn't jump to line 4132 because the condition on line 4130 was never true

4131 # XXX capture 

4132 return "" 

4133 if name == "cardinalbox": 4133 ↛ 4135line 4133 didn't jump to line 4135 because the condition on line 4133 was never true

4134 # XXX capture 

4135 return "" 

4136 if name == "character info": 4136 ↛ 4138line 4136 didn't jump to line 4138 because the condition on line 4136 was never true

4137 # XXX capture 

4138 return "" 

4139 if name == "commonscat": 4139 ↛ 4141line 4139 didn't jump to line 4141 because the condition on line 4139 was never true

4140 # XXX capture link to Wikimedia commons 

4141 return "" 

4142 if name == "wrongtitle": 4142 ↛ 4145line 4142 didn't jump to line 4145 because the condition on line 4142 was never true

4143 # XXX this should be captured to replace page title with the 

4144 # correct title. E.g. ⿰亻革家 

4145 return "" 

4146 if name == "wikidata": 4146 ↛ 4147line 4146 didn't jump to line 4147 because the condition on line 4146 was never true

4147 arg = clean_node(wxr, data, ht.get(1, ())) 

4148 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

4149 data_append(data, "wikidata", arg) 

4150 return "" 

4151 wxr.wtp.debug( 

4152 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

4153 sortid="page/2870", 

4154 ) 

4155 return "" 

4156 

4157 clean_node(wxr, None, [node], template_fn=top_template_fn) 

4158 
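# Illustrative sketch (not part of the original source): a top-level
# {{wikidata|Q5}} would append "Q5" to data["wikidata"]; the various panel,
# "also", "cardinalbox" and "character info" templates are expanded to "" so
# they leave no text behind, and anything unrecognized is reported as an
# UNIMPLEMENTED top-level template.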

4159 

4160def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

4161 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

4162 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

4163 that are next to each other.""" 

4164 

4165 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

4166 # section get overwritten. In this case, let's just combine the two. 

4167 

4168 # In Chinese entries, Pronunciation can be preceded on the 

4169 # same level 3 by its Etymology *and* Glyph Origin sections: 

4170 # ===Glyph Origin=== 

4171 # ===Etymology=== 

4172 # ===Pronunciation=== 

4173 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

4174 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

4175 # are now level 6 

4176 

4177 # Known lowercase PoS names are in part_of_speech_map 

4178 # Known lowercase linkage section names are in linkage_map 

4179 

4180 old = re.split( 

4181 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

4182 ) 

4183 

4184 parts = [] 

4185 npar = 4 # Number of parentheses in above expression 

4186 parts.append(old[0]) 

4187 prev_level = None 

4188 level = None 

4189 skip_level_title = False # When combining etymology sections 

4190 for i in range(1, len(old), npar + 1): 

4191 left = old[i] 

4192 right = old[i + npar - 1] 

4193 # remove Wikilinks in title 

4194 title = re.sub(r"^\[\[", "", old[i + 1]) 

4195 title = re.sub(r"\]\]$", "", title) 

4196 prev_level = level 

4197 level = len(left) 

4198 part = old[i + npar] 

4199 if level != len(right): 4199 ↛ 4200line 4199 didn't jump to line 4200 because the condition on line 4199 was never true

4200 wxr.wtp.debug( 

4201 "subtitle has unbalanced levels: " 

4202 "{!r} has {} on the left and {} on the right".format( 

4203 title, left, right 

4204 ), 

4205 sortid="page/2904", 

4206 ) 

4207 lc = title.lower() 

4208 if name_to_code(title, "en") != "": 

4209 if level > 2: 4209 ↛ 4210line 4209 didn't jump to line 4210 because the condition on line 4209 was never true

4210 wxr.wtp.debug( 

4211 "subtitle has language name {} at level {}".format( 

4212 title, level 

4213 ), 

4214 sortid="page/2911", 

4215 ) 

4216 level = 2 

4217 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

4218 if level > 3: 4218 ↛ 4219line 4218 didn't jump to line 4219 because the condition on line 4218 was never true

4219 wxr.wtp.debug( 

4220 "etymology section {} at level {}".format(title, level), 

4221 sortid="page/2917", 

4222 ) 

4223 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 

4224 # sections cheek-to-cheek 

4225 skip_level_title = True 

4226 # Modify the title of previous ("Glyph Origin") section, in 

4227 # case we have a meaningful title like "Etymology 1" 

4228 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

4229 level = 3 

4230 elif lc.startswith(PRONUNCIATION_TITLE): 

4231 # Pronunciation is now a level between POS and Etymology, so 

4232 # we need to shift everything down by one 

4233 level = 4 

4234 elif lc in POS_TITLES: 

4235 level = 5 

4236 elif lc == TRANSLATIONS_TITLE: 

4237 level = 6 

4238 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 

4239 level = 6 

4240 elif lc in INFLECTION_TITLES: 

4241 level = 6 

4242 elif lc == DESCENDANTS_TITLE: 

4243 level = 6 

4244 elif title in PROTO_ROOT_DERIVED_TITLES: 4244 ↛ 4245line 4244 didn't jump to line 4245 because the condition on line 4244 was never true

4245 level = 6 

4246 elif lc in IGNORED_TITLES: 

4247 level = 6 

4248 else: 

4249 level = 6 

4250 if skip_level_title: 

4251 skip_level_title = False 

4252 parts.append(part) 

4253 else: 

4254 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

4255 parts.append(part) 

4256 # print("=" * level, title) 

4257 # if level != len(left): 

4258 # print(" FIXED LEVEL OF {} {} -> {}" 

4259 # .format(title, len(left), level)) 

4260 

4261 text = "".join(parts) 

4262 # print(text) 

4263 return text 

4264 

4265 

4266def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

4267 # Skip translation pages 

4268 if word.endswith("/" + TRANSLATIONS_TITLE): 4268 ↛ 4269line 4268 didn't jump to line 4269 because the condition on line 4268 was never true

4269 return [] 

4270 

4271 if wxr.config.verbose: 4271 ↛ 4272line 4271 didn't jump to line 4272 because the condition on line 4271 was never true

4272 logger.info(f"Parsing page: {word}") 

4273 

4274 wxr.config.word = word 

4275 wxr.wtp.start_page(word) 

4276 

4277 # Remove <noinclude> and similar tags from main pages. They 

4278 # should not appear there, but at least net/Elfdala has one and it 

4279 # is probably not the only one. 

4280 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4281 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4282 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4283 

4284 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4285 # pages that have, for example, Translations section under Linkage, or 

4286 # Translations section on the same level as Noun. Enforce a proper 

4287 # hierarchy by manipulating the subtitle levels in certain cases. 

4288 text = fix_subtitle_hierarchy(wxr, text) 

4289 

4290 # Parse the page, pre-expanding those templates that are likely to 

4291 # influence parsing 

4292 tree = wxr.wtp.parse( 

4293 text, 

4294 pre_expand=True, 

4295 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4296 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4297 ) 

4298 # from wikitextprocessor.parser import print_tree 

4299 # print("PAGE PARSE:", print_tree(tree)) 

4300 

4301 top_data: WordData = {} 

4302 

4303 # Iterate over top-level titles, which should be languages for normal 

4304 # pages 

4305 by_lang = defaultdict(list) 

4306 for langnode in tree.children: 

4307 if not isinstance(langnode, WikiNode): 

4308 continue 

4309 if langnode.kind == NodeKind.TEMPLATE: 

4310 parse_top_template(wxr, langnode, top_data) 

4311 continue 

4312 if langnode.kind == NodeKind.LINK: 

4313 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4314 continue 

4315 if langnode.kind != NodeKind.LEVEL2: 4315 ↛ 4316line 4315 didn't jump to line 4316 because the condition on line 4315 was never true

4316 wxr.wtp.debug( 

4317 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4318 ) 

4319 continue 

4320 lang = clean_node( 

4321 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4322 ) 

4323 lang_code = name_to_code(lang, "en") 

4324 if lang_code == "": 4324 ↛ 4325line 4324 didn't jump to line 4325 because the condition on line 4324 was never true

4325 wxr.wtp.debug( 

4326 f"unrecognized language name: {lang}", sortid="page/3019" 

4327 ) 

4328 if ( 

4329 wxr.config.capture_language_codes 

4330 and lang_code not in wxr.config.capture_language_codes 

4331 ): 

4332 continue 

4333 wxr.wtp.start_section(lang) 

4334 

4335 # Collect all words from the page. 

4336 # print(f"{langnode=}") 

4337 datas = parse_language(wxr, langnode, lang, lang_code) 

4338 

4339 # Propagate fields resulting from top-level templates to this 

4340 # part-of-speech. 

4341 for data in datas: 

4342 if "lang" not in data: 4342 ↛ 4343line 4342 didn't jump to line 4343 because the condition on line 4342 was never true

4343 wxr.wtp.debug( 

4344 "internal error -- no lang in data: {}".format(data), 

4345 sortid="page/3034", 

4346 ) 

4347 continue 

4348 for k, v in top_data.items(): 

4349 assert isinstance(v, (list, tuple)) 

4350 data_extend(data, k, v) 

4351 by_lang[data["lang"]].append(data) 

4352 

4353 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4354 # field. FIX OR REMOVE. 

4355 # Do some post-processing on the words. For example, we may distribute 

4356 # conjugation information to all the words. 

4357 ret = [] 

4358 for lang, lang_datas in by_lang.items(): 

4359 ret.extend(lang_datas) 

4360 

4361 for x in ret: 

4362 if x["word"] != word: 

4363 if word.startswith("Unsupported titles/"): 

4364 wxr.wtp.debug( 

4365 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4366 sortid="20231101/3578page.py", 

4367 ) 

4368 else: 

4369 wxr.wtp.debug( 

4370 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'", 

4371 sortid="20231101/3582page.py", 

4372 ) 

4373 x["original_title"] = word 

4374 # validate tag data 

4375 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4376 return ret 

4377 

4378 

4379def recursively_separate_raw_tags( 

4380 wxr: WiktextractContext, data: dict[str, Any] 

4381) -> None: 

4382 if not isinstance(data, dict): 4382 ↛ 4383line 4382 didn't jump to line 4383 because the condition on line 4382 was never true

4383 wxr.wtp.error( 

4384 "'data' is not dict; most probably " 

4385 "data has a list that contains at least one dict and " 

4386 "at least one non-dict item", 

4387 sortid="en/page-4016/20240419", 

4388 ) 

4389 return 

4390 new_tags: list[str] = [] 

4391 raw_tags: list[str] = data.get("raw_tags", []) 

4392 for field, val in data.items(): 

4393 if field == "tags": 

4394 for tag in val: 

4395 if tag not in valid_tags: 

4396 raw_tags.append(tag) 

4397 else: 

4398 new_tags.append(tag) 

4399 if isinstance(val, list): 

4400 if len(val) > 0 and isinstance(val[0], dict): 

4401 for d in val: 

4402 recursively_separate_raw_tags(wxr, d) 

4403 if "tags" in data and not new_tags: 

4404 del data["tags"] 

4405 elif new_tags: 

4406 data["tags"] = new_tags 

4407 if raw_tags: 

4408 data["raw_tags"] = raw_tags 

4409 
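# Illustrative sketch (not part of the original source): given a dict such as
#   {"word": ..., "tags": ["masculine", "Hán Nôm"], "senses": [...]}
# and assuming "Hán Nôm" is not in valid_tags, the call rewrites it in place to
#   {"word": ..., "tags": ["masculine"], "raw_tags": ["Hán Nôm"], "senses": [...]}
# and recurses into any list-of-dict fields such as "senses" or "forms".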

4410 

4411def process_soft_redirect_template( 

4412 wxr: WiktextractContext, 

4413 template_node: TemplateNode, 

4414 redirect_pages: list[str], 

4415) -> bool: 

4416 # Return `True` if the template is a soft-redirect template. 

4417 if template_node.template_name == "zh-see": 

4418 # https://en.wiktionary.org/wiki/Template:zh-see 

4419 title = clean_node( 

4420 wxr, None, template_node.template_parameters.get(1, "") 

4421 ) 

4422 if title != "": 4422 ↛ 4424line 4422 didn't jump to line 4424 because the condition on line 4422 was always true

4423 redirect_pages.append(title) 

4424 return True 

4425 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4426 # https://en.wiktionary.org/wiki/Template:ja-see 

4427 for key, value in template_node.template_parameters.items(): 

4428 if isinstance(key, int): 4428 ↛ 4427line 4428 didn't jump to line 4427 because the condition on line 4428 was always true

4429 title = clean_node(wxr, None, value) 

4430 if title != "": 4430 ↛ 4427line 4430 didn't jump to line 4427 because the condition on line 4430 was always true

4431 redirect_pages.append(title) 

4432 return True 

4433 return False 

4434 
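# Illustrative sketch (not part of the original source): for a node parsed from
# {{zh-see|...}} the first positional argument is appended to redirect_pages and
# True is returned; {{ja-see|...}} and {{ja-see-kango|...}} append every
# positional argument; any other template returns False and leaves
# redirect_pages untouched.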

4435 

4436ZH_FORMS_TAGS = { 

4437 "trad.": "Traditional-Chinese", 

4438 "simp.": "Simplified-Chinese", 

4439 "alternative forms": "alternative", 

4440} 

4441 

4442 

4443def extract_zh_forms_template( 

4444 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData 

4445): 

4446 # https://en.wiktionary.org/wiki/Template:zh-forms 

4447 lit_meaning = clean_node( 

4448 wxr, None, t_node.template_parameters.get("lit", "") 

4449 ) 

4450 if lit_meaning != "": 4450 ↛ 4451line 4450 didn't jump to line 4451 because the condition on line 4450 was never true

4451 base_data["literal_meaning"] = lit_meaning 

4452 expanded_node = wxr.wtp.parse( 

4453 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

4454 ) 

4455 for table in expanded_node.find_child(NodeKind.TABLE): 

4456 for row in table.find_child(NodeKind.TABLE_ROW): 

4457 row_header = "" 

4458 row_header_tags = [] 

4459 header_has_span = False 

4460 for cell in row.find_child( 

4461 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL 

4462 ): 

4463 if cell.kind == NodeKind.TABLE_HEADER_CELL: 

4464 row_header, row_header_tags, header_has_span = ( 

4465 extract_zh_forms_header_cell(wxr, base_data, cell) 

4466 ) 

4467 elif not header_has_span: 

4468 extract_zh_forms_data_cell( 

4469 wxr, base_data, cell, row_header, row_header_tags 

4470 ) 

4471 

4472 if "forms" in base_data and len(base_data["forms"]) == 0: 4472 ↛ 4473line 4472 didn't jump to line 4473 because the condition on line 4472 was never true

4473 del base_data["forms"] 

4474 

4475 
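# Illustrative sketch (not part of the original source): a zh-forms table row
# headed "trad." or "simp." typically yields form entries such as
#   {"form": ..., "tags": ["Traditional-Chinese"]}
# appended to base_data["forms"], while an "anagram" row is routed to
# base_data["anagrams"] instead (see the two cell handlers below).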

4476def extract_zh_forms_header_cell( 

4477 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode 

4478) -> tuple[str, list[str], bool]: 

4479 row_header = "" 

4480 row_header_tags = [] 

4481 header_has_span = False 

4482 first_span_index = len(header_cell.children) 

4483 for index, span_tag in header_cell.find_html("span", with_index=True): 

4484 if index < first_span_index: 4484 ↛ 4486line 4484 didn't jump to line 4486 because the condition on line 4484 was always true

4485 first_span_index = index 

4486 header_has_span = True 

4487 row_header = clean_node(wxr, None, header_cell.children[:first_span_index]) 

4488 for raw_tag in row_header.split(" and "): 

4489 raw_tag = raw_tag.strip() 

4490 if raw_tag != "": 4490 ↛ 4488line 4490 didn't jump to line 4488 because the condition on line 4490 was always true

4491 row_header_tags.append(raw_tag) 

4492 for span_tag in header_cell.find_html_recursively("span"): 

4493 span_lang = span_tag.attrs.get("lang", "") 

4494 form_nodes = [] 

4495 sup_title = "" 

4496 for node in span_tag.children: 

4497 if isinstance(node, HTMLNode) and node.tag == "sup": 4497 ↛ 4498line 4497 didn't jump to line 4498 because the condition on line 4497 was never true

4498 for sup_span in node.find_html("span"): 

4499 sup_title = sup_span.attrs.get("title", "") 

4500 else: 

4501 form_nodes.append(node) 

4502 if span_lang in ["zh-Hant", "zh-Hans"]: 4502 ↛ 4503line 4502 didn't jump to line 4503 because the condition on line 4502 was never true

4503 for word in clean_node(wxr, None, form_nodes).split("/"): 

4504 if word not in [wxr.wtp.title, ""]: 

4505 form = {"form": word} 

4506 for raw_tag in row_header_tags: 

4507 if raw_tag in ZH_FORMS_TAGS: 

4508 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag]) 

4509 else: 

4510 data_append(form, "raw_tags", raw_tag) 

4511 if sup_title != "": 

4512 data_append(form, "raw_tags", sup_title) 

4513 data_append(base_data, "forms", form) 

4514 return row_header, row_header_tags, header_has_span 

4515 

4516 

4517def extract_zh_forms_data_cell( 

4518 wxr: WiktextractContext, 

4519 base_data: WordData, 

4520 cell: WikiNode, 

4521 row_header: str, 

4522 row_header_tags: list[str], 

4523): 

4524 from .zh_pron_tags import ZH_PRON_TAGS 

4525 

4526 for top_span_tag in cell.find_html("span"): 

4527 forms = [] 

4528 for span_tag in top_span_tag.find_html("span"): 

4529 span_lang = span_tag.attrs.get("lang", "") 

4530 if span_lang in ["zh-Hant", "zh-Hans", "zh"]: 

4531 word = clean_node(wxr, None, span_tag) 

4532 if word not in ["", "/", wxr.wtp.title]: 

4533 form = {"form": word} 

4534 if row_header != "anagram": 4534 ↛ 4542line 4534 didn't jump to line 4542 because the condition on line 4534 was always true

4535 for raw_tag in row_header_tags: 

4536 if raw_tag in ZH_FORMS_TAGS: 4536 ↛ 4541line 4536 didn't jump to line 4541 because the condition on line 4536 was always true

4537 data_append( 

4538 form, "tags", ZH_FORMS_TAGS[raw_tag] 

4539 ) 

4540 else: 

4541 data_append(form, "raw_tags", raw_tag) 

4542 if span_lang == "zh-Hant": 

4543 data_append(form, "tags", "Traditional-Chinese") 

4544 elif span_lang == "zh-Hans": 

4545 data_append(form, "tags", "Simplified-Chinese") 

4546 forms.append(form) 

4547 elif "font-size:80%" in span_tag.attrs.get("style", ""): 4547 ↛ 4528line 4547 didn't jump to line 4528 because the condition on line 4547 was always true

4548 raw_tag = clean_node(wxr, None, span_tag) 

4549 if raw_tag != "": 4549 ↛ 4528line 4549 didn't jump to line 4528 because the condition on line 4549 was always true

4550 for form in forms: 

4551 if raw_tag in ZH_PRON_TAGS: 4551 ↛ 4557line 4551 didn't jump to line 4557 because the condition on line 4551 was always true

4552 tr_tag = ZH_PRON_TAGS[raw_tag] 

4553 if isinstance(tr_tag, list): 4553 ↛ 4554line 4553 didn't jump to line 4554 because the condition on line 4553 was never true

4554 data_extend(form, "tags", tr_tag) 

4555 elif isinstance(tr_tag, str): 4555 ↛ 4550line 4555 didn't jump to line 4550 because the condition on line 4555 was always true

4556 data_append(form, "tags", tr_tag) 

4557 elif raw_tag in valid_tags: 

4558 data_append(form, "tags", raw_tag) 

4559 else: 

4560 data_append(form, "raw_tags", raw_tag) 

4561 

4562 if row_header == "anagram": 4562 ↛ 4563line 4562 didn't jump to line 4563 because the condition on line 4562 was never true

4563 for form in forms: 

4564 l_data = {"word": form["form"]} 

4565 for key in ["tags", "raw_tags"]: 

4566 if key in form: 

4567 l_data[key] = form[key] 

4568 data_append(base_data, "anagrams", l_data) 

4569 else: 

4570 data_extend(base_data, "forms", forms)