Coverage for src/wiktextract/extractor/en/page.py: 76%

1944 statements  

coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8from collections import defaultdict 

9from functools import partial 

10from typing import ( 

11 TYPE_CHECKING, 

12 Any, 

13 Iterable, 

14 Optional, 

15 Set, 

16 Union, 

17 cast, 

18) 

19 

20from mediawiki_langcodes import get_all_names, name_to_code 

21from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

22from wikitextprocessor.parser import ( 

23 LEVEL_KIND_FLAGS, 

24 GeneralNode, 

25 HTMLNode, 

26 LevelNode, 

27 NodeKind, 

28 TemplateNode, 

29 WikiNode, 

30 is_list, 

31 is_list_item, 

32) 

33 

34from ...clean import clean_template_args, clean_value 

35from ...datautils import ( 

36 data_append, 

37 data_extend, 

38 ns_title_prefix_tuple, 

39) 

40from ...page import ( 

41 LEVEL_KINDS, 

42 clean_node, 

43 is_panel_template, 

44 recursively_extract, 

45) 

46from ...tags import valid_tags 

47from ...wxr_context import WiktextractContext 

48from ...wxr_logging import logger 

49from ..ruby import extract_ruby, parse_ruby 

50from ..share import strip_nodes 

51from .descendant import extract_descendant_section 

52from .example import extract_example_list_item, extract_template_zh_x 

53from .form_descriptions import ( 

54 classify_desc, 

55 decode_tags, 

56 distw, 

57 parse_alt_or_inflection_of, 

58 parse_sense_qualifier, 

59 parse_word_head, 

60) 

61from .inflection import TableContext, parse_inflection_section 

62from .info_templates import ( 

63 INFO_TEMPLATE_FUNCS, 

64 parse_info_template_arguments, 

65 parse_info_template_node, 

66) 

67from .linkages import ( 

68 extract_alt_form_section, 

69 extract_zh_dial_template, 

70 parse_linkage_item_text, 

71) 

72from .parts_of_speech import PARTS_OF_SPEECH 

73from .section_titles import ( 

74 COMPOUNDS_TITLE, 

75 DESCENDANTS_TITLE, 

76 ETYMOLOGY_TITLES, 

77 IGNORED_TITLES, 

78 INFLECTION_TITLES, 

79 LINKAGE_TITLES, 

80 POS_TITLES, 

81 PRONUNCIATION_TITLE, 

82 PROTO_ROOT_DERIVED_TITLES, 

83 TRANSLATIONS_TITLE, 

84) 

85from .translations import parse_translation_item_text 

86from .type_utils import ( 

87 AttestationData, 

88 ExampleData, 

89 LinkageData, 

90 ReferenceData, 

91 SenseData, 

92 SoundData, 

93 TemplateData, 

94 WordData, 

95) 

96from .unsupported_titles import unsupported_title_map 

97 

98# When determining whether a string is 'english', classify_desc 

99# might return 'taxonomic' which is English text 99% of the time. 

100ENGLISH_TEXTS = ("english", "taxonomic") 

101 

102# Matches head tag 

103HEAD_TAG_RE = re.compile( 

104 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

105 r"hangul-symbol|syllable-hangul)$|" 

106 + r"^(latin|" 

107 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

108 + r")-(" 

109 + "|".join( 

110 [ 

111 "abbr", 

112 "adj", 

113 "adjective", 

114 "adjective form", 

115 "adjective-form", 

116 "adv", 

117 "adverb", 

118 "affix", 

119 "animal command", 

120 "art", 

121 "article", 

122 "aux", 

123 "bound pronoun", 

124 "bound-pronoun", 

125 "Buyla", 

126 "card num", 

127 "card-num", 

128 "cardinal", 

129 "chunom", 

130 "classifier", 

131 "clitic", 

132 "cls", 

133 "cmene", 

134 "cmavo", 

135 "colloq-verb", 

136 "colverbform", 

137 "combining form", 

138 "combining-form", 

139 "comparative", 

140 "con", 

141 "concord", 

142 "conj", 

143 "conjunction", 

144 "conjug", 

145 "cont", 

146 "contr", 

147 "converb", 

148 "daybox", 

149 "decl", 

150 "decl noun", 

151 "def", 

152 "dem", 

153 "det", 

154 "determ", 

155 "Deva", 

156 "ending", 

157 "entry", 

158 "form", 

159 "fuhivla", 

160 "gerund", 

161 "gismu", 

162 "hanja", 

163 "hantu", 

164 "hanzi", 

165 "head", 

166 "ideophone", 

167 "idiom", 

168 "inf", 

169 "indef", 

170 "infixed pronoun", 

171 "infixed-pronoun", 

172 "infl", 

173 "inflection", 

174 "initialism", 

175 "int", 

176 "interfix", 

177 "interj", 

178 "interjection", 

179 "jyut", 

180 "latin", 

181 "letter", 

182 "locative", 

183 "lujvo", 

184 "monthbox", 

185 "mutverb", 

186 "name", 

187 "nisba", 

188 "nom", 

189 "noun", 

190 "noun form", 

191 "noun-form", 

192 "noun plural", 

193 "noun-plural", 

194 "nounprefix", 

195 "num", 

196 "number", 

197 "numeral", 

198 "ord", 

199 "ordinal", 

200 "par", 

201 "part", 

202 "part form", 

203 "part-form", 

204 "participle", 

205 "particle", 

206 "past", 

207 "past neg", 

208 "past-neg", 

209 "past participle", 

210 "past-participle", 

211 "perfect participle", 

212 "perfect-participle", 

213 "personal pronoun", 

214 "personal-pronoun", 

215 "pref", 

216 "prefix", 

217 "phrase", 

218 "pinyin", 

219 "plural noun", 

220 "plural-noun", 

221 "pos", 

222 "poss-noun", 

223 "post", 

224 "postp", 

225 "postposition", 

226 "PP", 

227 "pp", 

228 "ppron", 

229 "pred", 

230 "predicative", 

231 "prep", 

232 "prep phrase", 

233 "prep-phrase", 

234 "preposition", 

235 "present participle", 

236 "present-participle", 

237 "pron", 

238 "prondem", 

239 "pronindef", 

240 "pronoun", 

241 "prop", 

242 "proper noun", 

243 "proper-noun", 

244 "proper noun form", 

245 "proper-noun form", 

246 "proper noun-form", 

247 "proper-noun-form", 

248 "prov", 

249 "proverb", 

250 "prpn", 

251 "prpr", 

252 "punctuation mark", 

253 "punctuation-mark", 

254 "regnoun", 

255 "rel", 

256 "rom", 

257 "romanji", 

258 "root", 

259 "sign", 

260 "suff", 

261 "suffix", 

262 "syllable", 

263 "symbol", 

264 "verb", 

265 "verb form", 

266 "verb-form", 

267 "verbal noun", 

268 "verbal-noun", 

269 "verbnec", 

270 "vform", 

271 ] 

272 ) 

273 + r")(-|/|\+|$)" 

274) 
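# The compiled pattern therefore matches either one of the literal names in the
# first alternative ("head", "Han char", ...) or a "<lang code>-<head word>"
# template name such as "en-noun" or "fi-verb-form", where the part after the
# language code must be followed by "-", "/", "+" or the end of the name.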

275 

276# Head-templates causing problems (like newlines) that can be squashed into 

277# an empty string in the template handler while saving their template 

278# data for later. 

279WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

280 

281FLOATING_TABLE_TEMPLATES: set[str] = { 

282 # az-suffix-form creates a style=floatright div that is otherwise 

283 # deleted; if it is not pre-expanded, we can intercept the template 

284 # so we add this set into do_not_pre_expand, and intercept the 

285 # templates in parse_part_of_speech 

286 "az-suffix-forms", 

287 "az-inf-p", 

288 "kk-suffix-forms", 

289 "ky-suffix-forms", 

290 "tr-inf-p", 

291 "tr-suffix-forms", 

292 "tt-suffix-forms", 

293 "uz-suffix-forms", 

294} 

295# These two sets contain template names that should either always be 

296# pre-expanded when *first* processing the tree, or never pre-expanded, 

297# so that the templates are left in place with their identifying 

298# name intact for later filtering. 

299 

300DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

301DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

302 

303# Additional templates to be expanded in the pre-expand phase 

304ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

305 "multitrans", 

306 "multitrans-nowiki", 

307 "trans-top", 

308 "trans-top-also", 

309 "trans-bottom", 

310 "checktrans-top", 

311 "checktrans-bottom", 

312 "col", 

313 "col1", 

314 "col2", 

315 "col3", 

316 "col4", 

317 "col5", 

318 "col1-u", 

319 "col2-u", 

320 "col3-u", 

321 "col4-u", 

322 "col5-u", 

323 "check deprecated lang param usage", 

324 "deprecated code", 

325 "ru-verb-alt-ё", 

326 "ru-noun-alt-ё", 

327 "ru-adj-alt-ё", 

328 "ru-proper noun-alt-ё", 

329 "ru-pos-alt-ё", 

330 "ru-alt-ё", 

331 "inflection of", 

332 "no deprecated lang param usage", 

333 "transclude", # these produce sense entries (or other lists) 

334 "tcl", 

335} 

336 

337# Inverse linkage for those that have them 

338linkage_inverses: dict[str, str] = { 

339 # XXX this is not currently used, move to post-processing 

340 "synonyms": "synonyms", 

341 "hypernyms": "hyponyms", 

342 "hyponyms": "hypernyms", 

343 "holonyms": "meronyms", 

344 "meronyms": "holonyms", 

345 "derived": "derived_from", 

346 "coordinate_terms": "coordinate_terms", 

347 "troponyms": "hypernyms", 

348 "antonyms": "antonyms", 

349 "instances": "instance_of", 

350 "related": "related", 

351} 

352 

353# Templates that are used to form panels on pages and that 

354# should be ignored in various positions 

355PANEL_TEMPLATES: set[str] = { 

356 "Character info", 

357 "CJKV", 

358 "French personal pronouns", 

359 "French possessive adjectives", 

360 "French possessive pronouns", 

361 "Han etym", 

362 "Japanese demonstratives", 

363 "Latn-script", 

364 "LDL", 

365 "MW1913Abbr", 

366 "Number-encoding", 

367 "Nuttall", 

368 "Spanish possessive adjectives", 

369 "Spanish possessive pronouns", 

370 "USRegionDisputed", 

371 "Webster 1913", 

372 "ase-rfr", 

373 "attention", 

374 "attn", 

375 "beer", 

376 "broken ref", 

377 "ca-compass", 

378 "character info", 

379 "character info/var", 

380 "checksense", 

381 "compass-fi", 

382 "copyvio suspected", 

383 "delete", 

384 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

385 "etystub", 

386 "examples", 

387 "hu-corr", 

388 "hu-suff-pron", 

389 "interwiktionary", 

390 "ja-kanjitab", 

391 "ko-hanja-search", 

392 "look", 

393 "maintenance box", 

394 "maintenance line", 

395 "mediagenic terms", 

396 "merge", 

397 "missing template", 

398 "morse links", 

399 "move", 

400 "multiple images", 

401 "no inline", 

402 "picdic", 

403 "picdicimg", 

404 "picdiclabel", 

405 "polyominoes", 

406 "predidential nomics", 

407 "punctuation", # This actually gets pre-expanded 

408 "reconstructed", 

409 "request box", 

410 "rf-sound example", 

411 "rfaccents", 

412 "rfap", 

413 "rfaspect", 

414 "rfc", 

415 "rfc-auto", 

416 "rfc-header", 

417 "rfc-level", 

418 "rfc-pron-n", 

419 "rfc-sense", 

420 "rfclarify", 

421 "rfd", 

422 "rfd-redundant", 

423 "rfd-sense", 

424 "rfdate", 

425 "rfdatek", 

426 "rfdef", 

427 "rfe", 

428 "rfe/dowork", 

429 "rfex", 

430 "rfexp", 

431 "rfform", 

432 "rfgender", 

433 "rfi", 

434 "rfinfl", 

435 "rfm", 

436 "rfm-sense", 

437 "rfp", 

438 "rfp-old", 

439 "rfquote", 

440 "rfquote-sense", 

441 "rfquotek", 

442 "rfref", 

443 "rfscript", 

444 "rft2", 

445 "rftaxon", 

446 "rftone", 

447 "rftranslit", 

448 "rfv", 

449 "rfv-etym", 

450 "rfv-pron", 

451 "rfv-quote", 

452 "rfv-sense", 

453 "selfref", 

454 "split", 

455 "stroke order", # XXX consider capturing this? 

456 "stub entry", 

457 "t-needed", 

458 "tbot entry", 

459 "tea room", 

460 "tea room sense", 

461 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

462 "unblock", 

463 "unsupportedpage", 

464 "video frames", 

465 "was wotd", 

466 "wrongtitle", 

467 "zh-forms", 

468 "zh-hanzi-box", 

469 "no entry", 

470} 

471 

472# Template name prefixes used for language-specific panel templates (i.e., 

473# templates that create side boxes or notice boxes or that should generally 

474# be ignored). 

475PANEL_PREFIXES: set[str] = { 

476 "list:compass points/", 

477 "list:Gregorian calendar months/", 

478 "RQ:", 

479} 

480 

481# Templates used for wikipedia links. 

482wikipedia_templates: set[str] = { 

483 "wikipedia", 

484 "slim-wikipedia", 

485 "w", 

486 "W", 

487 "swp", 

488 "wiki", 

489 "Wikipedia", 

490 "wtorw", 

491} 

492for x in PANEL_PREFIXES & wikipedia_templates: 

493 print( 

494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

495 x 

496 ) 

497 ) 

498 

499# Mapping from a template name (without language prefix) for the main word 

500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

501# it could validly occur. This is used as just a sanity check to give 

502# warnings about probably incorrect coding in Wiktionary. 

503template_allowed_pos_map: dict[str, list[str]] = { 

504 "abbr": ["abbrev"], 

505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

506 "plural noun": ["noun", "name"], 

507 "plural-noun": ["noun", "name"], 

508 "proper noun": ["noun", "name"], 

509 "proper-noun": ["name", "noun"], 

510 "prop": ["name", "noun"], 

511 "verb": ["verb", "phrase"], 

512 "gerund": ["verb"], 

513 "particle": ["adv", "particle"], 

514 "adj": ["adj", "adj_noun"], 

515 "pron": ["pron", "noun"], 

516 "name": ["name", "noun"], 

517 "adv": ["adv", "intj", "conj", "particle"], 

518 "phrase": ["phrase", "prep_phrase"], 

519 "noun phrase": ["phrase"], 

520 "ordinal": ["num"], 

521 "number": ["num"], 

522 "pos": ["affix", "name", "num"], 

523 "suffix": ["suffix", "affix"], 

524 "character": ["character"], 

525 "letter": ["character"], 

526 "kanji": ["character"], 

527 "cont": ["abbrev"], 

528 "interj": ["intj"], 

529 "con": ["conj"], 

530 "part": ["particle"], 

531 "prep": ["prep", "postp"], 

532 "postp": ["postp"], 

533 "misspelling": ["noun", "adj", "verb", "adv"], 

534 "part-form": ["verb"], 

535} 

536for k, v in template_allowed_pos_map.items(): 

537 for x in v: 

538 if x not in PARTS_OF_SPEECH: 

539 print( 

540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

541 "".format(x, k, v) 

542 ) 

543 assert False 

544 

545 

546# Templates ignored during etymology extraction, i.e., these will not be listed 

547# in the extracted etymology templates. 

548ignored_etymology_templates: list[str] = [ 

549 "...", 

550 "IPAchar", 

551 "ipachar", 

552 "ISBN", 

553 "isValidPageName", 

554 "redlink category", 

555 "deprecated code", 

556 "check deprecated lang param usage", 

557 "para", 

558 "p", 

559 "cite", 

560 "Cite news", 

561 "Cite newsgroup", 

562 "cite paper", 

563 "cite MLLM 1976", 

564 "cite journal", 

565 "cite news/documentation", 

566 "cite paper/documentation", 

567 "cite video game", 

568 "cite video game/documentation", 

569 "cite newsgroup", 

570 "cite newsgroup/documentation", 

571 "cite web/documentation", 

572 "cite news", 

573 "Cite book", 

574 "Cite-book", 

575 "cite book", 

576 "cite web", 

577 "cite-usenet", 

578 "cite-video/documentation", 

579 "Cite-journal", 

580 "rfe", 

581 "catlangname", 

582 "cln", 

583 "langname-lite", 

584 "no deprecated lang param usage", 

585 "mention", 

586 "m", 

587 "m-self", 

588 "link", 

589 "l", 

590 "ll", 

591 "l-self", 

592] 

593# Regexp for matching ignored etymology template names. This adds certain 

594# prefixes to the names listed above. 

595ignored_etymology_templates_re = re.compile( 

596 r"^((cite-|R:|RQ:).*|" 

597 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

598 + r")$" 

599) 

600 

601# Regexp for matching ignored descendants template names. Right now we just 

602# copy the ignored etymology templates 

603ignored_descendants_templates_re = ignored_etymology_templates_re 

604 

605# Set of template names that are used to define usage examples. If the usage 

606# example contains one of these templates, then its type is set to 

607# "example". 

608usex_templates: set[str] = { 

609 "afex", 

610 "affixusex", 

611 "co", # {{collocation}} acts like an example template, specifically for 

612 # pairs of word combinations that are more common than you'd 

613 # expect by random chance; hlavní#Czech 

614 "coi", 

615 "collocation", 

616 "el-example", 

617 "el-x", 

618 "example", 

619 "examples", 

620 "he-usex", 

621 "he-x", 

622 "hi-usex", 

623 "hi-x", 

624 "ja-usex-inline", 

625 "ja-usex", 

626 "ja-x", 

627 "jbo-example", 

628 "jbo-x", 

629 "km-usex", 

630 "km-x", 

631 "ko-usex", 

632 "ko-x", 

633 "lo-usex", 

634 "lo-x", 

635 "ne-x", 

636 "ne-usex", 

637 "prefixusex", 

638 "ryu-usex", 

639 "ryu-x", 

640 "shn-usex", 

641 "shn-x", 

642 "suffixusex", 

643 "th-usex", 

644 "th-x", 

645 "ur-usex", 

646 "ur-x", 

647 "usex", 

648 "usex-suffix", 

649 "ux", 

650 "uxi", 

651} 

652 

653stop_head_at_these_templates: set[str] = { 

654 "category", 

655 "cat", 

656 "topics", 

657 "catlangname", 

658 "c", 

659 "C", 

660 "top", 

661 "cln", 

662} 

663 

664# Set of template names that are used to define quotation examples. If the 

665# usage example contains one of these templates, then its type is set to 

666# "quotation". 

667quotation_templates: set[str] = { 

668 "collapse-quote", 

669 "quote-av", 

670 "quote-book", 

671 "quote-GYLD", 

672 "quote-hansard", 

673 "quotei", 

674 "quote-journal", 

675 "quotelite", 

676 "quote-mailing list", 

677 "quote-meta", 

678 "quote-newsgroup", 

679 "quote-song", 

680 "quote-text", 

681 "quote", 

682 "quote-us-patent", 

683 "quote-video game", 

684 "quote-web", 

685 "quote-wikipedia", 

686 "wikiquote", 

687 "Wikiquote", 

688} 

689 

690taxonomy_templates = { 

691 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

692 "taxfmt", 

693 "taxlink", 

694 "taxlink2", 

695 "taxlinknew", 

696 "taxlook", 

697} 

698 

699# Template names; this was extracted from template_linkage_mappings, 

700# because the code using template_linkage_mappings was actually not used 

701# (but not removed). 

702template_linkages_to_ignore_in_examples: set[str] = { 

703 "syn", 

704 "synonyms", 

705 "ant", 

706 "antonyms", 

707 "hyp", 

708 "hyponyms", 

709 "der", 

710 "derived terms", 

711 "coordinate terms", 

712 "cot", 

713 "rel", 

714 "col", 

715 "inline alt forms", 

716 "alti", 

717 "comeronyms", 

718 "holonyms", 

719 "holo", 

720 "hypernyms", 

721 "hyper", 

722 "meronyms", 

723 "mero", 

724 "troponyms", 

725 "perfectives", 

726 "pf", 

727 "imperfectives", 

728 "impf", 

729 "syndiff", 

730 "synsee", 

731 # not linkage nor example templates 

732 "sense", 

733 "s", 

734 "color panel", 

735 "colour panel", 

736} 

737 

738# Maps template name used in a word sense to a linkage field that it adds. 

739sense_linkage_templates: dict[str, str] = { 

740 "syn": "synonyms", 

741 "synonyms": "synonyms", 

742 "synsee": "synonyms", 

743 "syndiff": "synonyms", 

744 "hyp": "hyponyms", 

745 "hyponyms": "hyponyms", 

746 "ant": "antonyms", 

747 "antonyms": "antonyms", 

748 "alti": "related", 

749 "inline alt forms": "related", 

750 "coordinate terms": "coordinate_terms", 

751 "cot": "coordinate_terms", 

752 "comeronyms": "related", 

753 "holonyms": "holonyms", 

754 "holo": "holonyms", 

755 "hypernyms": "hypernyms", 

756 "hyper": "hypernyms", 

757 "meronyms": "meronyms", 

758 "mero": "meronyms", 

759 "troponyms": "troponyms", 

760 "perfectives": "related", 

761 "pf": "related", 

762 "imperfectives": "related", 

763 "impf": "related", 

764} 

765 

766sense_linkage_templates_tags: dict[str, list[str]] = { 

767 "alti": ["alternative"], 

768 "inline alt forms": ["alternative"], 

769 "comeronyms": ["comeronym"], 

770 "perfectives": ["perfective"], 

771 "pf": ["perfective"], 

772 "imperfectives": ["imperfective"], 

773 "impf": ["imperfective"], 

774} 

775 

776 

777def decode_html_entities(v: Union[str, int]) -> str: 

778 """Decodes HTML entities from a value, converting them to the respective 

779 Unicode characters/strings.""" 
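 # For example, decode_html_entities("&lt;3") returns "<3", and an integer
 # input such as 42 is simply returned as the string "42".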

780 if isinstance(v, int): 

781 # I changed this to return str(v) instead of v = str(v), 

782 # but there might have been the intention to have more logic 

783 # here. html.unescape would not do anything special with an integer, 

784 # it needs html escape symbols (&xx;). 

785 return str(v) 

786 return html.unescape(v) 

787 

788 

789def parse_sense_linkage( 

790 wxr: WiktextractContext, 

791 data: SenseData, 

792 name: str, 

793 ht: TemplateArgs, 

794 pos: str, 

795) -> None: 

796 """Parses a linkage (synonym, etc) specified in a word sense.""" 

797 assert isinstance(wxr, WiktextractContext) 

798 assert isinstance(data, dict) 

799 assert isinstance(name, str) 

800 assert isinstance(ht, dict) 

801 field = sense_linkage_templates[name] 

802 field_tags = sense_linkage_templates_tags.get(name, []) 

803 for i in range(2, 20): 
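 # In these templates positional argument 1 is the language code and
 # arguments 2..19 are the linked terms; the matching qualifier and
 # translation parameters are keyed as q1/t1 for argument 2, q2/t2 for
 # argument 3, and so on (hence the i - 1 below).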

804 w = ht.get(i) or "" 

805 w = clean_node(wxr, data, w) 

806 is_thesaurus = False 

807 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

808 if w.startswith(alias): 

809 is_thesaurus = True 

810 w = w[len(alias) :] 

811 if w != wxr.wtp.title: 

812 from ...thesaurus import search_thesaurus 

813 

814 lang_code = clean_node(wxr, None, ht.get(1, "")) 

815 for t_data in search_thesaurus( 

816 wxr.thesaurus_db_conn, w, lang_code, pos, field 

817 ): 

818 l_data = { 

819 "word": t_data.term, 

820 "source": "Thesaurus:" + w, 

821 } 

822 if len(t_data.tags) > 0: 

823 l_data["tags"] = t_data.tags 

824 if len(t_data.raw_tags) > 0: 

825 l_data["raw_tags"] = t_data.raw_tags 

826 data_append(data, field, l_data) 

827 break 

828 if not w: 

829 break 

830 if is_thesaurus: 

831 continue 

832 tags: list[str] = [] 

833 topics: list[str] = [] 

834 english: Optional[str] = None 

835 # Try to find qualifiers for this synonym 

836 q = ht.get("q{}".format(i - 1)) 

837 if q: 

838 cls = classify_desc(q) 

839 if cls == "tags": 

840 tagsets1, topics1 = decode_tags(q) 

841 for ts in tagsets1: 

842 tags.extend(ts) 

843 topics.extend(topics1) 

844 elif cls == "english": 

845 if english: 

846 english += "; " + q 

847 else: 

848 english = q 

849 # Try to find English translation for this synonym 

850 t = ht.get("t{}".format(i - 1)) 

851 if t: 

852 if english: 

853 english += "; " + t 

854 else: 

855 english = t 

856 

857 # See if the linkage contains a parenthesized alt 

858 alt = None 

859 m = re.search(r"\(([^)]+)\)$", w) 

860 if m: 

861 w = w[: m.start()].strip() 

862 alt = m.group(1) 

863 

864 dt = {"word": w} 

865 if field_tags: 

866 data_extend(dt, "tags", field_tags) 

867 if tags: 

868 data_extend(dt, "tags", tags) 

869 if topics: 

870 data_extend(dt, "topics", topics) 

871 if english: 

872 dt["english"] = english # DEPRECATED for "translation" 

873 dt["translation"] = english 

874 if alt: 

875 dt["alt"] = alt 

876 data_append(data, field, dt) 

877 

878 

879EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

880example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

881captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 
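# These regexes split example lines on runs of dash-like characters (em dash
# and horizontal bar), which Wiktionary example templates typically emit
# between the original-language text, its romanization and the translation;
# captured_splitters_re keeps the separators so they can be re-joined later.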

882 

883 

884def synch_splits_with_args( 

885 line: str, targs: TemplateArgs 

886) -> Optional[list[str]]: 

887 """If it looks like there's something weird with how a line of example 

888 text has been split, this function will do the splitting after counting 

889 occurrences of the splitting regex inside the two main template arguments 

890 containing the string data for the original language example and the 

891 English translations. 

892 """ 

893 # Previously, we split without capturing groups, but here we want to 

894 # keep the original splitting hyphen regex intact. 

895 fparts = captured_splitters_re.split(line) 

896 new_parts = [] 

897 # ["First", " – ", "second", " – ", "third..."] from OL argument 

898 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 
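 # If the original-language argument itself contains k separators, its text
 # occupies the first 2*k + 1 elements of fparts (text pieces interleaved
 # with the captured separators), so `first` indexes the element just after it.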

899 new_parts.append("".join(fparts[:first])) 

900 # Translation argument 

901 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

902 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

903 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

904 new_parts.append("".join(fparts[first + 1 : second])) 

905 

906 if all(new_parts): # no empty strings from the above spaghetti 

907 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

908 return new_parts 

909 else: 

910 return None 

911 

912 

913QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

914QUALIFIERS_RE = re.compile(QUALIFIERS) 

915# (...): ... or (...(...)...): ... 

916 

917 

918def parse_language( 

919 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

920) -> list[WordData]: 

921 """Iterates over the text of the page, returning words (parts-of-speech) 

922 defined on the page one at a time. (Individual word senses for the 

923 same part-of-speech are typically encoded in the same entry.)""" 

924 # imported here to avoid circular import 

925 from .pronunciation import parse_pronunciation 

926 

927 assert isinstance(wxr, WiktextractContext) 

928 assert isinstance(langnode, WikiNode) 

929 assert isinstance(language, str) 

930 assert isinstance(lang_code, str) 

931 # print("parse_language", language) 

932 

933 is_reconstruction = False 

934 word: str = wxr.wtp.title # type: ignore[assignment] 

935 unsupported_prefix = "Unsupported titles/" 

936 if word.startswith(unsupported_prefix): 

937 w = word[len(unsupported_prefix) :] 

938 if w in unsupported_title_map: 

939 word = unsupported_title_map[w] 

940 else: 

941 wxr.wtp.error( 

942 "Unimplemented unsupported title: {}".format(word), 

943 sortid="page/870", 

944 ) 

945 word = w 

946 elif word.startswith("Reconstruction:"): 

947 word = word[word.find("/") + 1 :] 

948 is_reconstruction = True 

949 

950 base_data: WordData = { 

951 "word": word, 

952 "lang": language, 

953 "lang_code": lang_code, 

954 } 

955 if is_reconstruction: 

956 data_append(base_data, "tags", "reconstruction") 

957 sense_data: SenseData = {} 

958 pos_data: WordData = {} # For a current part-of-speech 

959 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

960 etym_data: WordData = {} # For one etymology 

961 sense_datas: list[SenseData] = [] 

962 sense_ordinal = 0 # The recursive sense parsing messes up the ordering 

963 # Never reset, do not use as data 

964 level_four_datas: list[WordData] = [] 

965 etym_datas: list[WordData] = [] 

966 page_datas: list[WordData] = [] 

967 have_etym = False 

968 inside_level_four = False # This is for checking if the etymology section 

969 # or article has a Pronunciation section, for Chinese mostly; because 

970 # Chinese articles can have three level three sections (two etymology 

971 # sections and pronunciation sections) one after another, we need a kludge 

972 # to better keep track of whether we're in a normal "etym" or inside a 

973 # "level four" (which is what we've turned the level three Pron sections 

974 # into in the fix_subtitle_hierarchy()); all other sections are demoted by 

975 # a step. 

976 stack: list[str] = [] # names of items on the "stack" 

977 

978 def merge_base(data: WordData, base: WordData) -> None: 

979 for k, v in base.items(): 

980 # Copy the value to ensure that we don't share lists or 

981 # dicts between structures (even nested ones). 

982 v = copy.deepcopy(v) 

983 if k not in data: 

984 # The list was copied above, so this will not create shared ref 

985 data[k] = v # type: ignore[literal-required] 

986 continue 

987 if data[k] == v: # type: ignore[literal-required] 

988 continue 

989 if ( 

990 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

991 or isinstance( 

992 v, 

993 (list, tuple), # Should this be "and"? 

994 ) 

995 ): 

996 data[k] = list(data[k]) + list(v) # type: ignore 

997 elif data[k] != v: # type: ignore[literal-required] 

998 wxr.wtp.warning( 

999 "conflicting values for {} in merge_base: " 

1000 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1001 sortid="page/904", 

1002 ) 

1003 

1004 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1005 """Remove unnecessary keys from dict values 

1006 in a list comprehension...""" 

1007 if key in pron: 

1008 pron.pop(key) # type: ignore 

1009 return pron 

1010 

1011 # If the result has sounds, eliminate sounds that have a prefix that 

1012 # does not match "word" or one of "forms" 

1013 if "sounds" in data and "word" in data: 

1014 accepted = [data["word"]] 

1015 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1016 data["sounds"] = list( 

1017 s 

1018 for s in data["sounds"] 

1019 if "form" not in s or s["form"] in accepted 

1020 ) 

1021 # If the result has sounds, eliminate sounds that have a pos that 

1022 # does not match "pos" 

1023 if "sounds" in data and "pos" in data: 

1024 data["sounds"] = list( 

1025 complementary_pop(s, "pos") 

1026 for s in data["sounds"] 

1027 # "pos" is (correctly) not a field of SoundData, so we're 

1028 # removing it here. It's a kludge on a kludge on a kludge. 

1029 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

1030 ) 

1031 

1032 def push_sense(sorting_ordinal: int | None = None) -> bool: 

1033 """Starts collecting data for a new word sense. This returns True 

1034 if a sense was added.""" 

1035 nonlocal sense_data 

1036 if sorting_ordinal is None: 

1037 sorting_ordinal = sense_ordinal 

1038 tags = sense_data.get("tags", ()) 

1039 if ( 

1040 not sense_data.get("glosses") 

1041 and "translation-hub" not in tags 

1042 and "no-gloss" not in tags 

1043 ): 

1044 return False 

1045 

1046 if ( 

1047 ( 

1048 "participle" in sense_data.get("tags", ()) 

1049 or "infinitive" in sense_data.get("tags", ()) 

1050 ) 

1051 and "alt_of" not in sense_data 

1052 and "form_of" not in sense_data 

1053 and "etymology_text" in etym_data 

1054 and etym_data["etymology_text"] != "" 

1055 ): 

1056 etym = etym_data["etymology_text"] 

1057 etym = etym.split(". ")[0] 

1058 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1059 if ret is not None: 

1060 tags, lst = ret 

1061 assert isinstance(lst, (list, tuple)) 

1062 if "form-of" in tags: 

1063 data_extend(sense_data, "form_of", lst) 

1064 data_extend(sense_data, "tags", tags) 

1065 elif "alt-of" in tags: 

1066 data_extend(sense_data, "alt_of", lst) 

1067 data_extend(sense_data, "tags", tags) 

1068 

1069 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 

1070 "tags", () 

1071 ): 

1072 data_append(sense_data, "tags", "no-gloss") 

1073 

1074 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal 
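 # This key exists only for restoring document order after the recursive
 # sense parsing; it is stripped again at the end of parse_part_of_speech.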

1075 sense_datas.append(sense_data) 

1076 sense_data = {} 

1077 return True 

1078 

1079 def push_pos(sorting_ordinal: int | None = None) -> None: 

1080 """Starts collecting data for a new part-of-speech.""" 

1081 nonlocal pos_data 

1082 nonlocal sense_datas 

1083 push_sense(sorting_ordinal) 

1084 if wxr.wtp.subsection: 

1085 data: WordData = {"senses": sense_datas} 

1086 merge_base(data, pos_data) 

1087 level_four_datas.append(data) 

1088 pos_data = {} 

1089 sense_datas = [] 

1090 wxr.wtp.start_subsection(None) 

1091 

1092 def push_level_four_section(clear_sound_data: bool) -> None: 

1093 """Starts collecting data for a new level four section, which 

1094 is usually virtual and empty, unless the article has Chinese 

1095 'Pronunciation' sections that are etymology-section-like but 

1096 under etymology, and at the same level in the source. We modify 

1097 the source to demote Pronunciation sections like that to level 

1098 4, and other sections one step lower.""" 

1099 nonlocal level_four_data 

1100 nonlocal level_four_datas 

1101 nonlocal etym_datas 

1102 push_pos() 

1103 # print(f"======\n{etym_data=}") 

1104 # print(f"======\n{etym_datas=}") 

1105 # print(f"======\n{level_four_data=}") 

1106 # print(f"======\n{level_four_datas=}") 

1107 for data in level_four_datas: 

1108 merge_base(data, level_four_data) 

1109 etym_datas.append(data) 

1110 for data in etym_datas: 

1111 merge_base(data, etym_data) 

1112 page_datas.append(data) 

1113 if clear_sound_data: 

1114 level_four_data = {} 

1115 level_four_datas = [] 

1116 etym_datas = [] 

1117 

1118 def push_etym() -> None: 

1119 """Starts collecting data for a new etymology.""" 

1120 nonlocal etym_data 

1121 nonlocal etym_datas 

1122 nonlocal have_etym 

1123 nonlocal inside_level_four 

1124 have_etym = True 

1125 push_level_four_section(False) 

1126 inside_level_four = False 

1127 # the etymology section could be under a pronunciation section 

1128 etym_data = ( 

1129 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {} 

1130 ) 

1131 

1132 def select_data() -> WordData: 

1133 """Selects where to store data (pos or etym) based on whether we 

1134 are inside a pos (part-of-speech).""" 

1135 # print(f"{wxr.wtp.subsection=}") 

1136 # print(f"{stack=}") 

1137 if wxr.wtp.subsection is not None: 

1138 return pos_data 

1139 if inside_level_four: 

1140 return level_four_data 

1141 if stack[-1] == language: 

1142 return base_data 

1143 return etym_data 

1144 

1145 term_label_templates: list[TemplateData] = [] 

1146 

1147 def head_post_template_fn( 

1148 name: str, ht: TemplateArgs, expansion: str 

1149 ) -> Optional[str]: 

1150 """Handles special templates in the head section of a word. Head 

1151 section is the text after part-of-speech subtitle and before word 

1152 sense list. Typically it generates the bold line for the word, but 

1153 may also contain other useful information that often ends up in 

1154 side boxes. We want to capture some of that additional information.""" 

1155 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1156 if is_panel_template(wxr, name): 

1157 # Completely ignore these templates (not even recorded in 

1158 # head_templates) 

1159 return "" 

1160 if name == "head": 

1161 # XXX are these also captured in forms? Should this special case 

1162 # be removed? 

1163 t = ht.get(2, "") 

1164 if t == "pinyin": 

1165 data_append(pos_data, "tags", "Pinyin") 

1166 elif t == "romanization": 

1167 data_append(pos_data, "tags", "romanization") 

1168 if ( 

1169 HEAD_TAG_RE.search(name) is not None 

1170 or name in WORD_LEVEL_HEAD_TEMPLATES 

1171 ): 

1172 args_ht = clean_template_args(wxr, ht) 

1173 cleaned_expansion = clean_node(wxr, None, expansion) 

1174 dt: TemplateData = { 

1175 "name": name, 

1176 "args": args_ht, 

1177 "expansion": cleaned_expansion, 

1178 } 

1179 data_append(pos_data, "head_templates", dt) 

1180 if name in WORD_LEVEL_HEAD_TEMPLATES: 

1181 term_label_templates.append(dt) 

1182 # Squash these, their tags are applied to the whole word, 

1183 # and some cause problems like "term-label" 

1184 return "" 

1185 

1186 # The following are both captured in head_templates and parsed 

1187 # separately 

1188 

1189 if name in wikipedia_templates: 

1190 # Note: various places expect to have content from wikipedia 

1191 # templates, so cannot convert this to empty 

1192 parse_wikipedia_template(wxr, pos_data, ht) 

1193 return None 

1194 

1195 if name == "number box": 

1196 # XXX extract numeric value? 

1197 return "" 

1198 if name == "enum": 

1199 # XXX extract? 

1200 return "" 

1201 if name == "cardinalbox": 

1202 # XXX extract similar to enum? 

1203 # XXX this can also occur in top-level under language 

1204 return "" 

1205 if name == "Han simplified forms": 

1206 # XXX extract? 

1207 return "" 

1208 # if name == "ja-kanji forms": 

1209 # # XXX extract? 

1210 # return "" 

1211 # if name == "vi-readings": 

1212 # # XXX extract? 

1213 # return "" 

1214 # if name == "ja-kanji": 

1215 # # XXX extract? 

1216 # return "" 

1217 if name == "picdic" or name == "picdicimg" or name == "picdiclabel": 

1218 # XXX extract? 

1219 return "" 

1220 

1221 return None 

1222 

1223 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1224 """Parses the subsection for a part-of-speech under a language on 

1225 a page.""" 

1226 assert isinstance(posnode, WikiNode) 

1227 assert isinstance(pos, str) 

1228 # print("parse_part_of_speech", pos) 

1229 pos_data["pos"] = pos 

1230 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1231 lists: list[list[WikiNode]] = [[]] # list of lists 
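 # pre[i] collects the nodes that make up the i-th head line of this PoS
 # section, and lists[i] collects the gloss lists that follow that head;
 # the two are kept in parallel (a new pair is started e.g. at a <br>).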

1232 first_para = True 

1233 first_head_tmplt = True 

1234 collecting_head = True 

1235 start_of_paragraph = True 

1236 

1237 # XXX extract templates from posnode with recursively_extract 

1238 # that break stuff, like ja-kanji or az-suffix-form. 

1239 # Do the extraction with a list of template names, combined from 

1240 # different lists, then separate them out into different lists 

1241 # that are handled at different points of the POS section. 

1242 # First, extract az-suffix-form, put it in `inflection`, 

1243 # and parse `inflection`'s content when appropriate later. 

1244 # The contents of az-suffix-form (and ja-kanji) that generate 

1245 # divs with "floatright" in their style gets deleted by 

1246 # clean_value, so templates that slip through from here won't 

1247 # break anything. 

1248 # XXX bookmark 

1249 # print("===================") 

1250 # print(posnode.children) 

1251 

1252 floaters, poschildren = recursively_extract( 

1253 posnode.children, 

1254 lambda x: ( 

1255 isinstance(x, WikiNode) 

1256 and ( 

1257 ( 

1258 x.kind == NodeKind.TEMPLATE 

1259 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES 

1260 ) 

1261 or ( 

1262 x.kind == NodeKind.LINK 

1263 # Need to check for stringiness because some links are 

1264 # broken; for example, if a template is missing an 

1265 # argument, a link might look like `[[{{{1}}}...]]` 

1266 and isinstance(x.largs[0][0], str) 

1267 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] 

1268 ) 

1269 ) 

1270 ), 

1271 ) 
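 # Wrap the extracted floating-table templates in a synthetic level-6
 # "Inflection" node so they can be fed through the normal inflection
 # section parser below.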

1272 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1273 tempnode.largs = [["Inflection"]] 

1274 tempnode.children = floaters 

1275 parse_inflection(tempnode, "Floating Div", pos) 

1276 # print(poschildren) 

1277 # XXX new above 

1278 

1279 if not poschildren: 

1280 if not floaters: 

1281 wxr.wtp.debug( 

1282 "PoS section without contents", 

1283 sortid="en/page/1051/20230612", 

1284 ) 

1285 else: 

1286 wxr.wtp.debug( 

1287 "PoS section without contents except for a floating table", 

1288 sortid="en/page/1056/20230612", 

1289 ) 

1290 return 

1291 

1292 for node in poschildren: 

1293 if isinstance(node, str): 

1294 for m in re.finditer(r"\n+|[^\n]+", node): 

1295 p = m.group(0) 

1296 if p.startswith("\n\n") and pre: 

1297 first_para = False 

1298 start_of_paragraph = True 

1299 break 

1300 if p and collecting_head: 

1301 pre[-1].append(p) 

1302 continue 

1303 assert isinstance(node, WikiNode) 

1304 kind = node.kind 

1305 if kind == NodeKind.LIST: 

1306 lists[-1].append(node) 

1307 collecting_head = False 

1308 start_of_paragraph = True 

1309 continue 

1310 elif kind in LEVEL_KINDS: 

1311 # Stop parsing section if encountering any kind of 

1312 # level header (like ===Noun=== or ====Further Reading====). 

1313 # At a quick glance, this should be the default behavior, 

1314 # but if some kinds of source articles have sub-sub-sections 

1315 # that should be parsed XXX it should be handled by changing 

1316 # this break. 

1317 break 

1318 elif collecting_head and kind == NodeKind.LINK: 

1319 # We might collect relevant links as they are often pictures 

1320 # relating to the word 

1321 if len(node.largs[0]) >= 1 and isinstance( 

1322 node.largs[0][0], str 

1323 ): 

1324 if node.largs[0][0].startswith( 

1325 ns_title_prefix_tuple(wxr, "Category") 

1326 ): 

1327 # [[Category:...]] 

1328 # We're at the end of the file, probably, so stop 

1329 # here. Otherwise the head will get garbage. 

1330 break 

1331 if node.largs[0][0].startswith( 

1332 ns_title_prefix_tuple(wxr, "File") 

1333 ): 

1334 # Skips file links 

1335 continue 

1336 start_of_paragraph = False 

1337 pre[-1].extend(node.largs[-1]) 

1338 elif kind == NodeKind.HTML: 

1339 if node.sarg == "br": 

1340 if pre[-1]: 

1341 pre.append([]) # Switch to next head 

1342 lists.append([]) # Lists parallels pre 

1343 collecting_head = True 

1344 start_of_paragraph = True 

1345 elif collecting_head and node.sarg not in ( 

1346 "gallery", 

1347 "ref", 

1348 "cite", 

1349 "caption", 

1350 ): 

1351 start_of_paragraph = False 

1352 pre[-1].append(node) 

1353 else: 

1354 start_of_paragraph = False 

1355 elif isinstance(node, TemplateNode): 

1356 # XXX Insert code here that disambiguates between 

1357 # templates that generate word heads and templates 

1358 # that don't. 

1359 # There's head_tag_re that seems like a regex meant 

1360 # to identify head templates. Too bad it's None. 

1361 

1362 # ignore {{category}}, {{cat}}... etc. 

1363 if node.template_name in stop_head_at_these_templates: 

1364 # we've reached a template that should be at the end, 

1365 continue 

1366 

1367 # skip these templates; panel_templates is already used 

1368 # to skip certain templates elsewhere, but it also applies to 

1369 # head parsing quite well. 

1370 # node.largs[0][0] should always be str, but can't type-check 

1371 # that. 

1372 if is_panel_template(wxr, node.template_name): 

1373 continue 

1374 # skip these templates 

1375 # if node.largs[0][0] in skip_these_templates_in_head: 

1376 # first_head_tmplt = False # no first_head_tmplt at all 

1377 # start_of_paragraph = False 

1378 # continue 

1379 

1380 if first_head_tmplt and pre[-1]: 

1381 first_head_tmplt = False 

1382 start_of_paragraph = False 

1383 pre[-1].append(node) 

1384 elif pre[-1] and start_of_paragraph: 

1385 pre.append([]) # Switch to the next head 

1386 lists.append([]) # lists parallel pre 

1387 collecting_head = True 

1388 start_of_paragraph = False 

1389 pre[-1].append(node) 

1390 else: 

1391 pre[-1].append(node) 

1392 elif first_para: 

1393 start_of_paragraph = False 

1394 if collecting_head: 

1395 pre[-1].append(node) 

1396 # XXX use template_fn in clean_node to check that the head macro 

1397 # is compatible with the current part-of-speech and generate warning 

1398 # if not. Use template_allowed_pos_map. 

1399 

1400 # Clean up empty pairs, and fix messes with extra newlines that 

1401 # separate templates that are followed by lists (wiktextract issue #314) 

1402 

1403 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1404 cleaned_lists: list[list[WikiNode]] = [] 

1405 pairless_pre_index = None 

1406 

1407 for pre1, ls in zip(pre, lists): 

1408 if pre1 and not ls: 

1409 pairless_pre_index = len(cleaned_pre) 

1410 if not pre1 and not ls: 

1411 # skip [] + [] 

1412 continue 

1413 if not ls and all( 

1414 (isinstance(x, str) and not x.strip()) for x in pre1 

1415 ): 

1416 # skip ["\n", " "] + [] 

1417 continue 

1418 if ls and not pre1: 

1419 if pairless_pre_index is not None: 

1420 cleaned_lists[pairless_pre_index] = ls 

1421 pairless_pre_index = None 

1422 continue 

1423 cleaned_pre.append(pre1) 

1424 cleaned_lists.append(ls) 

1425 

1426 pre = cleaned_pre 

1427 lists = cleaned_lists 

1428 

1429 there_are_many_heads = len(pre) > 1 

1430 header_tags: list[str] = [] 

1431 header_topics: list[str] = [] 

1432 previous_head_had_list = False 

1433 

1434 if not any(g for g in lists): 
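 # No gloss list was found under any head; try to pull a gloss directly
 # from the section body instead.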

1435 process_gloss_without_list( 

1436 poschildren, pos, pos_data, header_tags, header_topics 

1437 ) 

1438 else: 

1439 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1440 # if len(ls) == 0: 

1441 # # don't have gloss list 

1442 # # XXX add code here to filter out 'garbage', like text 

1443 # # that isn't a head template or head. 

1444 # continue 

1445 

1446 if all(not sl for sl in lists[i:]): 

1447 if i == 0: 

1448 if isinstance(node, str): 

1449 wxr.wtp.debug( 

1450 "first head without list of senses," 

1451 "string: '{}[...]', {}/{}".format( 

1452 node[:20], word, language 

1453 ), 

1454 sortid="page/1689/20221215", 

1455 ) 

1456 if isinstance(node, WikiNode): 

1457 if node.largs and node.largs[0][0] in [ 

1458 "Han char", 

1459 ]: 

1460 # just ignore these templates 

1461 pass 

1462 else: 

1463 wxr.wtp.debug( 

1464 "first head without " 

1465 "list of senses, " 

1466 "template node " 

1467 "{}, {}/{}".format( 

1468 node.largs, word, language 

1469 ), 

1470 sortid="page/1694/20221215", 

1471 ) 

1472 else: 

1473 wxr.wtp.debug( 

1474 "first head without list of senses, " 

1475 "{}/{}".format(word, language), 

1476 sortid="page/1700/20221215", 

1477 ) 

1478 # no break here so that the first head always 

1479 # gets processed. 

1480 else: 

1481 if isinstance(node, str): 

1482 wxr.wtp.debug( 

1483 "later head without list of senses," 

1484 "string: '{}[...]', {}/{}".format( 

1485 node[:20], word, language 

1486 ), 

1487 sortid="page/1708/20221215", 

1488 ) 

1489 if isinstance(node, WikiNode): 

1490 wxr.wtp.debug( 

1491 "later head without list of senses," 

1492 "template node " 

1493 "{}, {}/{}".format( 

1494 node.sarg if node.sarg else node.largs, 

1495 word, 

1496 language, 

1497 ), 

1498 sortid="page/1713/20221215", 

1499 ) 

1500 else: 

1501 wxr.wtp.debug( 

1502 "later head without list of senses, " 

1503 "{}/{}".format(word, language), 

1504 sortid="page/1719/20221215", 

1505 ) 

1506 break 

1507 head_group = i + 1 if there_are_many_heads else None 
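 # When a PoS section has several head lines, number them from 1 so each
 # sense can record which head it belongs to (stored as "head_nr").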

1508 # print("parse_part_of_speech: {}: {}: pre={}" 

1509 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1510 

1511 if previous_head_had_list: 

1512 # We use a boolean flag here because we want to be able to 

1513 # let the header_tags data pass through after the loop 

1514 # is over without accidentally emptying it, if there are 

1515 # no pos_datas and we need a dummy data. 

1516 header_tags.clear() 

1517 header_topics.clear() 

1518 

1519 process_gloss_header( 

1520 pre1, pos, head_group, pos_data, header_tags, header_topics 

1521 ) 

1522 for ln in ls: 

1523 # Parse each list associated with this head. 

1524 for node in ln.children: 

1525 # Parse nodes in l.children recursively. 

1526 # The recursion function uses push_sense() to 

1527 # add stuff into sense_datas, and returns True or 

1528 # False if something is added, which bubbles upward. 

1529 # If the bubble is "True", then higher levels of 

1530 # the recursion will not push_sense(), because 

1531 # the data is already pushed into a sub-gloss 

1532 # downstream, unless the higher level has examples 

1533 # that need to be put somewhere. 

1534 common_data: SenseData = { 

1535 "tags": list(header_tags), 

1536 "topics": list(header_topics), 

1537 } 

1538 if head_group: 

1539 common_data["head_nr"] = head_group 

1540 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1541 

1542 if len(ls) > 0: 

1543 previous_head_had_list = True 

1544 else: 

1545 previous_head_had_list = False 

1546 

1547 # If there are no senses extracted, add a dummy sense. We want to 

1548 # keep tags extracted from the head for the dummy sense. 

1549 push_sense() # Make sure unfinished data pushed, and start clean sense 

1550 if len(sense_datas) == 0: 

1551 data_extend(sense_data, "tags", header_tags) 

1552 data_extend(sense_data, "topics", header_topics) 

1553 data_append(sense_data, "tags", "no-gloss") 

1554 push_sense() 

1555 

1556 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) 

1557 

1558 for sd in sense_datas: 

1559 if "__temp_sense_sorting_ordinal" in sd: 

1560 del sd["__temp_sense_sorting_ordinal"] 

1561 

1562 def process_gloss_header( 

1563 header_nodes: list[Union[WikiNode, str]], 

1564 pos_type: str, 

1565 header_group: Optional[int], 

1566 pos_data: WordData, 

1567 header_tags: list[str], 

1568 header_topics: list[str], 

1569 ) -> None: 

1570 ruby = [] 

1571 links: list[str] = [] 

1572 

1573 # process template parse nodes here 

1574 new_nodes = [] 

1575 info_template_data = [] 

1576 for node in header_nodes: 

1577 # print(f"{node=}") 

1578 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1579 if info_data or info_out: 

1580 if info_data: 

1581 info_template_data.append(info_data) 

1582 if info_out: # including just the original node 

1583 new_nodes.append(info_out) 

1584 else: 

1585 new_nodes.append(node) 

1586 header_nodes = new_nodes 

1587 

1588 if info_template_data: 

1589 if "info_templates" not in pos_data: 

1590 pos_data["info_templates"] = info_template_data 

1591 else: 

1592 pos_data["info_templates"].extend(info_template_data) 

1593 

1594 if not word.isalnum(): 

1595 # `-` is kosher, add more of these if needed. 

1596 if word.replace("-", "").isalnum(): 

1597 pass 

1598 else: 

1599 # if the word contains non-letter or -number characters, it 

1600 # might have something that messes with split-at-semi-comma; we 

1601 # collect links so that we can skip splitting them. 

1602 exp = wxr.wtp.parse( 

1603 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1604 ) 

1605 link_nodes, _ = recursively_extract( 

1606 exp.children, 

1607 lambda x: isinstance(x, WikiNode) 

1608 and x.kind == NodeKind.LINK, 

1609 ) 

1610 for ln in link_nodes: 

1611 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1612 if not ltext.isalnum(): 

1613 links.append(ltext) 

1614 if word not in links: 

1615 links.append(word) 

1616 

1617 if lang_code == "ja": 

1618 exp = wxr.wtp.parse( 

1619 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1620 ) 

1621 rub, _ = recursively_extract( 

1622 exp.children, 

1623 lambda x: isinstance(x, WikiNode) 

1624 and x.kind == NodeKind.HTML 

1625 and x.sarg == "ruby", 

1626 ) 

1627 if rub is not None: 

1628 for r in rub: 

1629 if TYPE_CHECKING: 

1630 # we know the lambda above in recursively_extract 

1631 # returns only WikiNodes in rub 

1632 assert isinstance(r, WikiNode) 

1633 rt = parse_ruby(wxr, r) 

1634 if rt is not None: 

1635 ruby.append(rt) 

1636 elif lang_code == "vi": 

1637 # Handle vi-readings templates that have a weird structure for 

1638 # Chu Nom vietnamese characters heads 

1639 # https://en.wiktionary.org/wiki/Template:vi-readings 

1640 new_header_nodes = [] 

1641 related_readings: list[LinkageData] = [] 

1642 for node in header_nodes: 

1643 if ( 

1644 isinstance(node, TemplateNode) 

1645 and node.template_name == "vi-readings" 

1646 ): 

1647 print(node.template_parameters) 

1648 for parameter, tag in ( 

1649 ("hanviet", "han-viet-reading"), 

1650 ("nom", "nom-reading"), 

1651 # we ignore the fanqie parameter "phienthiet" 

1652 ): 

1653 arg = node.template_parameters.get(parameter) 

1654 if arg is not None: 

1655 text = clean_node(wxr, None, arg) 

1656 for w in text.split(","): 

1657 # ignore - separated references 

1658 if "-" in w: 

1659 w = w[: w.index("-")] 

1660 w = w.strip() 

1661 related_readings.append( 

1662 LinkageData(word=w, tags=[tag]) 

1663 ) 

1664 continue 

1665 

1666 # Skip the vi-reading template for the rest of the head parsing 

1667 new_header_nodes.append(node) 

1668 if len(related_readings) > 0: 

1669 data_extend(pos_data, "related", related_readings) 

1670 header_nodes = new_header_nodes 

1671 

1672 header_text = clean_node( 

1673 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1674 ) 

1675 

1676 if not header_text.strip(): 

1677 return 

1678 

1679 term_label_tags: list[str] = [] 

1680 term_label_topics: list[str] = [] 

1681 if len(term_label_templates) > 0: 

1682 # parse term label templates; if there are other similar kinds 

1683 # of templates in headers that you want to squash and apply as 

1684 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES 

1685 for templ_data in term_label_templates: 

1686 # print(templ_data) 

1687 expan = templ_data.get("expansion", "").strip("().,; ") 

1688 if not expan: 

1689 continue 

1690 tlb_tagsets, tlb_topics = decode_tags(expan) 

1691 for tlb_tags in tlb_tagsets: 

1692 if len(tlb_tags) > 0 and not any( 

1693 t.startswith("error-") for t in tlb_tags 

1694 ): 

1695 term_label_tags.extend(tlb_tags) 

1696 term_label_topics.extend(tlb_topics) 

1697 # print(f"{tlb_tagsets=}, {tlb_topicsets=}") 

1698 

1699 header_text = re.sub(r"\s+", " ", header_text) 

1700 # print(f"{header_text=}") 

1701 parse_word_head( 

1702 wxr, 

1703 pos_type, 

1704 header_text, 

1705 pos_data, 

1706 is_reconstruction, 

1707 header_group, 

1708 ruby=ruby, 

1709 links=links, 

1710 ) 

1711 if "tags" in pos_data: 

1712 # pos_data can get "tags" data from some source; type-checkers 

1713 # don't like it, so let's ignore it. 

1714 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1715 del pos_data["tags"] # type: ignore[typeddict-item] 

1716 if len(term_label_tags) > 0: 

1717 header_tags.extend(term_label_tags) 

1718 if len(term_label_topics) > 0: 

1719 header_topics.extend(term_label_topics) 

1720 

1721 def process_gloss_without_list( 

1722 nodes: list[Union[WikiNode, str]], 

1723 pos_type: str, 

1724 pos_data: WordData, 

1725 header_tags: list[str], 

1726 header_topics: list[str], 

1727 ) -> None: 

1728 # gloss text might not be inside a list 

1729 header_nodes: list[Union[str, WikiNode]] = [] 

1730 gloss_nodes: list[Union[str, WikiNode]] = [] 

1731 for node in strip_nodes(nodes): 

1732 if isinstance(node, WikiNode): 

1733 if isinstance(node, TemplateNode): 

1734 if node.template_name in ( 

1735 "zh-see", 

1736 "ja-see", 

1737 "ja-see-kango", 

1738 ): 

1739 continue # soft redirect 

1740 elif ( 

1741 node.template_name == "head" 

1742 or node.template_name.startswith(f"{lang_code}-") 

1743 ): 

1744 header_nodes.append(node) 

1745 continue 

1746 elif node.kind in LEVEL_KINDS: # following nodes are not gloss 1746 ↛ 1748line 1746 didn't jump to line 1748 because the condition on line 1746 was always true

1747 break 

1748 gloss_nodes.append(node) 

1749 

1750 if len(header_nodes) > 0: 

1751 process_gloss_header( 

1752 header_nodes, 

1753 pos_type, 

1754 None, 

1755 pos_data, 

1756 header_tags, 

1757 header_topics, 

1758 ) 

1759 if len(gloss_nodes) > 0: 

1760 process_gloss_contents( 

1761 gloss_nodes, 

1762 pos_type, 

1763 {"tags": list(header_tags), "topics": list(header_topics)}, 

1764 ) 

1765 

1766 def parse_sense_node( 

1767 node: Union[str, WikiNode], # never receives str 

1768 sense_base: SenseData, 

1769 pos: str, 

1770 ) -> bool: 

1771 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1772 Uses push_sense() to attempt adding data to pos_data in the scope 

1773 of parse_language() when it reaches deep in the recursion. push_sense() 

1774 returns True if it succeeds, and that is bubbled up the stack; if 

1775 a sense was added downstream, the higher levels (whose shared data 

1776 was already added by a subsense) do not push_sense(), unless they 

1777 have examples that need to be put somewhere. 

1778 """ 

1779 assert isinstance(sense_base, dict) # Its data is added to every sense deeper in the recursion 

1780 

1781 nonlocal sense_ordinal 

1782 my_ordinal = sense_ordinal # copies, not a reference 

1783 sense_ordinal += 1 # only use for sorting 

1784 

1785 if not isinstance(node, WikiNode): 1785 ↛ 1787line 1785 didn't jump to line 1787 because the condition on line 1785 was never true

1786 # This doesn't seem to ever happen in practice. 

1787 wxr.wtp.debug( 

1788 "{}: parse_sense_node called with" 

1789 "something that isn't a WikiNode".format(pos), 

1790 sortid="page/1287/20230119", 

1791 ) 

1792 return False 

1793 

1794 if node.kind != NodeKind.LIST_ITEM: 1794 ↛ 1795line 1794 didn't jump to line 1795 because the condition on line 1794 was never true

1795 wxr.wtp.debug( 

1796 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1797 ) 

1798 return False 

1799 

1800 if node.sarg == ":": 

1801 # Skip example entries at the highest level, ones without 

1802 # a sense ("...#") above them. 

1803 # If node.sarg is exactly and only ":", then it's at 

1804 # the highest level; lower levels would have more 

1805 # "indentation", like "#:" or "##:" 

1806 return False 

1807 

1808 # If a recursion call succeeds in push_sense(), bubble it up with 

1809 # `added`. 

1810 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1811 added = False 

1812 

1813 gloss_template_args: set[str] = set() 

1814 

1815 # For LISTs and LIST_ITEMS, their argument is something like 

1816 # "##" or "##:", and using that we can rudimentarily determine 

1817 # list 'depth' if need be, and also what kind of list or 

1818 # entry it is; # is for normal glosses, : for examples (indent) 

1819 # and * is used for quotations on wiktionary. 

1820 current_depth = node.sarg 
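# --- Illustrative sketch (not part of the measured source) ---
# What .sarg encodes for list items, per the comment above: the
# number of leading "#" gives the depth, and the trailing character
# distinguishes glosses, examples, and quotations. classify_sarg is
# a hypothetical helper used only for illustration.
def classify_sarg(sarg: str) -> tuple[int, str]:
    depth = len(sarg) - len(sarg.lstrip("#"))
    if sarg.endswith(":"):
        kind = "example"
    elif sarg.endswith("*"):
        kind = "quotation"
    else:
        kind = "gloss"
    return depth, kind

assert classify_sarg("#") == (1, "gloss")
assert classify_sarg("##") == (2, "gloss")
assert classify_sarg("##:") == (2, "example")
assert classify_sarg("#*") == (1, "quotation")
# --------------------------------------------------------------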

1821 

1822 children = node.children 

1823 

1824 # subentries, (presumably) a list 

1825 # of subglosses below this. The list's 

1826 # argument ends with #, and its depth should 

1827 # be bigger than parent node. 

1828 subentries = [ 

1829 x 

1830 for x in children 

1831 if isinstance(x, WikiNode) 

1832 and x.kind == NodeKind.LIST 

1833 and x.sarg == current_depth + "#" 

1834 ] 

1835 

1836 # sublists of examples and quotations. .sarg 

1837 # does not end with "#". 

1838 others = [ 

1839 x 

1840 for x in children 

1841 if isinstance(x, WikiNode) 

1842 and x.kind == NodeKind.LIST 

1843 and x.sarg != current_depth + "#" 

1844 ] 

1845 

1846 # the actual contents of this particular node. 

1847 # can be a gloss (or a template that expands into 

1848 # many glosses which we can't easily pre-expand) 

1849 # or could be an "outer gloss" with more specific 

1850 # subglosses, or could be a qualifier for the subglosses. 

1851 contents = [ 

1852 x 

1853 for x in children 

1854 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1855 ] 

1856 # If this entry has sublists of entries, we should combine 

1857 # gloss information from both the "outer" and sublist content. 

1858 # Sometimes the outer gloss 

1859 # is more non-gloss or tags, sometimes it is a coarse sense 

1860 # and the inner glosses are more specific. The outer one 

1861 # does not seem to have qualifiers. 

1862 

1863 # If we have one sublist with one element, treat it 

1864 # specially as it may be a Wiktionary error; raise 

1865 # that nested element to the same level. 

1866 # XXX If need be, this block can be easily removed in 

1867 # the current recursive logic, and the result is one sense entry 

1868 # with both glosses in the glosses list, as you would 

1869 # expect. If the higher entry has examples, there will 

1870 # be a higher entry with some duplicated data. 

1871 if len(subentries) == 1: 

1872 slc = subentries[0].children 

1873 if len(slc) == 1: 

1874 # copy current node and modify it so it doesn't 

1875 # loop infinitely. 

1876 cropped_node = copy.copy(node) 

1877 cropped_node.children = [ 

1878 x 

1879 for x in children 

1880 if not ( 

1881 isinstance(x, WikiNode) 

1882 and x.kind == NodeKind.LIST 

1883 and x.sarg == current_depth + "#" 

1884 ) 

1885 ] 

1886 added |= parse_sense_node(cropped_node, sense_base, pos) 

1887 nonlocal sense_data # without this kludge, raw_ 

1888 # glosses data gets duplicated; 

1889 # if the top-level (cropped_node) 

1890 # does not push_sense() properly or 

1891 # parse_sense_node() returns early, 

1892 # sense_data is not reset. This happens 

1893 # for example when you have a no-gloss 

1894 # string like "(intransitive)": 

1895 # no gloss, push_sense() returns early 

1896 # and sense_data has duplicate data with 

1897 # sense_base 

1898 sense_data = {} 

1899 added |= parse_sense_node(slc[0], sense_base, pos) 

1900 return added 

1901 

1902 return process_gloss_contents( 

1903 contents, 

1904 pos, 

1905 sense_base, 

1906 subentries, 

1907 others, 

1908 gloss_template_args, 

1909 added, 

1910 my_ordinal, 

1911 ) 

1912 

1913 def process_gloss_contents( 

1914 contents: list[Union[str, WikiNode]], 

1915 pos: str, 

1916 sense_base: SenseData, 

1917 subentries: list[WikiNode] = [], 

1918 others: list[WikiNode] = [], 

1919 gloss_template_args: Set[str] = set(), 

1920 added: bool = False, 

1921 sorting_ordinal: int | None = None, 

1922 ) -> bool: 

1923 def sense_template_fn( 

1924 name: str, ht: TemplateArgs, is_gloss: bool = False 

1925 ) -> Optional[str]: 

1926 # print(f"sense_template_fn: {name}, {ht}") 

1927 if name in wikipedia_templates: 

1928 # parse_wikipedia_template(wxr, pos_data, ht) 

1929 return None 

1930 if is_panel_template(wxr, name): 

1931 return "" 

1932 if name in INFO_TEMPLATE_FUNCS: 

1933 info_data, info_exp = parse_info_template_arguments( 

1934 wxr, name, ht, "sense" 

1935 ) 

1936 if info_data or info_exp: 1936 ↛ 1942line 1936 didn't jump to line 1942 because the condition on line 1936 was always true

1937 if info_data: 1937 ↛ 1939line 1937 didn't jump to line 1939 because the condition on line 1937 was always true

1938 data_append(sense_base, "info_templates", info_data) 

1939 if info_exp and isinstance(info_exp, str): 1939 ↛ 1941line 1939 didn't jump to line 1941 because the condition on line 1939 was always true

1940 return info_exp 

1941 return "" 

1942 if name in ("defdate",): 

1943 date = clean_node(wxr, None, ht.get(1, ())) 

1944 if part_two := ht.get(2): 1944 ↛ 1946line 1944 didn't jump to line 1946 because the condition on line 1944 was never true

1945 # Unicode mdash, not '-' 

1946 date += "–" + clean_node(wxr, None, part_two) 

1947 refs: dict[str, ReferenceData] = {} 

1948 # ref, refn, ref2, ref2n, ref3, ref3n 

1949 # ref1 not valid 

1950 for k, v in sorted( 

1951 (k, v) for k, v in ht.items() if isinstance(k, str) 

1952 ): 

1953 if m := re.match(r"ref(\d?)(n?)", k): 1953 ↛ 1950line 1953 didn't jump to line 1950 because the condition on line 1953 was always true

1954 ref_v = clean_node(wxr, None, v) 

1955 if m.group(1) not in refs: # empty string or digit 

1956 refs[m.group(1)] = ReferenceData() 

1957 if m.group(2): 

1958 refs[m.group(1)]["refn"] = ref_v 

1959 else: 

1960 refs[m.group(1)]["text"] = ref_v 

1961 data_append( 

1962 sense_base, 

1963 "attestations", 

1964 AttestationData(date=date, references=list(refs.values())), 

1965 ) 

1966 return "" 
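# --- Illustrative sketch (not part of the measured source) ---
# How the ref(\d?)(n?) grouping above pairs {{defdate}} arguments:
# "ref"/"refn" share the empty index, "ref2"/"ref2n" share "2", and
# so on. group_refs is a hypothetical stand-in that returns plain
# dicts instead of ReferenceData.
import re

def group_refs(args: dict[str, str]) -> dict[str, dict[str, str]]:
    refs: dict[str, dict[str, str]] = {}
    for k, v in sorted(args.items()):
        m = re.match(r"ref(\d?)(n?)", k)
        if m is None:
            continue
        entry = refs.setdefault(m.group(1), {})
        entry["refn" if m.group(2) else "text"] = v
    return refs

assert group_refs({"ref": "OED 1989", "refn": "oed", "ref2": "MW"}) == {
    "": {"text": "OED 1989", "refn": "oed"},
    "2": {"text": "MW"},
}
# --------------------------------------------------------------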

1967 if name == "senseid": 

1968 langid = clean_node(wxr, None, ht.get(1, ())) 

1969 arg = clean_node(wxr, sense_base, ht.get(2, ())) 

1970 if re.match(r"Q\d+$", arg): 

1971 data_append(sense_base, "wikidata", arg) 

1972 data_append(sense_base, "senseid", langid + ":" + arg) 

1973 if name in sense_linkage_templates: 

1974 # print(f"SENSE_TEMPLATE_FN: {name}") 

1975 parse_sense_linkage(wxr, sense_base, name, ht, pos) 

1976 return "" 

1977 if name == "†" or name == "zh-obsolete": 

1978 data_append(sense_base, "tags", "obsolete") 

1979 return "" 

1980 if name in { 

1981 "ux", 

1982 "uxi", 

1983 "usex", 

1984 "afex", 

1985 "prefixusex", 

1986 "ko-usex", 

1987 "ko-x", 

1988 "hi-x", 

1989 "ja-usex-inline", 

1990 "ja-x", 

1991 "quotei", 

1992 "he-x", 

1993 "hi-x", 

1994 "km-x", 

1995 "ne-x", 

1996 "shn-x", 

1997 "th-x", 

1998 "ur-x", 

1999 }: 

2000 # Usage examples are captured separately below. We don't 

2001 # want to expand them into glosses even when unusual coding 

2002 # is used in the entry. 

2003 # These templates may slip through inside another item, but 

2004 # currently we're separating out example entries (..#:) 

2005 # well enough that there seems to be very little contamination. 

2006 if is_gloss: 

2007 wxr.wtp.wiki_notice( 

2008 "Example template is used for gloss text", 

2009 sortid="extractor.en.page.sense_template_fn/1415", 

2010 ) 

2011 else: 

2012 return "" 

2013 if name == "w": 2013 ↛ 2014line 2013 didn't jump to line 2014 because the condition on line 2013 was never true

2014 if ht.get(2) == "Wp": 

2015 return "" 

2016 for k, v in ht.items(): 

2017 v = v.strip() 

2018 if v and "<" not in v: 

2019 gloss_template_args.add(v) 

2020 return None 

2021 

2022 def extract_link_texts(item: GeneralNode) -> None: 

2023 """Recursively extracts link texts from the gloss source. This 

2024 information is used to select whether to remove final "." from 

2025 form_of/alt_of (e.g., ihm/Hunsrik).""" 

2026 if isinstance(item, (list, tuple)): 

2027 for x in item: 

2028 extract_link_texts(x) 

2029 return 

2030 if isinstance(item, str): 

2031 # There seem to be HTML sections that may further contain 

2032 # unparsed links. 

2033 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2033 ↛ 2034line 2033 didn't jump to line 2034 because the loop on line 2033 never started

2034 # print("ITER:", m.group(0)) 

2035 v = m.group(1).split("|")[-1].strip() 

2036 if v: 

2037 gloss_template_args.add(v) 

2038 return 

2039 if not isinstance(item, WikiNode): 2039 ↛ 2040line 2039 didn't jump to line 2040 because the condition on line 2039 was never true

2040 return 

2041 if item.kind == NodeKind.LINK: 

2042 v = item.largs[-1] 

2043 if ( 2043 ↛ 2049line 2043 didn't jump to line 2049 because the condition on line 2043 was always true

2044 isinstance(v, list) 

2045 and len(v) == 1 

2046 and isinstance(v[0], str) 

2047 ): 

2048 gloss_template_args.add(v[0].strip()) 

2049 for x in item.children: 

2050 extract_link_texts(x) 

2051 

2052 extract_link_texts(contents) 

2053 

2054 # get the raw text of non-list contents of this node, and other stuff 

2055 # like tag and category data added to sense_base 

2056 # cast() is a no-op at runtime that only sets the type for the type-checker 

2057 partial_template_fn = cast( 

2058 TemplateFnCallable, 

2059 partial(sense_template_fn, is_gloss=True), 

2060 ) 

2061 rawgloss = clean_node( 

2062 wxr, 

2063 sense_base, 

2064 contents, 

2065 template_fn=partial_template_fn, 

2066 collect_links=True, 

2067 ) 

2068 

2069 if not rawgloss: 2069 ↛ 2070line 2069 didn't jump to line 2070 because the condition on line 2069 was never true

2070 return False 

2071 

2072 # remove manually typed ordered list text at the start ("1. ") 

2073 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 

2074 

2075 # get stuff like synonyms and categories from "others", 

2076 # maybe examples and quotations 

2077 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

2078 

2079 # The gloss could contain templates that produce more list items. 

2080 # This happens commonly with, e.g., {{inflection of|...}}. Split 

2081 # to parts. However, e.g. Interlingua generates multiple glosses 

2082 # in HTML directly without Wikitext markup, so we must also split 

2083 # by just newlines. 

2084 subglosses = rawgloss.splitlines() 

2085 

2086 if len(subglosses) == 0: 2086 ↛ 2087line 2086 didn't jump to line 2087 because the condition on line 2086 was never true

2087 return False 

2088 

2089 if any(s.startswith("#") for s in subglosses): 

2090 subtree = wxr.wtp.parse(rawgloss) 

2091 # from wikitextprocessor.parser import print_tree 

2092 # print("SUBTREE GENERATED BY TEMPLATE:") 

2093 # print_tree(subtree) 

2094 new_subentries = [ 

2095 x 

2096 for x in subtree.children 

2097 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2098 ] 

2099 

2100 new_others = [ 

2101 x 

2102 for x in subtree.children 

2103 if isinstance(x, WikiNode) 

2104 and x.kind == NodeKind.LIST 

2105 and not x.sarg.endswith("#") 

2106 ] 

2107 

2108 new_contents = [ 

2109 clean_node(wxr, [], x) 

2110 for x in subtree.children 

2111 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2112 ] 

2113 

2114 subentries = subentries or new_subentries 

2115 others = others or new_others 

2116 subglosses = new_contents 

2117 rawgloss = "".join(subglosses) 

2118 # Generate no gloss for translation hub pages, but add the 

2119 # "translation-hub" tag for them 

2120 if rawgloss == "(This entry is a translation hub.)": 2120 ↛ 2121line 2120 didn't jump to line 2121 because the condition on line 2120 was never true

2121 data_append(sense_data, "tags", "translation-hub") 

2122 return push_sense(sorting_ordinal) 

2123 

2124 # Remove certain substrings specific to outer glosses 

2125 strip_ends = [", particularly:"] 

2126 for x in strip_ends: 

2127 if rawgloss.endswith(x): 

2128 rawgloss = rawgloss[: -len(x)].strip() 

2129 break 

2130 

2131 # A single gloss, or possibly an outer gloss. 

2132 # Check if the possible outer gloss starts with 

2133 # parenthesized tags/topics 

2134 

2135 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 

2136 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

2137 m = QUALIFIERS_RE.match(rawgloss) 

2138 # (...): ... or (...(...)...): ... 

2139 if m: 

2140 q = m.group(1) 

2141 rawgloss = rawgloss[m.end() :].strip() 

2142 parse_sense_qualifier(wxr, q, sense_base) 
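# --- Illustrative sketch (not part of the measured source) ---
# The kind of match QUALIFIERS_RE is expected to produce here, per
# the "(...): ..." comment above. The pattern below is an assumption
# made for illustration only; it is not the actual QUALIFIERS_RE,
# which is defined elsewhere in this module.
import re

_QUALIFIERS_SKETCH = re.compile(
    r"^\(([^()]+(?:\([^()]*\)[^()]*)*)\)[:,]?\s*"
)

raw = "(transitive, informal) to give something a nudge"
m = _QUALIFIERS_SKETCH.match(raw)
assert m is not None and m.group(1) == "transitive, informal"
assert raw[m.end():] == "to give something a nudge"
# --------------------------------------------------------------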

2143 if rawgloss == "A pejorative:": 2143 ↛ 2144line 2143 didn't jump to line 2144 because the condition on line 2143 was never true

2144 data_append(sense_base, "tags", "pejorative") 

2145 rawgloss = "" 

2146 elif rawgloss == "Short forms.": 2146 ↛ 2147line 2146 didn't jump to line 2147 because the condition on line 2146 was never true

2147 data_append(sense_base, "tags", "abbreviation") 

2148 rawgloss = "" 

2149 elif rawgloss == "Technical or specialized senses.": 2149 ↛ 2150line 2149 didn't jump to line 2150 because the condition on line 2149 was never true

2150 rawgloss = "" 

2151 elif rawgloss.startswith("inflection of "): 

2152 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

2153 if parsed is not None: 2153 ↛ 2162line 2153 didn't jump to line 2162 because the condition on line 2153 was always true

2154 tags, origins = parsed 

2155 if origins is not None: 2155 ↛ 2157line 2155 didn't jump to line 2157 because the condition on line 2155 was always true

2156 data_extend(sense_base, "form_of", origins) 

2157 if tags is not None: 2157 ↛ 2160line 2157 didn't jump to line 2160 because the condition on line 2157 was always true

2158 data_extend(sense_base, "tags", tags) 

2159 else: 

2160 data_append(sense_base, "tags", "form-of") 

2161 else: 

2162 data_append(sense_base, "tags", "form-of") 

2163 if rawgloss: 2163 ↛ 2194line 2163 didn't jump to line 2194 because the condition on line 2163 was always true

2164 # Code duplicating a lot of clean-up operations from later in 

2165 # this block. We want to clean up the "supergloss" as much as 

2166 # possible, in almost the same way as a normal gloss. 

2167 supergloss = rawgloss 

2168 

2169 if supergloss.startswith("; "): 2169 ↛ 2170line 2169 didn't jump to line 2170 because the condition on line 2169 was never true

2170 supergloss = supergloss[1:].strip() 

2171 

2172 if supergloss.startswith(("^†", "†")): 

2173 data_append(sense_base, "tags", "obsolete") 

2174 supergloss = supergloss[2:].strip() 

2175 elif supergloss.startswith("^‡"): 2175 ↛ 2176line 2175 didn't jump to line 2176 because the condition on line 2175 was never true

2176 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

2177 supergloss = supergloss[2:].strip() 

2178 

2179 # remove [14th century...] style brackets at the end 

2180 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 

2181 

2182 if supergloss.startswith((",", ":")): 

2183 supergloss = supergloss[1:] 

2184 supergloss = supergloss.strip() 

2185 if supergloss.startswith("N. of "): 2185 ↛ 2186line 2185 didn't jump to line 2186 because the condition on line 2185 was never true

2186 supergloss = "Name of " + supergloss[6:] 

2187 supergloss = supergloss[2:] 

2188 data_append(sense_base, "glosses", supergloss) 

2189 if supergloss in ("A person:",): 

2190 data_append(sense_base, "tags", "g-person") 

2191 

2192 # The main recursive call (except for the exceptions at the 

2193 # start of this function). 

2194 for sublist in subentries: 

2195 if not ( 2195 ↛ 2198line 2195 didn't jump to line 2198 because the condition on line 2195 was never true

2196 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

2197 ): 

2198 wxr.wtp.debug( 

2199 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

2200 f"with items that are not LISTs", 

2201 sortid="page/1511/20230119", 

2202 ) 

2203 continue 

2204 for item in sublist.children: 

2205 if not ( 2205 ↛ 2209line 2205 didn't jump to line 2209 because the condition on line 2205 was never true

2206 isinstance(item, WikiNode) 

2207 and item.kind == NodeKind.LIST_ITEM 

2208 ): 

2209 continue 

2210 # copy sense_base to prevent cross-contamination between 

2211 # a subgloss, its sibling subglosses, and the supergloss 

2212 sense_base2 = copy.deepcopy(sense_base) 

2213 if parse_sense_node(item, sense_base2, pos): 2213 ↛ 2204line 2213 didn't jump to line 2204 because the condition on line 2213 was always true

2214 added = True 

2215 

2216 # Capture examples. 

2217 # This is called after the recursive calls above so that 

2218 # sense_base is not contaminated with meta-data from 

2219 # example entries for *this* gloss. 

2220 examples = [] 

2221 if wxr.config.capture_examples: 2221 ↛ 2225line 2221 didn't jump to line 2225 because the condition on line 2221 was always true

2222 examples = extract_examples(others, sense_base) 

2223 

2224 # push_sense() succeeded somewhere down-river, so skip this level 

2225 if added: 

2226 if examples: 

2227 # this higher-up gloss has examples that we do not want to skip 

2228 wxr.wtp.debug( 

2229 "'{}[...]' gloss has examples we want to keep, " 

2230 "but there are subglosses.".format(repr(rawgloss[:30])), 

2231 sortid="page/1498/20230118", 

2232 ) 

2233 else: 

2234 return True 

2235 

2236 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2237 # after the gloss, but these sentences don't seem to be intended 

2238 # as glosses. Skip them. 

2239 indexed_subglosses = list( 

2240 (i, gl) 

2241 for i, gl in enumerate(subglosses) 

2242 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2243 ) 

2244 

2245 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2245 ↛ 2246line 2245 didn't jump to line 2246 because the condition on line 2245 was never true

2246 gl = indexed_subglosses[0][1].strip() 

2247 if gl.endswith(":"): 

2248 gl = gl[:-1].strip() 

2249 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2250 if parsed is not None: 

2251 infl_tags, infl_dts = parsed 

2252 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2253 # Interpret others as a particular form under 

2254 # "inflection of" 

2255 data_extend(sense_base, "tags", infl_tags) 

2256 data_extend(sense_base, "form_of", infl_dts) 

2257 indexed_subglosses = indexed_subglosses[1:] 

2258 elif not infl_dts: 

2259 data_extend(sense_base, "tags", infl_tags) 

2260 indexed_subglosses = indexed_subglosses[1:] 

2261 

2262 # Create senses for remaining subglosses 

2263 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2264 gloss = gloss.strip() 

2265 if not gloss and len(indexed_subglosses) > 1: 2265 ↛ 2266line 2265 didn't jump to line 2266 because the condition on line 2265 was never true

2266 continue 

2267 # Push a new sense (if the last one is not empty) 

2268 if push_sense(sorting_ordinal): 2268 ↛ 2269line 2268 didn't jump to line 2269 because the condition on line 2268 was never true

2269 added = True 

2270 # if gloss not in sense_data.get("raw_glosses", ()): 

2271 # data_append(sense_data, "raw_glosses", gloss) 

2272 if i == 0 and examples: 

2273 # In a multi-line gloss, associate examples 

2274 # with only one of them. 

2275 # XXX or you could use gloss_i == len(indexed_subglosses) 

2276 # to associate examples with the *last* one. 

2277 data_extend(sense_data, "examples", examples) 

2278 if gloss.startswith("; ") and gloss_i > 0: 2278 ↛ 2279line 2278 didn't jump to line 2279 because the condition on line 2278 was never true

2279 gloss = gloss[1:].strip() 

2280 # If the gloss starts with †, mark as obsolete 

2281 if gloss.startswith("^†"): 2281 ↛ 2282line 2281 didn't jump to line 2282 because the condition on line 2281 was never true

2282 data_append(sense_data, "tags", "obsolete") 

2283 gloss = gloss[2:].strip() 

2284 elif gloss.startswith("^‡"): 2284 ↛ 2285line 2284 didn't jump to line 2285 because the condition on line 2284 was never true

2285 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2286 gloss = gloss[2:].strip() 

2287 # Copy data for all senses to this sense 

2288 for k, v in sense_base.items(): 

2289 if isinstance(v, (list, tuple)): 

2290 if k != "tags": 

2291 # Tags handled below (countable/uncountable special) 

2292 data_extend(sense_data, k, v) 

2293 else: 

2294 assert k not in ("tags", "categories", "topics") 

2295 sense_data[k] = v # type:ignore[literal-required] 

2296 # Parse the gloss for this particular sense 

2297 m = QUALIFIERS_RE.match(gloss) 

2298 # (...): ... or (...(...)...): ... 

2299 if m: 

2300 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2301 gloss = gloss[m.end() :].strip() 

2302 

2303 # Remove common suffix "[from 14th c.]" and similar 

2304 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2305 

2306 # Check to make sure we don't have unhandled list items in gloss 

2307 ofs = max(gloss.find("#"), gloss.find("* ")) 

2308 if ofs > 10 and "(#)" not in gloss: 

2309 wxr.wtp.debug( 

2310 "gloss may contain unhandled list items: {}".format(gloss), 

2311 sortid="page/1412", 

2312 ) 

2313 elif "\n" in gloss: 2313 ↛ 2314line 2313 didn't jump to line 2314 because the condition on line 2313 was never true

2314 wxr.wtp.debug( 

2315 "gloss contains newline: {}".format(gloss), 

2316 sortid="page/1416", 

2317 ) 

2318 

2319 # Kludge, some glosses have a comma after initial qualifiers in 

2320 # parentheses 

2321 if gloss.startswith((",", ":")): 

2322 gloss = gloss[1:] 

2323 gloss = gloss.strip() 

2324 if gloss.endswith(":"): 

2325 gloss = gloss[:-1].strip() 

2326 if gloss.startswith("N. of "): 2326 ↛ 2327line 2326 didn't jump to line 2327 because the condition on line 2326 was never true

2327 gloss = "Name of " + gloss[6:] 

2328 if gloss.startswith("†"): 2328 ↛ 2329line 2328 didn't jump to line 2329 because the condition on line 2328 was never true

2329 data_append(sense_data, "tags", "obsolete") 

2330 gloss = gloss[1:] 

2331 elif gloss.startswith("^†"): 2331 ↛ 2332line 2331 didn't jump to line 2332 because the condition on line 2331 was never true

2332 data_append(sense_data, "tags", "obsolete") 

2333 gloss = gloss[2:] 

2334 

2335 # Copy tags from sense_base if any. This will not copy 

2336 # countable/uncountable if either was specified in the sense, 

2337 # as sometimes both are specified in word head but only one 

2338 # in individual senses. 

2339 countability_tags = [] 

2340 base_tags = sense_base.get("tags", ()) 

2341 sense_tags = sense_data.get("tags", ()) 

2342 for tag in base_tags: 

2343 if tag in ("countable", "uncountable"): 

2344 if tag not in countability_tags: 2344 ↛ 2346line 2344 didn't jump to line 2346 because the condition on line 2344 was always true

2345 countability_tags.append(tag) 

2346 continue 

2347 if tag not in sense_tags: 

2348 data_append(sense_data, "tags", tag) 

2349 if countability_tags: 

2350 if ( 2350 ↛ 2359line 2350 didn't jump to line 2359 because the condition on line 2350 was always true

2351 "countable" not in sense_tags 

2352 and "uncountable" not in sense_tags 

2353 ): 

2354 data_extend(sense_data, "tags", countability_tags) 
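# --- Illustrative sketch (not part of the measured source) ---
# The countable/uncountable merge rule above, restated over plain
# lists: word-head tags are copied into the sense, but the
# countability pair is copied only when the sense itself specifies
# neither. merge_tags is a hypothetical name.
def merge_tags(base_tags: list[str], sense_tags: list[str]) -> list[str]:
    merged = list(sense_tags)
    countability = [t for t in base_tags if t in ("countable", "uncountable")]
    for t in base_tags:
        if t in ("countable", "uncountable"):
            continue
        if t not in merged:
            merged.append(t)
    if countability and not any(
        t in ("countable", "uncountable") for t in merged
    ):
        merged.extend(countability)
    return merged

assert merge_tags(["countable", "informal"], ["uncountable"]) == [
    "uncountable",
    "informal",
]
assert merge_tags(["countable", "informal"], []) == ["informal", "countable"]
# --------------------------------------------------------------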

2355 

2356 # If outer gloss specifies a form-of ("inflection of", see 

2357 # aquamarine/German), try to parse the inner glosses as 

2358 # tags for an inflected form. 

2359 if "form-of" in sense_base.get("tags", ()): 

2360 parsed = parse_alt_or_inflection_of( 

2361 wxr, gloss, gloss_template_args 

2362 ) 

2363 if parsed is not None: 2363 ↛ 2369line 2363 didn't jump to line 2369 because the condition on line 2363 was always true

2364 infl_tags, infl_dts = parsed 

2365 if not infl_dts and infl_tags: 2365 ↛ 2369line 2365 didn't jump to line 2369 because the condition on line 2365 was always true

2366 # Interpret as a particular form under "inflection of" 

2367 data_extend(sense_data, "tags", infl_tags) 

2368 

2369 if not gloss: 2369 ↛ 2370line 2369 didn't jump to line 2370 because the condition on line 2369 was never true

2370 data_append(sense_data, "tags", "empty-gloss") 

2371 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 

2372 if ( 2372 ↛ 2383line 2372 didn't jump to line 2383 because the condition on line 2372 was always true

2373 gloss_i == 0 

2374 and len(sense_data.get("glosses", tuple())) >= 1 

2375 ): 

2376 # If we added a "high-level gloss" from rawgloss, but this 

2377 # is that same gloss_i, add this instead of the raw_gloss 

2378 # from before if they're different: the rawgloss was not 

2379 # cleaned exactly the same as this later gloss 

2380 sense_data["glosses"][-1] = gloss 

2381 else: 

2382 # Add the gloss for the sense. 

2383 data_append(sense_data, "glosses", gloss) 

2384 

2385 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2386 # two abbreviations in the same sense, both generated by the 

2387 # {{abbreviation of|...}} template. Handle these with some magic. 

2388 position = 0 

2389 split_glosses = [] 

2390 for m in re.finditer(r"Abbreviation of ", gloss): 

2391 if m.start() != position: 2391 ↛ 2390line 2391 didn't jump to line 2390 because the condition on line 2391 was always true

2392 split_glosses.append(gloss[position : m.start()]) 

2393 position = m.start() 

2394 split_glosses.append(gloss[position:]) 
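# --- Illustrative sketch (not part of the measured source) ---
# What the "Abbreviation of" split above produces, rerun on a
# made-up gloss; split_abbreviations is a hypothetical name for the
# same logic.
import re

def split_abbreviations(gloss: str) -> list[str]:
    position = 0
    parts: list[str] = []
    for m in re.finditer(r"Abbreviation of ", gloss):
        if m.start() != position:
            parts.append(gloss[position : m.start()])
            position = m.start()
    parts.append(gloss[position:])
    return parts

assert split_abbreviations(
    "Abbreviation of foobar. Abbreviation of foo barrel."
) == ["Abbreviation of foobar. ", "Abbreviation of foo barrel."]
# --------------------------------------------------------------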

2395 for gloss in split_glosses: 

2396 # Check if this gloss describes an alt-of or inflection-of 

2397 if ( 

2398 lang_code != "en" 

2399 and " " not in gloss 

2400 and distw([word], gloss) < 0.3 

2401 ): 

2402 # Don't try to parse gloss if it is one word 

2403 # that is close to the word itself for non-English words 

2404 # (probable translations of a tag/form name) 

2405 continue 

2406 parsed = parse_alt_or_inflection_of( 

2407 wxr, gloss, gloss_template_args 

2408 ) 

2409 if parsed is None: 

2410 continue 

2411 tags, dts = parsed 

2412 if not dts and tags: 

2413 data_extend(sense_data, "tags", tags) 

2414 continue 

2415 for dt in dts: # type:ignore[union-attr] 

2416 ftags = list(tag for tag in tags if tag != "form-of") 

2417 if "alt-of" in tags: 

2418 data_extend(sense_data, "tags", ftags) 

2419 data_append(sense_data, "alt_of", dt) 

2420 elif "compound-of" in tags: 2420 ↛ 2421line 2420 didn't jump to line 2421 because the condition on line 2420 was never true

2421 data_extend(sense_data, "tags", ftags) 

2422 data_append(sense_data, "compound_of", dt) 

2423 elif "synonym-of" in tags: 2423 ↛ 2424line 2423 didn't jump to line 2424 because the condition on line 2423 was never true

2424 data_extend(dt, "tags", ftags) 

2425 data_append(sense_data, "synonyms", dt) 

2426 elif tags and dt.get("word", "").startswith("of "): 2426 ↛ 2427line 2426 didn't jump to line 2427 because the condition on line 2426 was never true

2427 dt["word"] = dt["word"][3:] 

2428 data_append(sense_data, "tags", "form-of") 

2429 data_extend(sense_data, "tags", ftags) 

2430 data_append(sense_data, "form_of", dt) 

2431 elif "form-of" in tags: 2431 ↛ 2415line 2431 didn't jump to line 2415 because the condition on line 2431 was always true

2432 data_extend(sense_data, "tags", tags) 

2433 data_append(sense_data, "form_of", dt) 

2434 

2435 if len(sense_data) == 0: 

2436 if len(sense_base.get("tags", [])) == 0: 2436 ↛ 2438line 2436 didn't jump to line 2438 because the condition on line 2436 was always true

2437 del sense_base["tags"] 

2438 sense_data.update(sense_base) 

2439 if push_sense(sorting_ordinal): 2439 ↛ 2443line 2439 didn't jump to line 2443 because the condition on line 2439 was always true

2440 # push_sense succeeded in adding a sense to pos_data 

2441 added = True 

2442 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2443 return added 

2444 

2445 def parse_inflection( 

2446 node: WikiNode, section: str, pos: Optional[str] 

2447 ) -> None: 

2448 """Parses inflection data (declension, conjugation) from the given 

2449 page. This retrieves the actual inflection template 

2450 parameters, which are very useful for applications that need 

2451 to learn the inflection classes and generate inflected 

2452 forms.""" 

2453 assert isinstance(node, WikiNode) 

2454 assert isinstance(section, str) 

2455 assert pos is None or isinstance(pos, str) 

2456 # print("parse_inflection:", node) 

2457 

2458 if pos is None: 2458 ↛ 2459line 2458 didn't jump to line 2459 because the condition on line 2458 was never true

2459 wxr.wtp.debug( 

2460 "inflection table outside part-of-speech", sortid="page/1812" 

2461 ) 

2462 return 

2463 

2464 def inflection_template_fn( 

2465 name: str, ht: TemplateArgs 

2466 ) -> Optional[str]: 

2467 # print("decl_conj_template_fn", name, ht) 

2468 if is_panel_template(wxr, name): 2468 ↛ 2469line 2468 didn't jump to line 2469 because the condition on line 2468 was never true

2469 return "" 

2470 if name in ("is-u-mutation",): 2470 ↛ 2473line 2470 didn't jump to line 2473 because the condition on line 2470 was never true

2471 # These are not to be captured as an exception to the 

2472 # generic code below 

2473 return None 

2474 m = re.search( 

2475 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2476 r"declension|inflection|mut|mutation)($|-)", 

2477 name, 

2478 ) 

2479 if m: 

2480 args_ht = clean_template_args(wxr, ht) 

2481 dt = {"name": name, "args": args_ht} 

2482 data_append(pos_data, "inflection_templates", dt) 

2483 

2484 return None 

2485 

2486 # Convert the subtree back to Wikitext, then expand all and parse, 

2487 # capturing templates in the process 

2488 text = wxr.wtp.node_to_wikitext(node.children) 

2489 

2490 # Split text into separate sections for each top-level template 

2491 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text) 

2492 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"] 

2493 # The (?:...) creates a non-capturing regex group; if it was capturing, 

2494 # like the group around it, it would create elements in brace_matches, 

2495 # including None if it doesn't match. 

2496 # 20250114: Added {| and |} into the regex because tables were being 

2497 # cut into pieces by this code. Issue #973, introduction of two-part 

2498 # book-end templates similar to trans-top and trans-bottom. 

2499 template_sections = [] 

2500 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2501 # Because there is the possibility of triple curly braces 

2502 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2503 # count nesting depth using pairs of two brackets, but 

2504 # instead use singular braces ("{ }"). 

2505 # Because template delimiters should be balanced, regardless 

2506 # of whether {{ or {{{ is used, and because we only care 

2507 # about the outer-most delimiters (the highest level template) 

2508 # we can just count the single braces when those single 

2509 # braces are part of a group. 

2510 table_nesting = 0 

2511 # However, a stray table ({| ... |}) should always be its own 

2512 # section, and it should prevent templates from cutting it 

2513 # into sections. 
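# --- Illustrative sketch (not part of the measured source) ---
# Why counting single braces works for the splitting below: both
# "{{...}}" and "{{{...}}}" are balanced in their "{" count, so a
# running total that returns to zero marks the end of the
# outermost template no matter which delimiter was used.
# top_level_spans is a hypothetical demonstration helper, not the
# splitting code itself.
import re

def top_level_spans(text: str) -> list[str]:
    spans, depth, start = [], 0, None
    for m in re.finditer(r"\{+|\}+", text):
        if m.group().startswith("{"):
            if depth == 0:
                start = m.start()
            depth += len(m.group())
        else:
            depth -= len(m.group())
            if depth == 0 and start is not None:
                spans.append(text[start : m.end()])
                start = None
    return spans

assert top_level_spans("{{t|{{{1|x}}}}} tail {{u}}") == [
    "{{t|{{{1|x}}}}}",
    "{{u}}",
]
# --------------------------------------------------------------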

2514 

2515 # print(f"Parse inflection: {text=}") 

2516 # print(f"Brace matches: {repr('///'.join(brace_matches))}") 

2517 if len(brace_matches) > 1: 

2518 tsection: list[str] = [] 

2519 after_templates = False # kludge to keep any text 

2520 # before first template 

2521 # with the first template; 

2522 # otherwise, text 

2523 # goes with preceding template 

2524 for m in brace_matches: 

2525 if m.startswith("\n; ") and after_templates: 2525 ↛ 2526line 2525 didn't jump to line 2526 because the condition on line 2525 was never true

2526 after_templates = False 

2527 template_sections.append(tsection) 

2528 tsection = [] 

2529 tsection.append(m) 

2530 elif m.startswith("{{") or m.endswith("{|"): 

2531 if ( 

2532 template_nesting == 0 

2533 and after_templates 

2534 and table_nesting == 0 

2535 ): 

2536 template_sections.append(tsection) 

2537 tsection = [] 

2538 # start new section 

2539 after_templates = True 

2540 if m.startswith("{{"): 

2541 template_nesting += 1 

2542 else: 

2543 # m.endswith("{|") 

2544 table_nesting += 1 

2545 tsection.append(m) 

2546 elif m.startswith("}}") or m.endswith("|}"): 

2547 if m.startswith("}}"): 

2548 template_nesting -= 1 

2549 if template_nesting < 0: 2549 ↛ 2550line 2549 didn't jump to line 2550 because the condition on line 2549 was never true

2550 wxr.wtp.error( 

2551 "Negatively nested braces, " 

2552 "couldn't split inflection templates, " 

2553 "{}/{} section {}".format( 

2554 word, language, section 

2555 ), 

2556 sortid="page/1871", 

2557 ) 

2558 template_sections = [] # use whole text 

2559 break 

2560 else: 

2561 table_nesting -= 1 

2562 if table_nesting < 0: 2562 ↛ 2563line 2562 didn't jump to line 2563 because the condition on line 2562 was never true

2563 wxr.wtp.error( 

2564 "Negatively nested table braces, " 

2565 "couldn't split inflection section, " 

2566 "{}/{} section {}".format( 

2567 word, language, section 

2568 ), 

2569 sortid="page/20250114", 

2570 ) 

2571 template_sections = [] # use whole text 

2572 break 

2573 tsection.append(m) 

2574 else: 

2575 tsection.append(m) 

2576 if tsection: # dangling tsection 2576 ↛ 2584line 2576 didn't jump to line 2584 because the condition on line 2576 was always true

2577 template_sections.append(tsection) 

2578 # Why do it this way around? The parser has a preference 

2579 # to associate bits outside of tables with the preceding 

2580 # table (`after`-variable), so a new tsection begins 

2581 # at {{ and everything before it belongs to the previous 

2582 # template. 

2583 

2584 texts = [] 

2585 if not template_sections: 

2586 texts = [text] 

2587 else: 

2588 for tsection in template_sections: 

2589 texts.append("".join(tsection)) 

2590 if template_nesting != 0: 2590 ↛ 2591line 2590 didn't jump to line 2591 because the condition on line 2590 was never true

2591 wxr.wtp.error( 

2592 "Template nesting error: " 

2593 "template_nesting = {} " 

2594 "couldn't split inflection templates, " 

2595 "{}/{} section {}".format( 

2596 template_nesting, word, language, section 

2597 ), 

2598 sortid="page/1896", 

2599 ) 

2600 texts = [text] 

2601 for text in texts: 

2602 tree = wxr.wtp.parse( 

2603 text, expand_all=True, template_fn=inflection_template_fn 

2604 ) 

2605 

2606 if not text.strip(): 

2607 continue 

2608 

2609 # Parse inflection tables from the section. The data is stored 

2610 # under "forms". 

2611 if wxr.config.capture_inflections: 2611 ↛ 2601line 2611 didn't jump to line 2601 because the condition on line 2611 was always true

2612 tablecontext = None 

2613 m = re.search(r"{{([^}{|]+)\|?", text) 

2614 if m: 

2615 template_name = m.group(1) 

2616 tablecontext = TableContext(template_name) 

2617 

2618 parse_inflection_section( 

2619 wxr, 

2620 pos_data, 

2621 word, 

2622 language, 

2623 pos, 

2624 section, 

2625 tree, 

2626 tablecontext=tablecontext, 

2627 ) 

2628 

2629 def get_subpage_section( 

2630 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]] 

2631 ) -> Optional[Union[WikiNode, str]]: 

2632 """Loads a subpage of the given page, and finds the section 

2633 for the given language, part-of-speech, and section title. This 

2634 is used for finding translations and other sections on subpages.""" 

2635 assert isinstance(language, str) 

2636 assert isinstance(title, str) 

2637 assert isinstance(subtitle, str) 

2638 assert isinstance(seqs, (list, tuple)) 

2639 for seq in seqs: 

2640 for x in seq: 

2641 assert isinstance(x, str) 

2642 subpage_title = word + "/" + subtitle 

2643 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2644 if subpage_content is None: 

2645 wxr.wtp.error( 

2646 "/translations not found despite " 

2647 "{{see translation subpage|...}}", 

2648 sortid="page/1934", 

2649 ) 

2650 return None 

2651 

2652 def recurse( 

2653 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2654 ) -> Optional[Union[str, WikiNode]]: 

2655 # print(f"seq: {seq}") 

2656 if not seq: 

2657 return node 

2658 if not isinstance(node, WikiNode): 

2659 return None 

2660 # print(f"node.kind: {node.kind}") 

2661 if node.kind in LEVEL_KINDS: 

2662 t = clean_node(wxr, None, node.largs[0]) 

2663 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2664 if t.lower() == seq[0].lower(): 

2665 seq = seq[1:] 

2666 if not seq: 

2667 return node 

2668 for n in node.children: 

2669 ret = recurse(n, seq) 

2670 if ret is not None: 

2671 return ret 

2672 return None 

2673 

2674 tree = wxr.wtp.parse( 

2675 subpage_content, 

2676 pre_expand=True, 

2677 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2678 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2679 ) 

2680 assert tree.kind == NodeKind.ROOT 

2681 for seq in seqs: 

2682 ret = recurse(tree, seq) 

2683 if ret is None: 

2684 wxr.wtp.debug( 

2685 "Failed to find subpage section {}/{} seq {}".format( 

2686 title, subtitle, seq 

2687 ), 

2688 sortid="page/1963", 

2689 ) 

2690 return ret 
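# --- Illustrative sketch (not part of the measured source) ---
# The lookup above consumes one title from `seq` whenever it enters
# a matching heading level and returns the node once `seq` is
# exhausted. A hedged restatement over plain nested dicts;
# find_section and the sample page tree are hypothetical.
def find_section(tree: dict, seq: list[str]):
    if not seq:
        return tree
    for title, child in tree.get("sections", {}).items():
        rest = seq[1:] if title.lower() == seq[0].lower() else seq
        found = find_section(child, rest)
        if found is not None:
            return found
    return None

page = {
    "sections": {
        "Finnish": {
            "sections": {
                "Noun": {"sections": {"Translations": {"data": "..."}}},
            },
        },
    },
}
assert find_section(page, ["finnish", "noun", "translations"]) == {"data": "..."}
# --------------------------------------------------------------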

2691 

2692 def parse_linkage( 

2693 data: WordData, field: str, linkagenode: LevelNode 

2694 ) -> None: 

2695 assert isinstance(data, dict) 

2696 assert isinstance(field, str) 

2697 assert isinstance(linkagenode, WikiNode) 

2698 # if field == "synonyms": 

2699 # print("field", field) 

2700 # print("data", data) 

2701 # print("children:") 

2702 # print(linkagenode.children) 

2703 if not wxr.config.capture_linkages: 2703 ↛ 2704line 2703 didn't jump to line 2704 because the condition on line 2703 was never true

2704 return 

2705 have_panel_template = False 

2706 toplevel_text = [] 

2707 next_navframe_sense = None # Used for "(sense):" before NavFrame 

2708 

2709 def parse_linkage_item( 

2710 contents: list[Union[str, WikiNode]], 

2711 field: str, 

2712 sense: Optional[str] = None, 

2713 ): 

2714 assert isinstance(contents, (list, tuple)) 

2715 assert isinstance(field, str) 

2716 assert sense is None or isinstance(sense, str) 

2717 

2718 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

2719 # .format(field, sense, contents)) 

2720 

2721 parts: list[str] = [] 

2722 ruby: list[tuple[str, str]] = [] 

2723 urls: list[str] = [] 

2724 # data about link text; this is used to skip splitting on 

2725 # linkage text items that contain stuff like commas; for 

2726 # example "Hunde, die bellen, beißen nicht" in the article 

2727 # beißen would otherwise be split into "Hunde", "die bellen" etc. 

2728 # We take that link text and use it, eventually, 

2729 # in split_at_comma_semi to skip splitting on those 

2730 # commas. 

2731 links_that_should_not_be_split: list[str] = [] 

2732 

2733 def item_recurse( 

2734 contents: list[Union[str, WikiNode]], italic=False 

2735 ) -> None: 

2736 assert isinstance(contents, (list, tuple)) 

2737 nonlocal sense 

2738 nonlocal ruby 

2739 nonlocal parts 

2740 # print("ITEM_RECURSE:", contents) 

2741 for node in contents: 

2742 if isinstance(node, str): 

2743 parts.append(node) 

2744 continue 

2745 kind = node.kind 

2746 # print("ITEM_RECURSE KIND:", kind, 

2747 # node.sarg if node.sarg else node.largs) 

2748 if is_list_item(node): 2748 ↛ 2749line 2748 didn't jump to line 2749 because the condition on line 2748 was never true

2749 if parts: 

2750 sense1: Optional[str] 

2751 sense1 = clean_node(wxr, None, parts) 

2752 if sense1.endswith(":"): 

2753 sense1 = sense1[:-1].strip() 

2754 if sense1.startswith("(") and sense1.endswith(")"): 

2755 sense1 = sense1[1:-1].strip() 

2756 if sense1.lower() == TRANSLATIONS_TITLE: 

2757 sense1 = None 

2758 # print("linkage item_recurse LIST sense1:", sense1) 

2759 parse_linkage_recurse( 

2760 node.children, field, sense=sense1 or sense 

2761 ) 

2762 parts = [] 

2763 else: 

2764 parse_linkage_recurse(node.children, field, sense) 

2765 elif kind in ( 2765 ↛ 2770line 2765 didn't jump to line 2770 because the condition on line 2765 was never true

2766 NodeKind.TABLE, 

2767 NodeKind.TABLE_ROW, 

2768 NodeKind.TABLE_CELL, 

2769 ): 

2770 parse_linkage_recurse(node.children, field, sense) 

2771 elif kind in ( 2771 ↛ 2775line 2771 didn't jump to line 2775 because the condition on line 2771 was never true

2772 NodeKind.TABLE_HEADER_CELL, 

2773 NodeKind.TABLE_CAPTION, 

2774 ): 

2775 continue 

2776 elif kind == NodeKind.HTML: 2776 ↛ 2777line 2776 didn't jump to line 2777 because the condition on line 2776 was never true

2777 classes = (node.attrs.get("class") or "").split() 

2778 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2779 continue 

2780 elif node.sarg == "ruby": 

2781 rb = parse_ruby(wxr, node) 

2782 if rb: 

2783 ruby.append(rb) 

2784 parts.append(rb[0]) 

2785 continue 

2786 elif node.sarg == "math": 

2787 parts.append(clean_node(wxr, None, node)) 

2788 continue 

2789 elif "interProject" in classes: 

2790 continue # These do not seem to be displayed 

2791 if "NavFrame" in classes: 

2792 parse_linkage_recurse(node.children, field, sense) 

2793 else: 

2794 item_recurse(node.children, italic=italic) 

2795 elif kind == NodeKind.ITALIC: 

2796 item_recurse(node.children, italic=True) 

2797 elif kind == NodeKind.LINK: 

2798 ignore = False 

2799 if isinstance(node.largs[0][0], str): 2799 ↛ 2741line 2799 didn't jump to line 2741 because the condition on line 2799 was always true

2800 v1 = node.largs[0][0].strip().lower() 

2801 if v1.startswith( 2801 ↛ 2805line 2801 didn't jump to line 2805 because the condition on line 2801 was never true

2802 ns_title_prefix_tuple(wxr, "Category", True) 

2803 + ns_title_prefix_tuple(wxr, "File", True) 

2804 ): 

2805 ignore = True 

2806 if not ignore: 2806 ↛ 2741line 2806 didn't jump to line 2741 because the condition on line 2806 was always true

2807 v = node.largs[-1] 

2808 if ( 

2809 len(node.largs) == 1 

2810 and len(v) > 0 

2811 and isinstance(v[0], str) 

2812 and v[0][0] == ":" 

2813 ): 

2814 v = [v[0][1:]] + list(v[1:]) # type:ignore 

2815 if isinstance(v[0], str) and not v[0].isalnum(): 

2816 links_that_should_not_be_split.append( 

2817 "".join(v[0]) 

2818 ) # type: ignore 

2819 item_recurse(v, italic=italic) 

2820 elif kind == NodeKind.URL: 

2821 if len(node.largs) < 2 and node.largs: 

2822 # Naked url captured 

2823 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

2824 continue 

2825 if len(node.largs) == 2: 2825 ↛ 2830line 2825 didn't jump to line 2830 because the condition on line 2825 was always true

2826 # Url from link with text 

2827 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

2828 # print(f"{node.largs=!r}") 

2829 # print("linkage recurse URL {}".format(node)) 

2830 item_recurse(node.largs[-1], italic=italic) 

2831 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 

2832 item_recurse(node.children, italic=italic) 

2833 else: 

2834 wxr.wtp.debug( 

2835 "linkage item_recurse unhandled {}: {}".format( 

2836 node.kind, node 

2837 ), 

2838 sortid="page/2073", 

2839 ) 

2840 

2841 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

2842 # .format(contents)) 

2843 

2844 item_recurse(contents) 

2845 item = clean_node(wxr, None, parts) 

2846 # print("LINKAGE ITEM CONTENTS:", parts) 

2847 # print("CLEANED ITEM: {!r}".format(item)) 

2848 # print(f"URLS {urls=!r}") 

2849 

2850 return parse_linkage_item_text( 

2851 wxr, 

2852 word, 

2853 data, 

2854 field, 

2855 item, 

2856 sense, 

2857 ruby, 

2858 sense_datas, 

2859 is_reconstruction, 

2860 urls or None, 

2861 links_that_should_not_be_split or None, 

2862 ) 

2863 

2864 def parse_linkage_recurse( 

2865 contents: list[Union[WikiNode, str]], 

2866 field: str, 

2867 sense: Optional[str], 

2868 ) -> None: 

2869 assert isinstance(contents, (list, tuple)) 

2870 assert sense is None or isinstance(sense, str) 

2871 nonlocal next_navframe_sense 

2872 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

2873 for node in contents: 

2874 if isinstance(node, str): 

2875 # Ignore top-level text, generally comments before the 

2876 # linkages list. However, if no linkages are found, then 

2877 # use this for linkages (not all words use bullet points 

2878 # for linkages). 

2879 toplevel_text.append(node) 

2880 continue 

2881 assert isinstance(node, WikiNode) 

2882 kind = node.kind 

2883 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

2884 if is_list(node): 

2885 parse_linkage_recurse(node.children, field, sense) 

2886 elif is_list_item(node): 

2887 v = parse_linkage_item(node.children, field, sense) 

2888 if v: 2888 ↛ 2892line 2888 didn't jump to line 2892 because the condition on line 2888 was never true

2889 # parse_linkage_item() can return a value that should 

2890 # be used as the sense for the follow-on linkages, 

2891 # which are typically provided in a table (see 滿) 

2892 next_navframe_sense = v 

2893 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

2894 parse_linkage_recurse(node.children, field, sense) 

2895 elif kind == NodeKind.TABLE_CELL: 

2896 parse_linkage_item(node.children, field, sense) 

2897 elif kind in ( 

2898 NodeKind.TABLE_CAPTION, 

2899 NodeKind.TABLE_HEADER_CELL, 

2900 NodeKind.PREFORMATTED, 

2901 NodeKind.BOLD, 

2902 ): 

2903 continue 

2904 elif kind == NodeKind.HTML: 2904 ↛ 2906line 2904 didn't jump to line 2906 because the condition on line 2904 was never true

2905 # Recurse to process inside the HTML for most tags 

2906 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2907 continue 

2908 classes = (node.attrs.get("class") or "").split() 

2909 if node.sarg == "li": 

2910 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑ 

2911 v = parse_linkage_item(node.children, field, sense) 

2912 if v: 

2913 next_navframe_sense = v 

2914 elif "qualifier-content" in classes: 

2915 sense1 = clean_node(wxr, None, node.children) 

2916 if sense1.endswith(":"): 

2917 sense1 = sense1[:-1].strip() 

2918 if sense and sense1: 

2919 wxr.wtp.debug( 

2920 "linkage qualifier-content on multiple " 

2921 "levels: {!r} and {!r}".format(sense, sense1), 

2922 sortid="page/2170", 

2923 ) 

2924 parse_linkage_recurse(node.children, field, sense1) 

2925 elif "NavFrame" in classes: 

2926 # NavFrame uses previously assigned next_navframe_sense 

2927 # (from a "(sense):" item) and clears it afterwards 

2928 parse_linkage_recurse( 

2929 node.children, field, sense or next_navframe_sense 

2930 ) 

2931 next_navframe_sense = None 

2932 else: 

2933 parse_linkage_recurse(node.children, field, sense) 

2934 elif kind in LEVEL_KINDS: 2934 ↛ 2936line 2934 didn't jump to line 2936 because the condition on line 2934 was never true

2935 # Just recurse to any possible subsections 

2936 parse_linkage_recurse(node.children, field, sense) 

2937 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

2938 # Skip these on top level; at least sometimes bold is 

2939 # used for indicating a subtitle 

2940 continue 

2941 elif kind == NodeKind.LINK: 2941 ↛ 2947line 2941 didn't jump to line 2947 because the condition on line 2941 was always true

2942 # Recurse into the last argument 

2943 # Apparently ":/" is used as a link to "/", so strip 

2944 # initial value 

2945 parse_linkage_recurse(node.largs[-1], field, sense) 

2946 else: 

2947 wxr.wtp.debug( 

2948 "parse_linkage_recurse unhandled {}: {}".format( 

2949 kind, node 

2950 ), 

2951 sortid="page/2196", 

2952 ) 

2953 

2954 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

2955 nonlocal have_panel_template 

2956 if is_panel_template(wxr, name): 

2957 have_panel_template = True 

2958 return "" 

2959 return None 

2960 

2961 # Main body of parse_linkage() 

2962 l_nodes = [] 

2963 l_sense = "" 

2964 for node in linkagenode.children: 

2965 if ( 

2966 isinstance(node, TemplateNode) 

2967 and node.template_name == "zh-dial" 

2968 ): 

2969 extract_zh_dial_template(wxr, data, node, l_sense) 

2970 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

2971 for list_item in node.find_child(NodeKind.LIST_ITEM): 

2972 for t_node in list_item.find_child(NodeKind.TEMPLATE): 

2973 if t_node.template_name in ["s", "sense"]: 

2974 l_sense = clean_node(wxr, None, t_node).strip( 

2975 "(): " 

2976 ) 

2977 l_nodes.append(node) 

2978 else: 

2979 l_nodes.append(node) 

2980 text = wxr.wtp.node_to_wikitext(l_nodes) 

2981 parsed = wxr.wtp.parse( 

2982 text, expand_all=True, template_fn=linkage_template_fn1 

2983 ) 

2984 parse_linkage_recurse(parsed.children, field, None) 

2985 if not data.get(field) and not have_panel_template: 

2986 text = "".join(toplevel_text).strip() 

2987 if "\n" not in text and "," in text and text.count(",") > 3: 

2988 if not text.startswith("See "): 2988 ↛ exitline 2988 didn't return from function 'parse_linkage' because the condition on line 2988 was always true

2989 parse_linkage_item([text], field, None) 

2990 

2991 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

2992 """Parses translations for a word. This may also pull in translations 

2993 from separate translation subpages.""" 

2994 assert isinstance(data, dict) 

2995 assert isinstance(xlatnode, WikiNode) 

2996 # print("===== PARSE_TRANSLATIONS {} {} {}" 

2997 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

2998 # print("parse_translations xlatnode={}".format(xlatnode)) 

2999 if not wxr.config.capture_translations: 2999 ↛ 3000line 2999 didn't jump to line 3000 because the condition on line 2999 was never true

3000 return 

3001 sense_parts: list[Union[WikiNode, str]] = [] 

3002 sense: Optional[str] = None 

3003 

3004 def parse_translation_item( 

3005 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

3006 ) -> None: 

3007 nonlocal sense 

3008 assert isinstance(contents, list) 

3009 assert lang is None or isinstance(lang, str) 

3010 # print("PARSE_TRANSLATION_ITEM:", contents) 

3011 

3012 langcode: Optional[str] = None 

3013 if sense is None: 

3014 sense = clean_node(wxr, data, sense_parts).strip() 

3015 # print("sense <- clean_node: ", sense) 

3016 idx = sense.find("See also translations at") 

3017 if idx > 0: 3017 ↛ 3018line 3017 didn't jump to line 3018 because the condition on line 3017 was never true

3018 wxr.wtp.debug( 

3019 "Skipping translation see also: {}".format(sense), 

3020 sortid="page/2361", 

3021 ) 

3022 sense = sense[:idx].strip() 

3023 if sense.endswith(":"): 3023 ↛ 3024line 3023 didn't jump to line 3024 because the condition on line 3023 was never true

3024 sense = sense[:-1].strip() 

3025 if sense.endswith("—"): 3025 ↛ 3026line 3025 didn't jump to line 3026 because the condition on line 3025 was never true

3026 sense = sense[:-1].strip() 

3027 translations_from_template: list[str] = [] 

3028 

3029 def translation_item_template_fn( 

3030 name: str, ht: TemplateArgs 

3031 ) -> Optional[str]: 

3032 nonlocal langcode 

3033 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

3034 if is_panel_template(wxr, name): 

3035 return "" 

3036 if name in ("t+check", "t-check", "t-needed"): 

3037 # We ignore these templates. They seem to have outright 

3038 # garbage in some entries, and very varying formatting in 

3039 # others. These should be transitory and unreliable 

3040 # anyway. 

3041 return "__IGNORE__" 

3042 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

3043 code = ht.get(1) 

3044 if code: 3044 ↛ 3054line 3044 didn't jump to line 3054 because the condition on line 3044 was always true

3045 if langcode and code != langcode: 

3046 wxr.wtp.debug( 

3047 "inconsistent language codes {} vs " 

3048 "{} in translation item: {!r} {}".format( 

3049 langcode, code, name, ht 

3050 ), 

3051 sortid="page/2386", 

3052 ) 

3053 langcode = code 

3054 tr = ht.get(2) 

3055 if tr: 

3056 tr = clean_node(wxr, None, [tr]) 

3057 translations_from_template.append(tr) 

3058 return None 

3059 if name == "t-egy": 

3060 langcode = "egy" 

3061 return None 

3062 if name == "ttbc": 

3063 code = ht.get(1) 

3064 if code: 3064 ↛ 3066line 3064 didn't jump to line 3066 because the condition on line 3064 was always true

3065 langcode = code 

3066 return None 

3067 if name == "trans-see": 3067 ↛ 3068line 3067 didn't jump to line 3068 because the condition on line 3067 was never true

3068 wxr.wtp.error( 

3069 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

3070 ) 

3071 return "" 

3072 if name.endswith("-top"): 3072 ↛ 3073line 3072 didn't jump to line 3073 because the condition on line 3072 was never true

3073 return "" 

3074 if name.endswith("-bottom"): 3074 ↛ 3075line 3074 didn't jump to line 3075 because the condition on line 3074 was never true

3075 return "" 

3076 if name.endswith("-mid"): 3076 ↛ 3077line 3076 didn't jump to line 3077 because the condition on line 3076 was never true

3077 return "" 

3078 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

3079 # .format(name), 

3080 # sortid="page/2414") 

3081 return None 

3082 

3083 sublists = list( 

3084 x 

3085 for x in contents 

3086 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

3087 ) 

3088 contents = list( 

3089 x 

3090 for x in contents 

3091 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

3092 ) 

3093 

3094 item = clean_node( 

3095 wxr, data, contents, template_fn=translation_item_template_fn 

3096 ) 

3097 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

3098 

3099 # Parse the translation item. 

3100 if item: 3100 ↛ exitline 3100 didn't return from function 'parse_translation_item' because the condition on line 3100 was always true

3101 lang = parse_translation_item_text( 

3102 wxr, 

3103 word, 

3104 data, 

3105 item, 

3106 sense, 

3107 lang, 

3108 langcode, 

3109 translations_from_template, 

3110 is_reconstruction, 

3111 ) 

3112 

3113 # Handle sublists. They are frequently used for different 

3114 # scripts for the language and different variants of the 

3115 # language. We will include the lower-level header as a 

3116 # tag in those cases. 

3117 for listnode in sublists: 

3118 assert listnode.kind == NodeKind.LIST 

3119 for node in listnode.children: 

3120 if not isinstance(node, WikiNode): 3120 ↛ 3121line 3120 didn't jump to line 3121 because the condition on line 3120 was never true

3121 continue 

3122 if node.kind == NodeKind.LIST_ITEM: 3122 ↛ 3119line 3122 didn't jump to line 3119 because the condition on line 3122 was always true

3123 parse_translation_item(node.children, lang=lang) 

3124 

3125 def parse_translation_template(node: WikiNode) -> None: 

3126 assert isinstance(node, WikiNode) 

3127 

3128 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3129 nonlocal sense_parts 

3130 nonlocal sense 

3131 if is_panel_template(wxr, name): 

3132 return "" 

3133 if name == "see also": 

3134 # XXX capture 

3135 # XXX for example, "/" has top-level list containing 

3136 # see also items. So also should parse those. 

3137 return "" 

3138 if name == "trans-see": 

3139 # XXX capture 

3140 return "" 

3141 if name == "see translation subpage": 3141 ↛ 3142line 3141 didn't jump to line 3142 because the condition on line 3141 was never true

3142 sense_parts = [] 

3143 sense = None 

3144 sub = ht.get(1, "") 

3145 if sub: 

3146 m = re.match( 

3147 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

3148 ) 

3149 else: 

3150 m = None 

3151 etym = "" 

3152 etym_numbered = "" 

3153 pos = "" 

3154 if m: 

3155 etym_numbered = m.group(1) 

3156 etym = m.group(2) 

3157 pos = m.group(3) 

3158 if not sub: 

3159 wxr.wtp.debug( 

3160 "no part-of-speech in " 

3161 "{{see translation subpage|...}}, " 

3162 "defaulting to just wxr.wtp.section " 

3163 "(= language)", 

3164 sortid="page/2468", 

3165 ) 

3166 # seq sent to get_subpage_section without sub and pos 

3167 seq = [ 

3168 language, 

3169 TRANSLATIONS_TITLE, 

3170 ] 

3171 elif ( 

3172 m 

3173 and etym.lower().strip() in ETYMOLOGY_TITLES 

3174 and pos.lower() in POS_TITLES 

3175 ): 

3176 seq = [ 

3177 language, 

3178 etym_numbered, 

3179 pos, 

3180 TRANSLATIONS_TITLE, 

3181 ] 

3182 elif sub.lower() in POS_TITLES: 

3183 # seq with sub but not pos 

3184 seq = [ 

3185 language, 

3186 sub, 

3187 TRANSLATIONS_TITLE, 

3188 ] 

3189 else: 

3190 # seq with sub and pos 

3191 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3192 if pos.lower() not in POS_TITLES: 

3193 wxr.wtp.debug( 

3194 "unhandled see translation subpage: " 

3195 "language={} sub={} " 

3196 "wxr.wtp.subsection={}".format( 

3197 language, sub, wxr.wtp.subsection 

3198 ), 

3199 sortid="page/2478", 

3200 ) 

3201 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

3202 subnode = get_subpage_section( 

3203 wxr.wtp.title or "MISSING_TITLE", 

3204 TRANSLATIONS_TITLE, 

3205 [seq], 

3206 ) 

3207 if subnode is None or not isinstance(subnode, WikiNode): 

3208 # Failed to find the normal subpage section 

3209 # seq with sub and pos 

3210 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3211 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}") 

3212 seqs: list[list[str] | tuple[str, ...]] = [ 

3213 [TRANSLATIONS_TITLE], 

3214 [language, pos], 

3215 ] 

3216 subnode = get_subpage_section( 

3217 wxr.wtp.title or "MISSING_TITLE", 

3218 TRANSLATIONS_TITLE, 

3219 seqs, 

3220 ) 

3221 if subnode is not None and isinstance(subnode, WikiNode): 

3222 parse_translations(data, subnode) 

3223 return "" 

3224 if name in ( 

3225 "c", 

3226 "C", 

3227 "categorize", 

3228 "cat", 

3229 "catlangname", 

3230 "topics", 

3231 "top", 

3232 "qualifier", 

3233 "cln", 

3234 ): 

3235 # These are expanded in the default way 

3236 return None 

3237 if name in ( 

3238 "trans-top", 

3239 "trans-top-see", 

3240 ): 

3241 # XXX capture id from trans-top? Capture sense here 

3242 # instead of trying to parse it from expanded content? 

3243 if ht.get(1): 

3244 sense_parts = [] 

3245 sense = ht.get(1) 

3246 else: 

3247 sense_parts = [] 

3248 sense = None 

3249 return None 

3250 if name in ( 

3251 "trans-bottom", 

3252 "trans-mid", 

3253 "checktrans-mid", 

3254 "checktrans-bottom", 

3255 ): 

3256 return None 

3257 if name == "checktrans-top": 

3258 sense_parts = [] 

3259 sense = None 

3260 return "" 

3261 if name == "trans-top-also": 

3262 # XXX capture? 

3263 sense_parts = [] 

3264 sense = None 

3265 return "" 

3266 wxr.wtp.error( 

3267 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

3268 name, ht 

3269 ), 

3270 sortid="page/2517", 

3271 ) 

3272 return "" 

3273 

3274 wxr.wtp.expand( 

3275 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

3276 ) 

3277 

3278 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

3279 nonlocal sense 

3280 nonlocal sense_parts 

3281 for node in xlatnode.children: 

3282 # print(node) 

3283 if isinstance(node, str): 

3284 if sense: 

3285 if not node.isspace(): 

3286 wxr.wtp.debug( 

3287 "skipping string in the middle of " 

3288 "translations: {}".format(node), 

3289 sortid="page/2530", 

3290 ) 

3291 continue 

3292 # Add a part to the sense 

3293 sense_parts.append(node) 

3294 sense = None 

3295 continue 

3296 assert isinstance(node, WikiNode) 

3297 kind = node.kind 

3298 if kind == NodeKind.LIST: 

3299 for item in node.children: 

3300 if not isinstance(item, WikiNode): 3300 ↛ 3301line 3300 didn't jump to line 3301 because the condition on line 3300 was never true

3301 continue 

3302 if item.kind != NodeKind.LIST_ITEM: 3302 ↛ 3303line 3302 didn't jump to line 3303 because the condition on line 3302 was never true

3303 continue 

3304 if item.sarg == ":": 3304 ↛ 3305line 3304 didn't jump to line 3305 because the condition on line 3304 was never true

3305 continue 

3306 parse_translation_item(item.children) 

3307 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3307 ↛ 3311line 3307 didn't jump to line 3311 because the condition on line 3307 was never true

3308 # Silently skip list items that are just indented; these 

3309 # are used for text between translations, such as indicating 

3310 # translations that need to be checked. 

3311 pass 

3312 elif kind == NodeKind.TEMPLATE: 

3313 parse_translation_template(node) 

3314 elif kind in ( 3314 ↛ 3319line 3314 didn't jump to line 3319 because the condition on line 3314 was never true

3315 NodeKind.TABLE, 

3316 NodeKind.TABLE_ROW, 

3317 NodeKind.TABLE_CELL, 

3318 ): 

3319 parse_translation_recurse(node) 

3320 elif kind == NodeKind.HTML: 

3321 if node.attrs.get("class") == "NavFrame": 3321 ↛ 3327line 3321 didn't jump to line 3327 because the condition on line 3321 was never true

3322 # Reset ``sense_parts`` (and force recomputing 

3323 # by clearing ``sense``) as each NavFrame specifies 

3324 # its own sense. This helps eliminate garbage coming 

3325 # from text at the beginning of the translations 

3326 # section. 

3327 sense_parts = [] 

3328 sense = None 

3329 # for item in node.children: 

3330 # if not isinstance(item, WikiNode): 

3331 # continue 

3332 # parse_translation_recurse(item) 

3333 parse_translation_recurse(node) 

3334 elif kind in LEVEL_KINDS: 3334 ↛ 3336line 3334 didn't jump to line 3336 because the condition on line 3334 was never true

3335 # Sub-levels will be recursed elsewhere 

3336 pass 

3337 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 

3338 parse_translation_recurse(node) 

3339 elif kind == NodeKind.PREFORMATTED: 3339 ↛ 3340line 3339 didn't jump to line 3340 because the condition on line 3339 was never true

3340 print("parse_translation_recurse: PREFORMATTED:", node) 

3341 elif kind == NodeKind.LINK: 3341 ↛ 3395line 3341 didn't jump to line 3395 because the condition on line 3341 was always true

3342 arg0 = node.largs[0] 

3343 # Kludge: I've seen occasional normal links to translation 

3344 # subpages from main pages (e.g., language/English/Noun 

3345 # in July 2021) instead of the normal 

3346 # {{see translation subpage|...}} template. This should 

3347 # handle them. Note: must be careful not to read other 

3348 # links, particularly things like in "human being": 

3349 # "a human being -- see [[man/translations]]" (group title) 

3350 if ( 3350 ↛ 3358line 3350 didn't jump to line 3358 because the condition on line 3350 was never true

3351 isinstance(arg0, (list, tuple)) 

3352 and arg0 

3353 and isinstance(arg0[0], str) 

3354 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3355 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3356 == wxr.wtp.title 

3357 ): 

3358 wxr.wtp.debug( 

3359 "translations subpage link found on main " 

3360 "page instead " 

3361 "of normal {{see translation subpage|...}}", 

3362 sortid="page/2595", 

3363 ) 

3364 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3365 if sub.lower() in POS_TITLES: 

3366 seq = [ 

3367 language, 

3368 sub, 

3369 TRANSLATIONS_TITLE, 

3370 ] 

3371 subnode = get_subpage_section( 

3372 wxr.wtp.title, 

3373 TRANSLATIONS_TITLE, 

3374 [seq], 

3375 ) 

3376 if subnode is not None and isinstance( 

3377 subnode, WikiNode 

3378 ): 

3379 parse_translations(data, subnode) 

3380 else: 

3381 wxr.wtp.error( 

3382 "/translations link outside part-of-speech" 

3383 ) 

3384 

3385 if ( 

3386 len(arg0) >= 1 

3387 and isinstance(arg0[0], str) 

3388 and not arg0[0].lower().startswith("category:") 

3389 ): 

3390 for x in node.largs[-1]: 

3391 if isinstance(x, str): 3391 ↛ 3394line 3391 didn't jump to line 3394 because the condition on line 3391 was always true

3392 sense_parts.append(x) 

3393 else: 

3394 parse_translation_recurse(x) 

3395 elif not sense: 

3396 sense_parts.append(node) 

3397 else: 

3398 wxr.wtp.debug( 

3399 "skipping text between translation items/senses: " 

3400 "{}".format(node), 

3401 sortid="page/2621", 

3402 ) 

3403 

3404 # Main code of parse_translations(). We want ``sense`` to be assigned 

3405 # regardless of recursion levels, and thus the code is structured 

3406 # to define it at this level and recurse in parse_translation_recurse(). 

3407 parse_translation_recurse(xlatnode) 

3408 

3409 def parse_etymology(data: WordData, node: WikiNode) -> None: 

3410 """Parses an etymology section.""" 

3411 assert isinstance(data, dict) 

3412 assert isinstance(node, WikiNode) 

3413 

3414 templates: list[TemplateData] = [] 

3415 

3416 # Counter for preventing the capture of etymology templates 

3417 # when we are inside templates that we want to ignore (i.e., 

3418 # not capture). 

3419 ignore_count = 0 

3420 

3421 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3422 nonlocal ignore_count 

3423 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3424 return "" 

3425 if re.match(ignored_etymology_templates_re, name): 

3426 ignore_count += 1 

3427 return None 

3428 

3429 # CONTINUE_HERE 

3430 

3431 def etym_post_template_fn( 

3432 name: str, ht: TemplateArgs, expansion: str 

3433 ) -> None: 

3434 nonlocal ignore_count 

3435 if name in wikipedia_templates: 

3436 parse_wikipedia_template(wxr, data, ht) 

3437 return None 

3438 if re.match(ignored_etymology_templates_re, name): 

3439 ignore_count -= 1 

3440 return None 

3441 if ignore_count == 0: 3441 ↛ 3447line 3441 didn't jump to line 3447 because the condition on line 3441 was always true

3442 ht = clean_template_args(wxr, ht) 

3443 expansion = clean_node(wxr, None, expansion) 

3444 templates.append( 

3445 {"name": name, "args": ht, "expansion": expansion} 

3446 ) 

3447 return None 

3448 

3449 # Remove any subsections 

3450 contents = list( 

3451 x 

3452 for x in node.children 

3453 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3454 ) 

3455 # Convert to text, also capturing templates using post_template_fn 

3456 text = clean_node( 

3457 wxr, 

3458 None, 

3459 contents, 

3460 template_fn=etym_template_fn, 

3461 post_template_fn=etym_post_template_fn, 

3462 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3463 # Save the collected information. 

3464 if len(text) > 0: 

3465 data["etymology_text"] = text 

3466 if len(templates) > 0: 

3467 # Some etymology templates, like Template:root do not generate 

3468 # text, so they should be added here. Elsewhere, we check 

3469 # for Template:root and add some text to the expansion to please 

3470 # the validation. 

3471 data["etymology_templates"] = templates 

3472 

3473 for child_node in node.find_child_recursively( 3473 ↛ exitline 3473 didn't return from function 'parse_etymology' because the loop on line 3473 didn't complete

3474 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3475 ): 

3476 if child_node.kind in LEVEL_KIND_FLAGS: 

3477 break 

3478 elif isinstance( 3478 ↛ 3481line 3478 didn't jump to line 3481 because the condition on line 3478 was never true

3479 child_node, TemplateNode 

3480 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3481 if "etymology_examples" not in data: 

3482 data["etymology_examples"] = [] 

3483 data["etymology_examples"].extend( 

3484 extract_template_zh_x( 

3485 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3486 ) 

3487 ) 
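# A minimal sketch of the data shape collected above (hypothetical values,
# not taken from any real entry):
_EXAMPLE_ETYMOLOGY_DATA = {
    "etymology_text": "Borrowed from French exemple.",
    "etymology_templates": [
        {
            "name": "bor",
            "args": {"1": "en", "2": "fr", "3": "exemple"},
            "expansion": "Borrowed from French exemple",
        },
    ],
}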

3488 

3489 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3490 """This recurses into a subtree in the parse tree for a page.""" 

3491 nonlocal etym_data 

3492 nonlocal pos_data 

3493 nonlocal inside_level_four 

3494 

3495 redirect_list: list[str] = [] # for `zh-see` template 

3496 

3497 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3498 """This is called for otherwise unprocessed parts of the page. 

3499 We still expand them so that e.g. Category links get captured.""" 

3500 if name in wikipedia_templates: 

3501 data = select_data() 

3502 parse_wikipedia_template(wxr, data, ht) 

3503 return None 

3504 if is_panel_template(wxr, name): 

3505 return "" 

3506 return None 

3507 

3508 for node in treenode.children: 

3509 if not isinstance(node, WikiNode): 

3510 # print(" X{}".format(repr(node)[:40])) 

3511 continue 

3512 if isinstance(node, TemplateNode): 

3513 if process_soft_redirect_template(wxr, node, redirect_list): 

3514 continue 

3515 elif node.template_name == "zh-forms": 

3516 extract_zh_forms_template(wxr, node, select_data()) 

3517 

3518 if node.kind not in LEVEL_KINDS: 

3519 # XXX handle e.g. wikipedia links at the top of a language 

3520 # XXX should at least capture "also" at top of page 

3521 if node.kind in ( 

3522 NodeKind.HLINE, 

3523 NodeKind.LIST, 

3524 NodeKind.LIST_ITEM, 

3525 ): 

3526 continue 

3527 # print(" UNEXPECTED: {}".format(node)) 

3528 # Clean the node to collect category links 

3529 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3530 continue 

3531 t = clean_node( 

3532 wxr, etym_data, node.sarg if node.sarg else node.largs 

3533 ) 

3534 t = t.lower() 

3535 # XXX these counts were never implemented fully, and even this 

3536 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3537 wxr.config.section_counts[t] += 1 

3538 # print("PROCESS_CHILDREN: T:", repr(t)) 

3539 if t in IGNORED_TITLES: 

3540 pass 

3541 elif t.startswith(PRONUNCIATION_TITLE): 

3542 # Chinese Pronunciation section kludge; we demote these to 

3543 # be level 4 instead of 3 so that they're part of a larger 

3544 # etymology hierarchy; usually the data here is empty and 

3545 # acts as an intermediate level between POS and Etymology data 

3546 inside_level_four = True 

3547 if t.startswith(PRONUNCIATION_TITLE + " "): 

3548 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3549 # and each of them may have senses under Definition 

3550 push_level_four_section(True) 

3551 wxr.wtp.start_subsection(None) 

3552 if wxr.config.capture_pronunciation: 3552 ↛ 3644line 3552 didn't jump to line 3644 because the condition on line 3552 was always true

3553 data = select_data() 

3554 parse_pronunciation( 

3555 wxr, 

3556 node, 

3557 data, 

3558 etym_data, 

3559 have_etym, 

3560 base_data, 

3561 lang_code, 

3562 ) 

3563 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3564 push_etym() 

3565 wxr.wtp.start_subsection(None) 

3566 if wxr.config.capture_etymologies: 3566 ↛ 3644line 3566 didn't jump to line 3644 because the condition on line 3566 was always true

3567 m = re.search(r"\s(\d+)$", t) 

3568 if m: 

3569 etym_data["etymology_number"] = int(m.group(1)) 

3570 parse_etymology(etym_data, node) 

3571 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3572 data = select_data() 

3573 extract_descendant_section(wxr, data, node, False) 

3574 elif ( 

3575 t in PROTO_ROOT_DERIVED_TITLES 

3576 and pos == "root" 

3577 and is_reconstruction 

3578 and wxr.config.capture_descendants 

3579 ): 

3580 data = select_data() 

3581 extract_descendant_section(wxr, data, node, True) 

3582 elif t == TRANSLATIONS_TITLE: 

3583 data = select_data() 

3584 parse_translations(data, node) 

3585 elif t in INFLECTION_TITLES: 

3586 parse_inflection(node, t, pos) 

3587 elif t == "alternative forms": 

3588 extract_alt_form_section(wxr, select_data(), node) 

3589 else: 

3590 lst = t.split() 

3591 while len(lst) > 1 and lst[-1].isdigit(): 3591 ↛ 3592line 3591 didn't jump to line 3592 because the condition on line 3591 was never true

3592 lst = lst[:-1] 

3593 t_no_number = " ".join(lst).lower() 

3594 if t_no_number in POS_TITLES: 

3595 push_pos() 

3596 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3597 pos = dt["pos"] or "MISSING_POS" 

3598 wxr.wtp.start_subsection(t) 

3599 if "debug" in dt: 

3600 wxr.wtp.debug( 

3601 "{} in section {}".format(dt["debug"], t), 

3602 sortid="page/2755", 

3603 ) 

3604 if "warning" in dt: 3604 ↛ 3605line 3604 didn't jump to line 3605 because the condition on line 3604 was never true

3605 wxr.wtp.wiki_notice( 

3606 "{} in section {}".format(dt["warning"], t), 

3607 sortid="page/2759", 

3608 ) 

3609 if "error" in dt: 3609 ↛ 3610line 3609 didn't jump to line 3610 because the condition on line 3609 was never true

3610 wxr.wtp.error( 

3611 "{} in section {}".format(dt["error"], t), 

3612 sortid="page/2763", 

3613 ) 

3614 if "note" in dt: 3614 ↛ 3615line 3614 didn't jump to line 3615 because the condition on line 3614 was never true

3615 wxr.wtp.note( 

3616 "{} in section {}".format(dt["note"], t), 

3617 sortid="page/20251017a", 

3618 ) 

3619 if "wiki_notice" in dt: 3619 ↛ 3620line 3619 didn't jump to line 3620 because the condition on line 3619 was never true

3620 wxr.wtp.wiki_notice( 

3621 "{} in section {}".format(dt["wiki_notice"], t), 

3622 sortid="page/20251017b", 

3623 ) 

3624 # Parse word senses for the part-of-speech 

3625 parse_part_of_speech(node, pos) 

3626 if "tags" in dt: 

3627 for pdata in sense_datas: 

3628 data_extend(pdata, "tags", dt["tags"]) 

3629 elif t_no_number in LINKAGE_TITLES: 

3630 # print(f"LINKAGE_TITLES NODE {node=}") 

3631 rel = LINKAGE_TITLES[t_no_number] 

3632 data = select_data() 

3633 parse_linkage(data, rel, node) 

3634 elif t_no_number == COMPOUNDS_TITLE: 

3635 data = select_data() 

3636 if wxr.config.capture_compounds: 3636 ↛ 3644line 3636 didn't jump to line 3644 because the condition on line 3636 was always true

3637 parse_linkage(data, "derived", node) 

3638 

3639 # XXX parse interesting templates also from other sections. E.g., 

3640 # {{Letter|...}} in ===See also=== 

3641 # Also <gallery> 

3642 

3643 # Recurse to children of this node, processing subtitles therein 

3644 stack.append(t) 

3645 process_children(node, pos) 

3646 stack.pop() 

3647 

3648 if len(redirect_list) > 0: 

3649 if len(pos_data) > 0: 

3650 pos_data["redirects"] = redirect_list 

3651 if "pos" not in pos_data: 3651 ↛ 3652line 3651 didn't jump to line 3652 because the condition on line 3651 was never true

3652 pos_data["pos"] = "soft-redirect" 

3653 else: 

3654 new_page_data = copy.deepcopy(base_data) 

3655 new_page_data["redirects"] = redirect_list 

3656 if "pos" not in new_page_data: 3656 ↛ 3658line 3656 didn't jump to line 3658 because the condition on line 3656 was always true

3657 new_page_data["pos"] = "soft-redirect" 

3658 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3659 page_datas.append(new_page_data) 

3660 

3661 def extract_examples( 

3662 others: list[WikiNode], sense_base: SenseData 

3663 ) -> list[ExampleData]: 

3664 """Parses through a list of definitions and quotes to find examples. 

3665 Returns a list of example dicts to be added to sense data. Adds 

3666 meta-data, mostly categories, into sense_base.""" 

3667 assert isinstance(others, list) 

3668 examples: list[ExampleData] = [] 

3669 

3670 for sub in others: 

3671 if not sub.sarg.endswith((":", "*")): 3671 ↛ 3672line 3671 didn't jump to line 3672 because the condition on line 3671 was never true

3672 continue 

3673 for item in sub.children: 

3674 if not isinstance(item, WikiNode): 3674 ↛ 3675line 3674 didn't jump to line 3675 because the condition on line 3674 was never true

3675 continue 

3676 if item.kind != NodeKind.LIST_ITEM: 3676 ↛ 3677line 3676 didn't jump to line 3677 because the condition on line 3676 was never true

3677 continue 

3678 usex_type = None 

3679 example_template_args = [] 

3680 example_template_names = [] 

3681 taxons = set() 

3682 

3683 # Bypass this function when parsing Chinese, Japanese and 

3684 # quotation templates. 

3685 new_example_lists = extract_example_list_item( 

3686 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3687 ) 

3688 if len(new_example_lists) > 0: 

3689 examples.extend(new_example_lists) 

3690 continue 

3691 

3692 def usex_template_fn( 

3693 name: str, ht: TemplateArgs 

3694 ) -> Optional[str]: 

3695 nonlocal usex_type 

3696 if is_panel_template(wxr, name): 

3697 return "" 

3698 if name in usex_templates: 

3699 usex_type = "example" 

3700 example_template_args.append(ht) 

3701 example_template_names.append(name) 

3702 elif name in quotation_templates: 

3703 usex_type = "quotation" 

3704 elif name in taxonomy_templates: 3704 ↛ 3705line 3704 didn't jump to line 3705 because the condition on line 3704 was never true

3705 taxons.update(ht.get(1, "").split()) 

3706 for prefix in template_linkages_to_ignore_in_examples: 

3707 if re.search( 

3708 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3709 ): 

3710 return "" 

3711 return None 

3712 

3713 # bookmark 

3714 ruby: list[tuple[str, str]] = [] 

3715 contents = item.children 

3716 if lang_code == "ja": 

3717 # Capture ruby contents if this is a Japanese language 

3718 # example. 

3719 # print(contents) 

3720 if ( 3720 ↛ 3725line 3720 didn't jump to line 3725 because the condition on line 3720 was never true

3721 contents 

3722 and isinstance(contents, str) 

3723 and re.match(r"\s*$", contents[0]) 

3724 ): 

3725 contents = contents[1:] 

3726 exp = wxr.wtp.parse( 

3727 wxr.wtp.node_to_wikitext(contents), 

3728 # post_template_fn=head_post_template_fn, 

3729 expand_all=True, 

3730 ) 

3731 rub, rest = extract_ruby(wxr, exp.children) 

3732 if rub: 

3733 for rtup in rub: 

3734 ruby.append(rtup) 

3735 contents = rest 

3736 subtext = clean_node( 

3737 wxr, sense_base, contents, template_fn=usex_template_fn 

3738 ) 

3739 

3740 frozen_taxons = frozenset(taxons) 

3741 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3742 

3743 # print(f"{subtext=}") 

3744 subtext = re.sub( 

3745 r"\s*\(please add an English " 

3746 r"translation of this " 

3747 r"(example|usage example|quote)\)", 

3748 "", 

3749 subtext, 

3750 ).strip() 

3751 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3752 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3753 # print("subtext:", repr(subtext)) 

3754 

3755 lines = subtext.splitlines() 

3756 # print(lines) 

3757 

3758 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3759 lines = list( 

3760 x 

3761 for x in lines 

3762 if not re.match( 

3763 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3764 r"Synonym: |Antonym: |Hyponym: |" 

3765 r"Hypernyms: |Derived terms: |" 

3766 r"Related terms: |" 

3767 r"Hypernym: |Derived term: |" 

3768 r"Coordinate terms:|" 

3769 r"Related term: |" 

3770 r"For more quotations using )", 

3771 x, 

3772 ) 

3773 ) 

3774 tr = "" 

3775 ref = "" 

3776 roman = "" 

3777 # for line in lines: 

3778 # print("LINE:", repr(line)) 

3779 # print(classify_desc(line)) 

3780 if len(lines) == 1 and lang_code != "en": 

3781 parts = example_splitter_re.split(lines[0]) 

3782 if ( 3782 ↛ 3790line 3782 didn't jump to line 3790 because the condition on line 3782 was never true

3783 len(parts) > 2 

3784 and len(example_template_args) == 1 

3785 and any( 

3786 ("―" in s) or ("—" in s) 

3787 for s in example_template_args[0].values() 

3788 ) 

3789 ): 

3790 if nparts := synch_splits_with_args( 

3791 lines[0], example_template_args[0] 

3792 ): 

3793 parts = nparts 

3794 if ( 3794 ↛ 3799line 3794 didn't jump to line 3799 because the condition on line 3794 was never true

3795 len(example_template_args) == 1 

3796 and "lit" in example_template_args[0] 

3797 ): 

3798 # ugly brute-force kludge in case there's a lit= arg 

3799 literally = example_template_args[0].get("lit", "") 

3800 if literally: 

3801 literally = ( 

3802 " (literally, “" 

3803 + clean_value(wxr, literally) 

3804 + "”)" 

3805 ) 

3806 else: 

3807 literally = "" 

3808 if ( 3808 ↛ 3847line 3808 didn't jump to line 3847 because the condition on line 3808 was never true

3809 len(example_template_args) == 1 

3810 and len(parts) == 2 

3811 and len(example_template_args[0]) 

3812 - ( 

3813 # horrible kludge to ignore these arguments 

3814 # when calculating how many there are 

3815 sum( 

3816 s in example_template_args[0] 

3817 for s in ( 

3818 "lit", # generates text, but we handle it 

3819 "inline", 

3820 "noenum", 

3821 "nocat", 

3822 "sort", 

3823 ) 

3824 ) 

3825 ) 

3826 == 3 

3827 and clean_value( 

3828 wxr, example_template_args[0].get(2, "") 

3829 ) 

3830 == parts[0].strip() 

3831 and clean_value( 

3832 wxr, 

3833 ( 

3834 example_template_args[0].get(3) 

3835 or example_template_args[0].get("translation") 

3836 or example_template_args[0].get("t", "") 

3837 ) 

3838 + literally, # in case there's a lit= argument 

3839 ) 

3840 == parts[1].strip() 

3841 ): 

3842 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3843 # is a pretty reliable 'heuristic', so we use it here 

3844 # before the others. To be extra sure the template 

3845 # doesn't do anything weird, we compare the arguments 

3846 # and the output to each other. 

3847 lines = [parts[0].strip()] 

3848 tr = parts[1].strip() 

3849 elif ( 

3850 len(parts) == 2 

3851 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3852 ): 

3853 # These other branches just do some simple heuristics w/ 

3854 # the expanded output of the template (if applicable). 

3855 lines = [parts[0].strip()] 

3856 tr = parts[1].strip() 

3857 elif ( 3857 ↛ 3863line 3857 didn't jump to line 3863 because the condition on line 3857 was never true

3858 len(parts) == 3 

3859 and classify_desc2(parts[1]) 

3860 in ("romanization", "english") 

3861 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3862 ): 

3863 lines = [parts[0].strip()] 

3864 roman = parts[1].strip() 

3865 tr = parts[2].strip() 

3866 else: 

3867 parts = re.split(r"\s+-\s+", lines[0]) 

3868 if ( 3868 ↛ 3872line 3868 didn't jump to line 3872 because the condition on line 3868 was never true

3869 len(parts) == 2 

3870 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3871 ): 

3872 lines = [parts[0].strip()] 

3873 tr = parts[1].strip() 

3874 elif len(lines) > 1: 

3875 if any( 

3876 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3877 ) and not (len(example_template_names) == 1): 

3878 refs: list[str] = [] 

3879 for i in range(len(lines)): 3879 ↛ 3885line 3879 didn't jump to line 3885 because the loop on line 3879 didn't complete

3880 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3880 ↛ 3881line 3880 didn't jump to line 3881 because the condition on line 3880 was never true

3881 break 

3882 refs.append(lines[i].strip()) 

3883 if re.search(r"[]\d:)]\s*$", lines[i]): 

3884 break 

3885 ref = " ".join(refs) 

3886 lines = lines[i + 1 :] 

3887 if ( 

3888 lang_code != "en" 

3889 and len(lines) >= 2 

3890 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3891 ): 

3892 i = len(lines) - 1 

3893 while ( 3893 ↛ 3898line 3893 didn't jump to line 3898 because the condition on line 3893 was never true

3894 i > 1 

3895 and classify_desc2(lines[i - 1]) 

3896 in ENGLISH_TEXTS 

3897 ): 

3898 i -= 1 

3899 tr = "\n".join(lines[i:]) 

3900 lines = lines[:i] 

3901 if len(lines) >= 2: 

3902 if classify_desc2(lines[-1]) == "romanization": 

3903 roman = lines[-1].strip() 

3904 lines = lines[:-1] 

3905 

3906 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

3907 ref = lines[0] 

3908 lines = lines[1:] 

3909 elif lang_code != "en" and len(lines) == 2: 

3910 cls1 = classify_desc2(lines[0]) 

3911 cls2 = classify_desc2(lines[1]) 

3912 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3913 tr = lines[1] 

3914 lines = [lines[0]] 

3915 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3915 ↛ 3916line 3915 didn't jump to line 3916 because the condition on line 3915 was never true

3916 tr = lines[0] 

3917 lines = [lines[1]] 

3918 elif ( 3918 ↛ 3925line 3918 didn't jump to line 3925 because the condition on line 3918 was never true

3919 re.match(r"^[#*]*:+", lines[1]) 

3920 and classify_desc2( 

3921 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3922 ) 

3923 in ENGLISH_TEXTS 

3924 ): 

3925 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3926 lines = [lines[0]] 

3927 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3928 # Both were classified as English, but 

3929 # presumably one is not. Assume first is 

3930 # non-English, as that seems more common. 

3931 tr = lines[1] 

3932 lines = [lines[0]] 

3933 elif ( 

3934 usex_type != "quotation" 

3935 and lang_code != "en" 

3936 and len(lines) == 3 

3937 ): 

3938 cls1 = classify_desc2(lines[0]) 

3939 cls2 = classify_desc2(lines[1]) 

3940 cls3 = classify_desc2(lines[2]) 

3941 if ( 

3942 cls3 == "english" 

3943 and cls2 in ("english", "romanization") 

3944 and cls1 != "english" 

3945 ): 

3946 tr = lines[2].strip() 

3947 roman = lines[1].strip() 

3948 lines = [lines[0].strip()] 

3949 elif ( 3949 ↛ 3957line 3949 didn't jump to line 3957 because the condition on line 3949 was never true

3950 usex_type == "quotation" 

3951 and lang_code != "en" 

3952 and len(lines) > 2 

3953 ): 

3954 # for x in lines: 

3955 # print(" LINE: {}: {}" 

3956 # .format(classify_desc2(x), x)) 

3957 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3958 ref = lines[0] 

3959 lines = lines[2:] 

3960 cls1 = classify_desc2(lines[-1]) 

3961 if cls1 == "english": 

3962 i = len(lines) - 1 

3963 while ( 

3964 i > 1 

3965 and classify_desc2(lines[i - 1]) 

3966 in ENGLISH_TEXTS 

3967 ): 

3968 i -= 1 

3969 tr = "\n".join(lines[i:]) 

3970 lines = lines[:i] 

3971 

3972 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3973 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3974 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3975 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3976 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3977 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3978 ref = re.sub( 

3979 r", (volume |number |page )?“?" 

3980 r"\(please specify ([^)]|\(s\))*\)”?|" 

3981 ", text here$", 

3982 "", 

3983 ref, 

3984 ) 

3985 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3986 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3987 subtext = "\n".join(x for x in lines if x) 

3988 if not tr and lang_code != "en": 

3989 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3990 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3990 ↛ 3991line 3990 didn't jump to line 3991 because the condition on line 3990 was never true

3991 tr = m.group(2) 

3992 subtext = subtext[: m.start()] + m.group(1) 

3993 elif lines: 

3994 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

3995 if ( 3995 ↛ 3999line 3995 didn't jump to line 3999 because the condition on line 3995 was never true

3996 len(parts) == 2 

3997 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3998 ): 

3999 subtext = parts[0].strip() 

4000 tr = parts[1].strip() 

4001 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

4002 subtext = re.sub( 

4003 r"(please add an English translation of " 

4004 r"this (quote|usage example))", 

4005 "", 

4006 subtext, 

4007 ) 

4008 subtext = re.sub( 

4009 r"\s*→New International Version " "translation$", 

4010 "", 

4011 subtext, 

4012 ) # e.g. pis/Tok Pisin (Bible) 

4013 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

4014 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

4015 note = None 

4016 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

4017 if ( 4017 ↛ 4025line 4017 didn't jump to line 4025 because the condition on line 4017 was never true

4018 m is not None 

4019 and lang_code != "en" 

4020 and ( 

4021 m.group(1).startswith("with ") 

4022 or classify_desc2(m.group(1)) == "english" 

4023 ) 

4024 ): 

4025 note = m.group(1) 

4026 subtext = subtext[m.end() :] 

4027 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

4028 ref = re.sub(r",\s*→ISBN", "", ref) 

4029 ref = ref.strip() 

4030 if ref.endswith(":") or ref.endswith(","): 

4031 ref = ref[:-1].strip() 

4032 ref = re.sub(r"\s+,\s+", ", ", ref) 

4033 ref = re.sub(r"\s+", " ", ref) 

4034 if ref and not subtext: 4034 ↛ 4035line 4034 didn't jump to line 4035 because the condition on line 4034 was never true

4035 subtext = ref 

4036 ref = "" 

4037 if subtext: 

4038 dt: ExampleData = {"text": subtext} 

4039 if ref: 

4040 dt["ref"] = ref 

4041 if tr: 

4042 dt["english"] = tr # DEPRECATED for "translation" 

4043 dt["translation"] = tr 

4044 if usex_type: 

4045 dt["type"] = usex_type 

4046 if note: 4046 ↛ 4047line 4046 didn't jump to line 4047 because the condition on line 4046 was never true

4047 dt["note"] = note 

4048 if roman: 

4049 dt["roman"] = roman 

4050 if ruby: 

4051 dt["ruby"] = ruby 

4052 examples.append(dt) 

4053 

4054 return examples 
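# A minimal standalone sketch of the line-splitting heuristic above for a
# non-English example (toy_classify is a crude, punctuation-based stand-in
# for classify_desc2; all names here are illustrative):
def toy_split_example(lines: list[str]) -> dict:
    def toy_classify(s: str) -> str:
        if not s.isascii():
            return "other"
        return "english" if s.rstrip().endswith((".", "!", "?")) else "romanization"

    out = {"text": lines[0]}
    rest = lines[1:]
    # The last line that looks like English becomes the translation, and a
    # preceding romanization line becomes "roman", mirroring the branches above.
    if rest and toy_classify(rest[-1]) == "english":
        out["translation"] = rest.pop()
    if rest and toy_classify(rest[-1]) == "romanization":
        out["roman"] = rest.pop()
    return out

# toy_split_example(["犬が好きです", "inu ga suki desu", "I like dogs."])
# -> {"text": "犬が好きです", "roman": "inu ga suki desu",
#     "translation": "I like dogs."}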

4055 

4056 # Main code of parse_language() 

4057 # Process the section 

4058 stack.append(language) 

4059 process_children(langnode, None) 

4060 stack.pop() 

4061 

4062 # Finalize word entries 

4063 push_etym() 

4064 ret = [] 

4065 for data in page_datas: 

4066 merge_base(data, base_data) 

4067 ret.append(data) 

4068 

4069 # Copy all tags to word senses 

4070 for data in ret: 

4071 if "senses" not in data: 4071 ↛ 4072line 4071 didn't jump to line 4072 because the condition on line 4071 was never true

4072 continue 

4073 # WordData should not have a 'tags' field, but if it does, it's 

4074 # deleted and its contents are moved into each sense; 

4075 # that's why the type: ignore comments below. 

4076 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

4077 if "tags" in data: 

4078 del data["tags"] # type: ignore[typeddict-item] 

4079 for sense in data["senses"]: 

4080 data_extend(sense, "tags", tags) 

4081 

4082 return ret 

4083 

4084 

4085def parse_wikipedia_template( 

4086 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

4087) -> None: 

4088 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

4089 assert isinstance(wxr, WiktextractContext) 

4090 assert isinstance(data, dict) 

4091 assert isinstance(ht, dict) 

4092 langid = clean_node(wxr, data, ht.get("lang", ())) 

4093 pagename = ( 

4094 clean_node(wxr, data, ht.get(1, ())) 

4095 or wxr.wtp.title 

4096 or "MISSING_PAGE_TITLE" 

4097 ) 

4098 if langid: 

4099 data_append(data, "wikipedia", langid + ":" + pagename) 

4100 else: 

4101 data_append(data, "wikipedia", pagename) 
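# A minimal standalone sketch of the rule above (plain strings instead of
# wikitext nodes and clean_node(); names are illustrative):
def toy_wikipedia_entry(ht: dict, page_title: str) -> str:
    # {{wikipedia|lang=de|Haus}} yields "de:Haus"; with no explicit page
    # name the entry falls back to the page title.
    langid = ht.get("lang", "")
    pagename = ht.get(1, "") or page_title
    return f"{langid}:{pagename}" if langid else pagename

# toy_wikipedia_entry({"lang": "de", 1: "Haus"}, "house") == "de:Haus"
# toy_wikipedia_entry({}, "house") == "house"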

4102 

4103 

4104def parse_top_template( 

4105 wxr: WiktextractContext, node: WikiNode, data: WordData 

4106) -> None: 

4107 """Parses a template that occurs on the top-level in a page, before any 

4108 language subtitles.""" 

4109 assert isinstance(wxr, WiktextractContext) 

4110 assert isinstance(node, WikiNode) 

4111 assert isinstance(data, dict) 

4112 

4113 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

4114 if name in wikipedia_templates: 

4115 parse_wikipedia_template(wxr, data, ht) 

4116 return None 

4117 if is_panel_template(wxr, name): 

4118 return "" 

4119 if name in ("reconstruction",): 4119 ↛ 4120line 4119 didn't jump to line 4120 because the condition on line 4119 was never true

4120 return "" 

4121 if name.lower() == "also" or name.lower().startswith("also/"): 

4122 # XXX shows related words that might really have been the intended 

4123 # word, capture them 

4124 return "" 

4125 if name == "see also": 4125 ↛ 4127line 4125 didn't jump to line 4127 because the condition on line 4125 was never true

4126 # XXX capture 

4127 return "" 

4128 if name == "cardinalbox": 4128 ↛ 4130line 4128 didn't jump to line 4130 because the condition on line 4128 was never true

4129 # XXX capture 

4130 return "" 

4131 if name == "character info": 4131 ↛ 4133line 4131 didn't jump to line 4133 because the condition on line 4131 was never true

4132 # XXX capture 

4133 return "" 

4134 if name == "commonscat": 4134 ↛ 4136line 4134 didn't jump to line 4136 because the condition on line 4134 was never true

4135 # XXX capture link to Wikimedia commons 

4136 return "" 

4137 if name == "wrongtitle": 4137 ↛ 4140line 4137 didn't jump to line 4140 because the condition on line 4137 was never true

4138 # XXX this should be captured to replace page title with the 

4139 # correct title. E.g. ⿰亻革家 

4140 return "" 

4141 if name == "wikidata": 4141 ↛ 4142line 4141 didn't jump to line 4142 because the condition on line 4141 was never true

4142 arg = clean_node(wxr, data, ht.get(1, ())) 

4143 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

4144 data_append(data, "wikidata", arg) 

4145 return "" 

4146 wxr.wtp.debug( 

4147 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

4148 sortid="page/2870", 

4149 ) 

4150 return "" 

4151 

4152 clean_node(wxr, None, [node], template_fn=top_template_fn) 

4153 

4154 

4155def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

4156 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

4157 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

4158 that are next to each other.""" 

4159 

4160 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

4161 # section gets overwritten. In this case, let's just combine the two. 

4162 

4163 # In Chinese entries, Pronunciation can be preceded on the 

4164 # same level 3 by its Etymology *and* Glyph Origin sections: 

4165 # ===Glyph Origin=== 

4166 # ===Etymology=== 

4167 # ===Pronunciation=== 

4168 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

4169 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

4170 # are now level 6 

4171 

4172 # Known lowercase PoS names are in part_of_speech_map 

4173 # Known lowercase linkage section names are in linkage_map 

4174 

4175 old = re.split( 

4176 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

4177 ) 

4178 

4179 parts = [] 

4180 npar = 4 # Number of parentheses in above expression 

4181 parts.append(old[0]) 

4182 prev_level = None 

4183 level = None 

4184 skip_level_title = False # When combining etymology sections 

4185 for i in range(1, len(old), npar + 1): 

4186 left = old[i] 

4187 right = old[i + npar - 1] 

4188 # remove Wikilinks in title 

4189 title = re.sub(r"^\[\[", "", old[i + 1]) 

4190 title = re.sub(r"\]\]$", "", title) 

4191 prev_level = level 

4192 level = len(left) 

4193 part = old[i + npar] 

4194 if level != len(right): 4194 ↛ 4195line 4194 didn't jump to line 4195 because the condition on line 4194 was never true

4195 wxr.wtp.debug( 

4196 "subtitle has unbalanced levels: " 

4197 "{!r} has {} on the left and {} on the right".format( 

4198 title, left, right 

4199 ), 

4200 sortid="page/2904", 

4201 ) 

4202 lc = title.lower() 

4203 if name_to_code(title, "en") != "": 

4204 if level > 2: 4204 ↛ 4205line 4204 didn't jump to line 4205 because the condition on line 4204 was never true

4205 wxr.wtp.debug( 

4206 "subtitle has language name {} at level {}".format( 

4207 title, level 

4208 ), 

4209 sortid="page/2911", 

4210 ) 

4211 level = 2 

4212 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

4213 if level > 3: 4213 ↛ 4214line 4213 didn't jump to line 4214 because the condition on line 4213 was never true

4214 wxr.wtp.debug( 

4215 "etymology section {} at level {}".format(title, level), 

4216 sortid="page/2917", 

4217 ) 

4218 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 

4219 # sections cheek-to-cheek 

4220 skip_level_title = True 

4221 # Modify the title of previous ("Glyph Origin") section, in 

4222 # case we have a meaningful title like "Etymology 1" 

4223 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

4224 level = 3 

4225 elif lc.startswith(PRONUNCIATION_TITLE): 

4226 # Pronunciation is now a level between POS and Etymology, so 

4227 # we need to shift everything down by one 

4228 level = 4 

4229 elif lc in POS_TITLES: 

4230 level = 5 

4231 elif lc == TRANSLATIONS_TITLE: 

4232 level = 6 

4233 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 

4234 level = 6 

4235 elif lc in INFLECTION_TITLES: 

4236 level = 6 

4237 elif lc == DESCENDANTS_TITLE: 

4238 level = 6 

4239 elif title in PROTO_ROOT_DERIVED_TITLES: 4239 ↛ 4240line 4239 didn't jump to line 4240 because the condition on line 4239 was never true

4240 level = 6 

4241 elif lc in IGNORED_TITLES: 

4242 level = 6 

4243 else: 

4244 level = 6 

4245 if skip_level_title: 

4246 skip_level_title = False 

4247 parts.append(part) 

4248 else: 

4249 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

4250 parts.append(part) 

4251 # print("=" * level, title) 

4252 # if level != len(left): 

4253 # print(" FIXED LEVEL OF {} {} -> {}" 

4254 # .format(title, len(left), level)) 

4255 

4256 text = "".join(parts) 

4257 # print(text) 

4258 return text 
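# A minimal standalone sketch of the mechanism above: re.split() with the
# capturing heading pattern returns [text, "==+", title, inner, "==+", text,
# ...] (npar = 4 groups per heading), and each heading is re-emitted with a
# forced "=" count for its level. The level map here is a toy stand-in for
# the real POS_TITLES/LINKAGE_TITLES/... lookups:
import re

_TOY_HEADING_RE = r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)[ \t]*(==+)[ \t]*$"
_TOY_LEVELS = {"english": 2, "etymology": 3, "pronunciation": 4, "noun": 5}

def toy_fix_hierarchy(text: str) -> str:
    old = re.split(_TOY_HEADING_RE, text)
    npar = 4  # number of capturing groups in _TOY_HEADING_RE
    parts = [old[0]]
    for i in range(1, len(old), npar + 1):
        title = old[i + 1]
        body = old[i + npar]
        level = _TOY_LEVELS.get(title.lower(), 6)  # everything else -> level 6
        parts.append("{0}{1}{0}".format("=" * level, title))
        parts.append(body)
    return "".join(parts)

# toy_fix_hierarchy("==English==\nx\n===Noun===\ny\n") remaps "Noun" to
# level 5: "==English==\nx\n=====Noun=====\ny\n"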

4259 

4260 

4261def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

4262 # Skip translation pages 

4263 if word.endswith("/" + TRANSLATIONS_TITLE): 4263 ↛ 4264line 4263 didn't jump to line 4264 because the condition on line 4263 was never true

4264 return [] 

4265 

4266 if wxr.config.verbose: 4266 ↛ 4267line 4266 didn't jump to line 4267 because the condition on line 4266 was never true

4267 logger.info(f"Parsing page: {word}") 

4268 

4269 wxr.config.word = word 

4270 wxr.wtp.start_page(word) 

4271 

4272 # Remove <noinclude> and similar tags from main pages. They 

4273 # should not appear there, but at least net/Elfdala has one and it 

4274 # is probably not the only one. 

4275 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4276 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4277 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4278 

4279 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4280 # pages that have, for example, Translations section under Linkage, or 

4281 # Translations section on the same level as Noun. Enforce a proper 

4282 # hierarchy by manipulating the subtitle levels in certain cases. 

4283 text = fix_subtitle_hierarchy(wxr, text) 

4284 

4285 # Parse the page, pre-expanding those templates that are likely to 

4286 # influence parsing 

4287 tree = wxr.wtp.parse( 

4288 text, 

4289 pre_expand=True, 

4290 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4291 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4292 ) 

4293 # from wikitextprocessor.parser import print_tree 

4294 # print("PAGE PARSE:", print_tree(tree)) 

4295 

4296 top_data: WordData = {} 

4297 

4298 # Iterate over top-level titles, which should be languages for normal 

4299 # pages 

4300 by_lang = defaultdict(list) 

4301 for langnode in tree.children: 

4302 if not isinstance(langnode, WikiNode): 

4303 continue 

4304 if langnode.kind == NodeKind.TEMPLATE: 

4305 parse_top_template(wxr, langnode, top_data) 

4306 continue 

4307 if langnode.kind == NodeKind.LINK: 

4308 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4309 continue 

4310 if langnode.kind != NodeKind.LEVEL2: 4310 ↛ 4311line 4310 didn't jump to line 4311 because the condition on line 4310 was never true

4311 wxr.wtp.debug( 

4312 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4313 ) 

4314 continue 

4315 lang = clean_node( 

4316 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4317 ) 

4318 lang_code = name_to_code(lang, "en") 

4319 if lang_code == "": 4319 ↛ 4320line 4319 didn't jump to line 4320 because the condition on line 4319 was never true

4320 wxr.wtp.debug( 

4321 f"unrecognized language name: {lang}", sortid="page/3019" 

4322 ) 

4323 if ( 

4324 wxr.config.capture_language_codes 

4325 and lang_code not in wxr.config.capture_language_codes 

4326 ): 

4327 continue 

4328 wxr.wtp.start_section(lang) 

4329 

4330 # Collect all words from the page. 

4331 # print(f"{langnode=}") 

4332 datas = parse_language(wxr, langnode, lang, lang_code) 

4333 

4334 # Propagate fields resulting from top-level templates to this 

4335 # part-of-speech. 

4336 for data in datas: 

4337 if "lang" not in data: 4337 ↛ 4338line 4337 didn't jump to line 4338 because the condition on line 4337 was never true

4338 wxr.wtp.debug( 

4339 "internal error -- no lang in data: {}".format(data), 

4340 sortid="page/3034", 

4341 ) 

4342 continue 

4343 for k, v in top_data.items(): 

4344 assert isinstance(v, (list, tuple)) 

4345 data_extend(data, k, v) 

4346 by_lang[data["lang"]].append(data) 

4347 

4348 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4349 # field. FIX OR REMOVE. 

4350 # Do some post-processing on the words. For example, we may distribute 

4351 # conjugation information to all the words. 

4352 ret = [] 

4353 for lang, lang_datas in by_lang.items(): 

4354 ret.extend(lang_datas) 

4355 

4356 for x in ret: 

4357 if x["word"] != word: 

4358 if word.startswith("Unsupported titles/"): 

4359 wxr.wtp.debug( 

4360 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4361 sortid="20231101/3578page.py", 

4362 ) 

4363 else: 

4364 wxr.wtp.debug( 

4365 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'", 

4366 sortid="20231101/3582page.py", 

4367 ) 

4368 x["original_title"] = word 

4369 # validate tag data 

4370 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4371 return ret 

4372 

4373 

4374def recursively_separate_raw_tags( 

4375 wxr: WiktextractContext, data: dict[str, Any] 

4376) -> None: 

4377 if not isinstance(data, dict): 4377 ↛ 4378line 4377 didn't jump to line 4378 because the condition on line 4377 was never true

4378 wxr.wtp.error( 

4379 "'data' is not dict; most probably " 

4380 "data has a list that contains at least one dict and " 

4381 "at least one non-dict item", 

4382 sortid="en/page-4016/20240419", 

4383 ) 

4384 return 

4385 new_tags: list[str] = [] 

4386 raw_tags: list[str] = data.get("raw_tags", []) 

4387 for field, val in data.items(): 

4388 if field == "tags": 

4389 for tag in val: 

4390 if tag not in valid_tags: 

4391 raw_tags.append(tag) 

4392 else: 

4393 new_tags.append(tag) 

4394 if isinstance(val, list): 

4395 if len(val) > 0 and isinstance(val[0], dict): 

4396 for d in val: 

4397 recursively_separate_raw_tags(wxr, d) 

4398 if "tags" in data and not new_tags: 

4399 del data["tags"] 

4400 elif new_tags: 

4401 data["tags"] = new_tags 

4402 if raw_tags: 

4403 data["raw_tags"] = raw_tags 
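# A minimal standalone sketch of the behaviour above on a toy entry
# (_TOY_VALID_TAGS stands in for the real valid_tags set):
_TOY_VALID_TAGS = {"masculine", "plural"}

def toy_separate_raw_tags(data: dict) -> None:
    new_tags: list = []
    raw_tags: list = data.get("raw_tags", [])
    for field, val in data.items():
        if field == "tags":
            for tag in val:
                (new_tags if tag in _TOY_VALID_TAGS else raw_tags).append(tag)
        if isinstance(val, list) and val and isinstance(val[0], dict):
            for d in val:
                toy_separate_raw_tags(d)
    if "tags" in data and not new_tags:
        del data["tags"]
    elif new_tags:
        data["tags"] = new_tags
    if raw_tags:
        data["raw_tags"] = raw_tags

# entry = {"tags": ["masculine", "Jarai"], "senses": [{"tags": ["plural", "Hanoi"]}]}
# toy_separate_raw_tags(entry)
# entry == {"tags": ["masculine"], "raw_tags": ["Jarai"],
#           "senses": [{"tags": ["plural"], "raw_tags": ["Hanoi"]}]}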

4404 

4405 

4406def process_soft_redirect_template( 

4407 wxr: WiktextractContext, 

4408 template_node: TemplateNode, 

4409 redirect_pages: list[str], 

4410) -> bool: 

4411 # Return `True` if the template is a soft-redirect template. 

4412 if template_node.template_name == "zh-see": 

4413 # https://en.wiktionary.org/wiki/Template:zh-see 

4414 title = clean_node( 

4415 wxr, None, template_node.template_parameters.get(1, "") 

4416 ) 

4417 if title != "": 4417 ↛ 4419line 4417 didn't jump to line 4419 because the condition on line 4417 was always true

4418 redirect_pages.append(title) 

4419 return True 

4420 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4421 # https://en.wiktionary.org/wiki/Template:ja-see 

4422 for key, value in template_node.template_parameters.items(): 

4423 if isinstance(key, int): 4423 ↛ 4422line 4423 didn't jump to line 4422 because the condition on line 4423 was always true

4424 title = clean_node(wxr, None, value) 

4425 if title != "": 4425 ↛ 4422line 4425 didn't jump to line 4422 because the condition on line 4425 was always true

4426 redirect_pages.append(title) 

4427 return True 

4428 return False 
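# A minimal standalone sketch of the two soft-redirect cases above (plain
# dicts instead of TemplateNode objects; names are illustrative):
def toy_soft_redirect_targets(template_name: str, params: dict) -> list:
    # {{zh-see|個}} points at a single page; {{ja-see|光る|ひかる}} may list several.
    if template_name == "zh-see":
        return [params[1]] if params.get(1) else []
    if template_name in ("ja-see", "ja-see-kango"):
        return [v for k, v in params.items() if isinstance(k, int) and v]
    return []

# toy_soft_redirect_targets("zh-see", {1: "個"}) == ["個"]
# toy_soft_redirect_targets("ja-see", {1: "光る", 2: "ひかる"}) == ["光る", "ひかる"]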

4429 

4430 

4431ZH_FORMS_TAGS = { 

4432 "trad.": "Traditional-Chinese", 

4433 "simp.": "Simplified-Chinese", 

4434 "alternative forms": "alternative", 

4435 "2nd round simp.": "Second-Round-Simplified-Chinese", 

4436} 

4437 

4438 

4439def extract_zh_forms_template( 

4440 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData 

4441): 

4442 # https://en.wiktionary.org/wiki/Template:zh-forms 

4443 lit_meaning = clean_node( 

4444 wxr, None, t_node.template_parameters.get("lit", "") 

4445 ) 

4446 if lit_meaning != "": 

4447 base_data["literal_meaning"] = lit_meaning 

4448 expanded_node = wxr.wtp.parse( 

4449 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

4450 ) 

4451 for table in expanded_node.find_child(NodeKind.TABLE): 

4452 for row in table.find_child(NodeKind.TABLE_ROW): 

4453 row_header = "" 

4454 row_header_tags = [] 

4455 header_has_span = False 

4456 for cell in row.find_child( 

4457 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL 

4458 ): 

4459 if cell.kind == NodeKind.TABLE_HEADER_CELL: 

4460 row_header, row_header_tags, header_has_span = ( 

4461 extract_zh_forms_header_cell(wxr, base_data, cell) 

4462 ) 

4463 elif not header_has_span: 

4464 extract_zh_forms_data_cell( 

4465 wxr, base_data, cell, row_header, row_header_tags 

4466 ) 

4467 

4468 if "forms" in base_data and len(base_data["forms"]) == 0: 4468 ↛ 4469line 4468 didn't jump to line 4469 because the condition on line 4468 was never true

4469 del base_data["forms"] 

4470 

4471 

4472def extract_zh_forms_header_cell( 

4473 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode 

4474) -> tuple[str, list[str], bool]: 

4475 row_header = "" 

4476 row_header_tags = [] 

4477 header_has_span = False 

4478 first_span_index = len(header_cell.children) 

4479 for index, span_tag in header_cell.find_html("span", with_index=True): 

4480 if index < first_span_index: 4480 ↛ 4482line 4480 didn't jump to line 4482 because the condition on line 4480 was always true

4481 first_span_index = index 

4482 header_has_span = True 

4483 row_header = clean_node(wxr, None, header_cell.children[:first_span_index]) 

4484 for raw_tag in row_header.split(" and "): 

4485 raw_tag = raw_tag.strip() 

4486 if raw_tag != "": 

4487 row_header_tags.append(raw_tag) 

4488 for span_tag in header_cell.find_html_recursively("span"): 

4489 span_lang = span_tag.attrs.get("lang", "") 

4490 form_nodes = [] 

4491 sup_title = "" 

4492 for node in span_tag.children: 

4493 if isinstance(node, HTMLNode) and node.tag == "sup": 4493 ↛ 4494line 4493 didn't jump to line 4494 because the condition on line 4493 was never true

4494 for sup_span in node.find_html("span"): 

4495 sup_title = sup_span.attrs.get("title", "") 

4496 else: 

4497 form_nodes.append(node) 

4498 if span_lang in ["zh-Hant", "zh-Hans"]: 

4499 for word in clean_node(wxr, None, form_nodes).split("/"): 

4500 if word not in [wxr.wtp.title, ""]: 

4501 form = {"form": word} 

4502 for raw_tag in row_header_tags: 

4503 if raw_tag in ZH_FORMS_TAGS: 4503 ↛ 4506line 4503 didn't jump to line 4506 because the condition on line 4503 was always true

4504 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag]) 

4505 else: 

4506 data_append(form, "raw_tags", raw_tag) 

4507 if sup_title != "": 4507 ↛ 4508line 4507 didn't jump to line 4508 because the condition on line 4507 was never true

4508 data_append(form, "raw_tags", sup_title) 

4509 data_append(base_data, "forms", form) 

4510 return row_header, row_header_tags, header_has_span 

4511 

4512 

4513def extract_zh_forms_data_cell( 

4514 wxr: WiktextractContext, 

4515 base_data: WordData, 

4516 cell: WikiNode, 

4517 row_header: str, 

4518 row_header_tags: list[str], 

4519): 

4520 from .zh_pron_tags import ZH_PRON_TAGS 

4521 

4522 forms = [] 

4523 for top_span_tag in cell.find_html("span"): 

4524 span_style = top_span_tag.attrs.get("style", "") 

4525 span_lang = top_span_tag.attrs.get("lang", "") 

4526 if span_style == "white-space:nowrap;": 

4527 extract_zh_forms_data_cell( 

4528 wxr, base_data, top_span_tag, row_header, row_header_tags 

4529 ) 

4530 elif "font-size:80%" in span_style: 

4531 raw_tag = clean_node(wxr, None, top_span_tag) 

4532 if raw_tag != "": 4532 ↛ 4523line 4532 didn't jump to line 4523 because the condition on line 4532 was always true

4533 for form in forms: 

4534 if raw_tag in ZH_PRON_TAGS: 4534 ↛ 4540line 4534 didn't jump to line 4540 because the condition on line 4534 was always true

4535 tr_tag = ZH_PRON_TAGS[raw_tag] 

4536 if isinstance(tr_tag, list): 4536 ↛ 4537line 4536 didn't jump to line 4537 because the condition on line 4536 was never true

4537 data_extend(form, "tags", tr_tag) 

4538 elif isinstance(tr_tag, str): 4538 ↛ 4533line 4538 didn't jump to line 4533 because the condition on line 4538 was always true

4539 data_append(form, "tags", tr_tag) 

4540 elif raw_tag in valid_tags: 

4541 data_append(form, "tags", raw_tag) 

4542 else: 

4543 data_append(form, "raw_tags", raw_tag) 

4544 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]: 4544 ↛ 4523line 4544 didn't jump to line 4523 because the condition on line 4544 was always true

4545 word = clean_node(wxr, None, top_span_tag) 

4546 if word not in ["", "/", wxr.wtp.title]: 

4547 form = {"form": word} 

4548 if row_header != "anagram": 4548 ↛ 4554line 4548 didn't jump to line 4554 because the condition on line 4548 was always true

4549 for raw_tag in row_header_tags: 

4550 if raw_tag in ZH_FORMS_TAGS: 4550 ↛ 4553line 4550 didn't jump to line 4553 because the condition on line 4550 was always true

4551 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag]) 

4552 else: 

4553 data_append(form, "raw_tags", raw_tag) 

4554 if span_lang == "zh-Hant": 

4555 data_append(form, "tags", "Traditional-Chinese") 

4556 elif span_lang == "zh-Hans": 

4557 data_append(form, "tags", "Simplified-Chinese") 

4558 forms.append(form) 

4559 

4560 if row_header == "anagram": 4560 ↛ 4561line 4560 didn't jump to line 4561 because the condition on line 4560 was never true

4561 for form in forms: 

4562 l_data = {"word": form["form"]} 

4563 for key in ["tags", "raw_tags"]: 

4564 if key in form: 

4565 l_data[key] = form[key] 

4566 data_append(base_data, "anagrams", l_data) 

4567 else: 

4568 data_extend(base_data, "forms", forms)