Coverage for src/wiktextract/extractor/en/page.py: 79%

1839 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-23 09:14 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8from collections import defaultdict 

9from functools import partial 

10from typing import ( 

11 TYPE_CHECKING, 

12 Any, 

13 Iterable, 

14 Literal, 

15 Optional, 

16 Set, 

17 Union, 

18 cast, 

19) 

20 

21from mediawiki_langcodes import get_all_names, name_to_code 

22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

23from wikitextprocessor.parser import ( 

24 LEVEL_KIND_FLAGS, 

25 GeneralNode, 

26 HTMLNode, 

27 LevelNode, 

28 NodeKind, 

29 TemplateNode, 

30 WikiNode, 

31) 

32 

33from ...clean import clean_template_args, clean_value 

34from ...datautils import ( 

35 data_append, 

36 data_extend, 

37 ns_title_prefix_tuple, 

38) 

39from ...page import ( 

40 LEVEL_KINDS, 

41 clean_node, 

42 is_panel_template, 

43 recursively_extract, 

44) 

45from ...tags import valid_tags 

46from ...wxr_context import WiktextractContext 

47from ...wxr_logging import logger 

48from ..ruby import extract_ruby, parse_ruby 

49from ..share import strip_nodes 

50from .descendant import extract_descendant_section 

51from .example import extract_example_list_item, extract_template_zh_x 

52from .form_descriptions import ( 

53 classify_desc, 

54 decode_tags, 

55 distw, 

56 parse_alt_or_inflection_of, 

57 parse_sense_qualifier, 

58 parse_word_head, 

59) 

60from .inflection import TableContext, parse_inflection_section 

61from .info_templates import ( 

62 INFO_TEMPLATE_FUNCS, 

63 parse_info_template_arguments, 

64 parse_info_template_node, 

65) 

66from .linkages import ( 

67 extract_alt_form_section, 

68 parse_linkage, 

69) 

70from .parts_of_speech import PARTS_OF_SPEECH 

71from .section_titles import ( 

72 COMPOUNDS_TITLE, 

73 DESCENDANTS_TITLE, 

74 ETYMOLOGY_TITLES, 

75 IGNORED_TITLES, 

76 INFLECTION_TITLES, 

77 LINKAGE_TITLES, 

78 POS_TITLES, 

79 PRONUNCIATION_TITLE, 

80 PROTO_ROOT_DERIVED_TITLES, 

81 TRANSLATIONS_TITLE, 

82) 

83from .translations import parse_translation_item_text 

84from .type_utils import ( 

85 AttestationData, 

86 ExampleData, 

87 FormData, 

88 LinkageData, 

89 ReferenceData, 

90 SenseData, 

91 SoundData, 

92 TemplateData, 

93 WordData, 

94) 

95from .unsupported_titles import unsupported_title_map 

96 

97# When determining whether a string is 'english', classify_desc 

98# might return 'taxonomic' which is English text 99% of the time. 

99ENGLISH_TEXTS = ("english", "taxonomic") 

100 

101# Matches head tag 

102HEAD_TAG_RE = re.compile( 

103 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

104 r"hangul-symbol|syllable-hangul)$|" 

105 + r"^(latin|" 

106 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

107 + r")-(" 

108 + "|".join( 

109 [ 

110 "abbr", 

111 "adj", 

112 "adjective", 

113 "adjective form", 

114 "adjective-form", 

115 "adv", 

116 "adverb", 

117 "affix", 

118 "animal command", 

119 "art", 

120 "article", 

121 "aux", 

122 "bound pronoun", 

123 "bound-pronoun", 

124 "Buyla", 

125 "card num", 

126 "card-num", 

127 "cardinal", 

128 "chunom", 

129 "classifier", 

130 "clitic", 

131 "cls", 

132 "cmene", 

133 "cmavo", 

134 "colloq-verb", 

135 "colverbform", 

136 "combining form", 

137 "combining-form", 

138 "comparative", 

139 "con", 

140 "concord", 

141 "conj", 

142 "conjunction", 

143 "conjug", 

144 "cont", 

145 "contr", 

146 "converb", 

147 "daybox", 

148 "decl", 

149 "decl noun", 

150 "def", 

151 "dem", 

152 "det", 

153 "determ", 

154 "Deva", 

155 "ending", 

156 "entry", 

157 "form", 

158 "fuhivla", 

159 "gerund", 

160 "gismu", 

161 "hanja", 

162 "hantu", 

163 "hanzi", 

164 "head", 

165 "ideophone", 

166 "idiom", 

167 "inf", 

168 "indef", 

169 "infixed pronoun", 

170 "infixed-pronoun", 

171 "infl", 

172 "inflection", 

173 "initialism", 

174 "int", 

175 "interfix", 

176 "interj", 

177 "interjection", 

178 "jyut", 

179 "latin", 

180 "letter", 

181 "locative", 

182 "lujvo", 

183 "monthbox", 

184 "mutverb", 

185 "name", 

186 "nisba", 

187 "nom", 

188 "noun", 

189 "noun form", 

190 "noun-form", 

191 "noun plural", 

192 "noun-plural", 

193 "nounprefix", 

194 "num", 

195 "number", 

196 "numeral", 

197 "ord", 

198 "ordinal", 

199 "par", 

200 "part", 

201 "part form", 

202 "part-form", 

203 "participle", 

204 "particle", 

205 "past", 

206 "past neg", 

207 "past-neg", 

208 "past participle", 

209 "past-participle", 

210 "perfect participle", 

211 "perfect-participle", 

212 "personal pronoun", 

213 "personal-pronoun", 

214 "pref", 

215 "prefix", 

216 "phrase", 

217 "pinyin", 

218 "plural noun", 

219 "plural-noun", 

220 "pos", 

221 "poss-noun", 

222 "post", 

223 "postp", 

224 "postposition", 

225 "PP", 

226 "pp", 

227 "ppron", 

228 "pred", 

229 "predicative", 

230 "prep", 

231 "prep phrase", 

232 "prep-phrase", 

233 "preposition", 

234 "present participle", 

235 "present-participle", 

236 "pron", 

237 "prondem", 

238 "pronindef", 

239 "pronoun", 

240 "prop", 

241 "proper noun", 

242 "proper-noun", 

243 "proper noun form", 

244 "proper-noun form", 

245 "proper noun-form", 

246 "proper-noun-form", 

247 "prov", 

248 "proverb", 

249 "prpn", 

250 "prpr", 

251 "punctuation mark", 

252 "punctuation-mark", 

253 "regnoun", 

254 "rel", 

255 "rom", 

256 "romanji", 

257 "root", 

258 "sign", 

259 "suff", 

260 "suffix", 

261 "syllable", 

262 "symbol", 

263 "verb", 

264 "verb form", 

265 "verb-form", 

266 "verbal noun", 

267 "verbal-noun", 

268 "verbnec", 

269 "vform", 

270 ] 

271 ) 

272 + r")(-|/|\+|$)" 

273) 

274 

275# Head-templates causing problems (like newlines) that can be squashed into 

276# an empty string in the template handler while saving their template 

277# data for later. 

278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

279 

280# Annoying templates that should be in etymology sections, but sometimes 

281# are thrown in heads because the etymology section is missing, like at 

282# the oldest level of a reconstruction: see wiktextract#1658 

283ETYMOLOGY_TEMPLATES_IN_HEADS = { 

284 "etymon", 

285} 

286 

287PROBLEMATIC_TEMPLATES_CLUMP = ( 

288 WORD_LEVEL_HEAD_TEMPLATES | ETYMOLOGY_TEMPLATES_IN_HEADS 

289) 

290 

291FLOATING_TABLE_TEMPLATES: set[str] = { 

292 # az-suffix-form creates a style=floatright div that is otherwise 

293 # deleted; if it is not pre-expanded, we can intercept the template 

294 # so we add this set into do_not_pre_expand, and intercept the 

295 # templates in parse_part_of_speech 

296 "az-suffix-forms", 

297 "az-inf-p", 

298 "kk-suffix-forms", 

299 "ky-suffix-forms", 

300 "tr-inf-p", 

301 "tr-suffix-forms", 

302 "tt-suffix-forms", 

303 "uz-suffix-forms", 

304} 

305# These two should contain template names that should always be 

306# pre-expanded when *first* processing the tree, or not pre-expanded 

307# so that the template are left in place with their identifying 

308# name intact for later filtering. 

309 

310DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

311DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

312 

313# Additional templates to be expanded in the pre-expand phase 

314ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

315 "multitrans", 

316 "multitrans-nowiki", 

317 "trans-top", 

318 "trans-top-also", 

319 "trans-bottom", 

320 "checktrans-top", 

321 "checktrans-bottom", 

322 "col", 

323 "col1", 

324 "col2", 

325 "col3", 

326 "col4", 

327 "col5", 

328 "col1-u", 

329 "col2-u", 

330 "col3-u", 

331 "col4-u", 

332 "col5-u", 

333 "check deprecated lang param usage", 

334 "deprecated code", 

335 "ru-verb-alt-ё", 

336 "ru-noun-alt-ё", 

337 "ru-adj-alt-ё", 

338 "ru-proper noun-alt-ё", 

339 "ru-pos-alt-ё", 

340 "ru-alt-ё", 

341 "inflection of", 

342 "no deprecated lang param usage", 

343 "transclude", # these produce sense entries (or other lists) 

344 "tcl", 

345} 

346 

347# Inverse linkage for those that have them 

348linkage_inverses: dict[str, str] = { 

349 # XXX this is not currently used, move to post-processing 

350 "synonyms": "synonyms", 

351 "hypernyms": "hyponyms", 

352 "hyponyms": "hypernyms", 

353 "holonyms": "meronyms", 

354 "meronyms": "holonyms", 

355 "derived": "derived_from", 

356 "coordinate_terms": "coordinate_terms", 

357 "troponyms": "hypernyms", 

358 "antonyms": "antonyms", 

359 "instances": "instance_of", 

360 "related": "related", 

361} 

362 

# Templates that are used to form panels on pages and that
# should be ignored in various positions
PANEL_TEMPLATES: set[str] = {
    "Character info",
    "CJKV",
    "French personal pronouns",
    "French possessive adjectives",
    "French possessive pronouns",
    "Han etym",
    "Han etyl",  # this redirects to Han etym and would cause Lua errors,
    # and I don't know why, but I'm putting it here because
    # we should be ignoring it anyhow.
    "Japanese demonstratives",
    "Latn-script",
    "LDL",
    "MW1913Abbr",
    "Number-encoding",
    "Nuttall",
    "Spanish possessive adjectives",
    "Spanish possessive pronouns",
    "USRegionDisputed",
    "Webster 1913",
    "ase-rfr",
    "attention",
    "attn",
    "beer",
    "broken ref",
    "ca-compass",
    "character info",
    "character info/var",
    "checksense",
    "compass-fi",
    "copyvio suspected",
    "delete",
    "dial syn",  # Currently ignore these, but could be useful in Chinese/Korean
    "etystub",
    "examples",
    "hu-corr",
    "hu-suff-pron",
    "interwiktionary",
    "ja-kanjitab",
    "ja-kt",
    "ko-hanja-search",
    "look",
    "maintenance box",
    "maintenance line",
    "mediagenic terms",
    "merge",
    "missing template",
    "morse links",
    "move",
    "multiple images",
    "no inline",
    "picdic",
    "picdicimg",
    "picdiclabel",
    "polyominoes",
    # NOTE(review): "predidential nomics" looks like a misspelling of
    # "presidential nomics"; the old spelling is kept so that nothing that
    # matched before stops matching -- confirm the real template name and
    # drop the misspelled entry if it is unused.
    "predidential nomics",
    "presidential nomics",
    "punctuation",  # This actually gets pre-expanded
    "reconstructed",
    "request box",
    "rf-sound example",
    "rfaccents",
    "rfap",
    "rfaspect",
    "rfc",
    "rfc-auto",
    "rfc-header",
    "rfc-level",
    "rfc-pron-n",
    "rfc-sense",
    "rfclarify",
    "rfd",
    "rfd-redundant",
    "rfd-sense",
    "rfdate",
    "rfdatek",
    "rfdef",
    "rfe",
    "rfe/dowork",
    "rfex",
    "rfexp",
    "rfform",
    "rfgender",
    "rfi",
    "rfinfl",
    "rfm",
    "rfm-sense",
    "rfp",
    "rfp-old",
    "rfquote",
    "rfquote-sense",
    "rfquotek",
    "rfref",
    "rfscript",
    "rft2",
    "rftaxon",
    "rftone",
    "rftranslit",
    "rfv",
    "rfv-etym",
    "rfv-pron",
    "rfv-quote",
    "rfv-sense",
    "selfref",
    "split",
    "stroke order",  # XXX consider capturing this?
    "stub entry",
    "t-needed",
    "tbot entry",
    "tea room",
    "tea room sense",
    # "ttbc", - XXX needed in at least on/Preposition/Translation page
    "unblock",
    "unsupportedpage",
    "video frames",
    "was wotd",
    "wrongtitle",
    "zh-forms",
    "zh-hanzi-box",
    "no entry",
}

485 

486# Template name prefixes used for language-specific panel templates (i.e., 

487# templates that create side boxes or notice boxes or that should generally 

488# be ignored). 

489PANEL_PREFIXES: set[str] = { 

490 "list:compass points/", 

491 "list:Gregorian calendar months/", 

492 "RQ:", 

493} 

494 

495# Templates used for wikipedia links. 

496wikipedia_templates: set[str] = { 

497 "wikipedia", 

498 "slim-wikipedia", 

499 "w", 

500 "W", 

501 "swp", 

502 "wiki", 

503 "Wikipedia", 

504 "wtorw", 

505} 

# Import-time sanity check: a template name should not be listed both as an
# ignorable panel template and as a Wikipedia-link template.  The exact
# panel names live in PANEL_TEMPLATES (PANEL_PREFIXES only holds name
# prefixes), so both sets are compared against wikipedia_templates.
for x in (PANEL_TEMPLATES | PANEL_PREFIXES) & wikipedia_templates:
    print(
        "WARNING: {!r} in both panel_templates and wikipedia_templates".format(
            x
        )
    )

512 

513# Mapping from a template name (without language prefix) for the main word 

514# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

515# it could validly occur. This is used as just a sanity check to give 

516# warnings about probably incorrect coding in Wiktionary. 

517template_allowed_pos_map: dict[str, list[str]] = { 

518 "abbr": ["abbrev"], 

519 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

520 "plural noun": ["noun", "name"], 

521 "plural-noun": ["noun", "name"], 

522 "proper noun": ["noun", "name"], 

523 "proper-noun": ["name", "noun"], 

524 "prop": ["name", "noun"], 

525 "verb": ["verb", "phrase"], 

526 "gerund": ["verb"], 

527 "particle": ["adv", "particle"], 

528 "adj": ["adj", "adj_noun"], 

529 "pron": ["pron", "noun"], 

530 "name": ["name", "noun"], 

531 "adv": ["adv", "intj", "conj", "particle"], 

532 "phrase": ["phrase", "prep_phrase"], 

533 "noun phrase": ["phrase"], 

534 "ordinal": ["num"], 

535 "number": ["num"], 

536 "pos": ["affix", "name", "num"], 

537 "suffix": ["suffix", "affix"], 

538 "character": ["character"], 

539 "letter": ["character"], 

540 "kanji": ["character"], 

541 "cont": ["abbrev"], 

542 "interj": ["intj"], 

543 "con": ["conj"], 

544 "part": ["particle"], 

545 "prep": ["prep", "postp"], 

546 "postp": ["postp"], 

547 "misspelling": ["noun", "adj", "verb", "adv"], 

548 "part-form": ["verb"], 

549} 

# Import-time sanity check: every part-of-speech listed in
# template_allowed_pos_map must be a known key of PARTS_OF_SPEECH.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            print(
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )
            # Raised explicitly instead of `assert False`: assert
            # statements are removed under `python -O`, which would let a
            # broken table pass this check with only a printed message.
            raise AssertionError(
                "bad part of speech {!r} in template_allowed_pos_map".format(
                    x
                )
            )

558 

559 

560# Templates ignored during etymology extraction, i.e., these will not be listed 

561# in the extracted etymology templates. 

562ignored_etymology_templates: list[str] = [ 

563 "...", 

564 "IPAchar", 

565 "ipachar", 

566 "ISBN", 

567 "isValidPageName", 

568 "redlink category", 

569 "deprecated code", 

570 "check deprecated lang param usage", 

571 "para", 

572 "p", 

573 "cite", 

574 "Cite news", 

575 "Cite newsgroup", 

576 "cite paper", 

577 "cite MLLM 1976", 

578 "cite journal", 

579 "cite news/documentation", 

580 "cite paper/documentation", 

581 "cite video game", 

582 "cite video game/documentation", 

583 "cite newsgroup", 

584 "cite newsgroup/documentation", 

585 "cite web/documentation", 

586 "cite news", 

587 "Cite book", 

588 "Cite-book", 

589 "cite book", 

590 "cite web", 

591 "cite-usenet", 

592 "cite-video/documentation", 

593 "Cite-journal", 

594 "rfe", 

595 "catlangname", 

596 "cln", 

597 "langname-lite", 

598 "no deprecated lang param usage", 

599 "mention", 

600 "m", 

601 "m-self", 

602 "link", 

603 "l", 

604 "ll", 

605 "l-self", 

606] 

607# Regexp for matching ignored etymology template names. This adds certain 

608# prefixes to the names listed above. 

609ignored_etymology_templates_re = re.compile( 

610 r"^((cite-|R:|RQ:).*|" 

611 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

612 + r")$" 

613) 

614 

615# Regexp for matching ignored descendants template names. Right now we just 

616# copy the ignored etymology templates 

617ignored_descendants_templates_re = ignored_etymology_templates_re 

618 

# Set of template names that are used to define usage examples.  If the usage
# example contains one of these templates, then its type is set to
# "example"
usex_templates: set[str] = {
    "afex",
    "affixusex",
    "co",  # {{collocation}} acts like an example template, specifically for
    # pairs of combinations of words that are more common than you'd
    # expect would be randomly; hlavní#Czech
    "coi",
    "collocation",
    "el-example",
    "el-x",
    "example",
    "examples",
    "he-usex",
    "he-x",
    "hi-usex",
    "hi-x",
    "ja-usex-inline",
    "ja-usex",
    "ja-x",
    "jbo-example",
    "jbo-x",
    "km-usex",
    "km-x",
    "ko-usex",
    "ko-x",
    "lo-usex",
    "lo-x",
    "ne-x",
    "ne-usex",
    "prefixusex",
    "ryu-usex",
    "ryu-x",
    "shn-usex",
    "shn-x",
    "suffixusex",
    "th-usex",
    "th-x",
    "ur-usex",
    "ur-x",
    "usex",
    "usex-suffix",
    "ux",
    "uxi",
}

666 

667stop_head_at_these_templates: set[str] = { 

668 "category", 

669 "cat", 

670 "topics", 

671 "catlangname", 

672 "c", 

673 "C", 

674 "top", 

675 "cln", 

676} 

677 

678# Set of template names that are used to define quotation examples. If the 

679# usage example contains one of these templates, then its type is set to 

680# "quotation". 

681quotation_templates: set[str] = { 

682 "collapse-quote", 

683 "quote-av", 

684 "quote-book", 

685 "quote-GYLD", 

686 "quote-hansard", 

687 "quotei", 

688 "quote-journal", 

689 "quotelite", 

690 "quote-mailing list", 

691 "quote-meta", 

692 "quote-newsgroup", 

693 "quote-song", 

694 "quote-text", 

695 "quote", 

696 "quote-us-patent", 

697 "quote-video game", 

698 "quote-web", 

699 "quote-wikipedia", 

700 "wikiquote", 

701 "Wikiquote", 

702 "Q", 

703} 

704 

705taxonomy_templates = { 

706 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

707 "taxfmt", 

708 "taxlink", 

709 "taxlink2", 

710 "taxlinknew", 

711 "taxlook", 

712} 

713 

714# Template names, this was extracted from template_linkage_mappings, 

715# because the code using template_linkage_mappings was actually not used 

716# (but not removed). 

717template_linkages_to_ignore_in_examples: set[str] = { 

718 "syn", 

719 "synonyms", 

720 "ant", 

721 "antonyms", 

722 "hyp", 

723 "hyponyms", 

724 "der", 

725 "derived terms", 

726 "coordinate terms", 

727 "cot", 

728 "rel", 

729 "col", 

730 "inline alt forms", 

731 "alti", 

732 "comeronyms", 

733 "holonyms", 

734 "holo", 

735 "hypernyms", 

736 "hyper", 

737 "meronyms", 

738 "mero", 

739 "troponyms", 

740 "perfectives", 

741 "pf", 

742 "imperfectives", 

743 "impf", 

744 "syndiff", 

745 "synsee", 

746 # not linkage nor example templates 

747 "sense", 

748 "s", 

749 "color panel", 

750 "colour panel", 

751} 

752 

753# Maps template name used in a word sense to a linkage field that it adds. 

754sense_linkage_templates: dict[str, str] = { 

755 "syn": "synonyms", 

756 "synonyms": "synonyms", 

757 "synsee": "synonyms", 

758 "syndiff": "synonyms", 

759 "hyp": "hyponyms", 

760 "hyponyms": "hyponyms", 

761 "ant": "antonyms", 

762 "antonyms": "antonyms", 

763 "alti": "related", 

764 "inline alt forms": "related", 

765 "coordinate terms": "coordinate_terms", 

766 "cot": "coordinate_terms", 

767 "comeronyms": "related", 

768 "holonyms": "holonyms", 

769 "holo": "holonyms", 

770 "hypernyms": "hypernyms", 

771 "hyper": "hypernyms", 

772 "meronyms": "meronyms", 

773 "mero": "meronyms", 

774 "troponyms": "troponyms", 

775 "perfectives": "related", 

776 "pf": "related", 

777 "imperfectives": "related", 

778 "impf": "related", 

779 "parasynonyms": "synonyms", 

780 "par": "synonyms", 

781 "parasyn": "synonyms", 

782 "nearsyn": "synonyms", 

783 "near-syn": "synonyms", 

784} 

785 

786sense_linkage_templates_tags: dict[str, list[str]] = { 

787 "alti": ["alternative"], 

788 "inline alt forms": ["alternative"], 

789 "comeronyms": ["comeronym"], 

790 "perfectives": ["perfective"], 

791 "pf": ["perfective"], 

792 "imperfectives": ["imperfective"], 

793 "impf": ["imperfective"], 

794} 

795 

796 

def decode_html_entities(v: Union[str, int]) -> str:
    """Return ``v`` as a string with HTML entities replaced by the Unicode
    characters/strings they stand for.

    An integer cannot contain entities (those need the ``&xx;`` escape
    form), so it is simply converted with ``str``; a string is passed
    through ``html.unescape``.
    """
    return str(v) if isinstance(v, int) else html.unescape(v)

807 

808 

def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
    pos: str,
) -> None:
    """Parses a linkage (synonym, etc) specified in a word sense.

    ``name`` is the template name and must be a key of
    ``sense_linkage_templates``; the mapped value is the field of ``data``
    that receives the extracted linkage entries.  ``ht`` holds the template
    arguments: positional argument 1 is read as the language code for
    thesaurus lookups, positional arguments 2..19 are the linked words, and
    ``q<n>`` / ``t<n>`` (with n = position - 1) are an optional qualifier
    and translation for each word.  ``pos`` is passed on to the thesaurus
    search.  Nothing is returned; results are appended to ``data``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    # Extra tags attached to every entry produced by certain templates.
    field_tags = sense_linkage_templates_tags.get(name, [])
    # Walk the positional arguments until the first missing one.
    for i in range(2, 20):
        if i not in ht:
            break
        w = clean_node(wxr, data, ht[i])
        # Drop everything from the first "#" onwards.
        if "#" in w:
            w = w[: w.index("#")]
        if w in ["", "<"]:  # `<` used in "hypernyms" template
            continue
        # Note the precedence: this reads as
        # (i > 2 and w is a separator) or (w starts with "see also").
        if (
            i > 2
            and w in (",", "or", ";")
            or w.startswith(("see also", "See also"))
        ):
            continue
        is_thesaurus = False
        # A word prefixed with a Thesaurus namespace alias is not stored
        # itself; the matching thesaurus entries are added instead.
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if w.startswith(alias):
                is_thesaurus = True
                w = w[len(alias) :]
                # The lookup is skipped when the thesaurus page has the same
                # title as the page being processed.
                if w != wxr.wtp.title:
                    from ...thesaurus import search_thesaurus

                    lang_code = clean_node(wxr, None, ht.get(1, ""))
                    for t_data in search_thesaurus(
                        wxr.thesaurus_db_conn,  # type: ignore
                        w,
                        lang_code,
                        pos,
                        "synonyms",  # GH issue #1570
                    ):
                        l_data: LinkageData = {
                            "word": t_data.term,
                            "source": "Thesaurus:" + w,
                        }
                        if len(t_data.tags) > 0:
                            l_data["tags"] = t_data.tags
                        if len(t_data.raw_tags) > 0:
                            l_data["raw_tags"] = t_data.raw_tags
                        data_append(data, field, l_data)
                # Only the first matching alias is handled.
                break
        if is_thesaurus:
            continue
        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # `english` is still None at this point, so in practice the
                # else branch is the one that runs.
                if english:
                    english += "; " + q
                else:
                    english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            if english:
                english += "; " + t
            else:
                english = t

        # See if the linkage contains a parenthesized alt
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        # Assemble the linkage entry; optional keys are only set when there
        # is something to store.
        dt = {"word": w}
        if field_tags:
            data_extend(dt, "tags", field_tags)
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english  # DEPRECATED for "translation"
            dt["translation"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)

910 

911 

# Separator between the parts of an example line: one or more horizontal
# bars / em dashes together with any whitespace around them.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# Same pattern inside a capturing group, so that re.split() keeps the
# separators in its result.
captured_splitters_re = re.compile(f"({EXAMPLE_SPLITTERS})")


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """Re-split a line of example text using the template arguments as a
    guide.

    When the plain split of ``line`` looks wrong, this counts how many
    occurrences of the splitting regex appear inside the two main template
    arguments (argument 2 for the original-language example; argument 3,
    ``translation`` or ``t`` for the English translation) and glues that
    many pieces of ``line`` back together for each of them.

    Returns ``[original, translation, *remaining parts]``, or ``None`` if
    either of the first two parts comes out empty.
    """

    def splitter_count(text: str) -> int:
        return len(example_splitter_re.findall(text))

    # Splitting with the capturing regex keeps the separators:
    # ["First", " – ", "second", " – ", "third..."]
    pieces = captured_splitters_re.split(line)

    # A text containing k separators spans 1 + 2k consecutive pieces.
    original_end = 1 + 2 * splitter_count(targs.get(2, ""))
    translation = (
        targs.get(3) or targs.get("translation") or targs.get("t", "")
    )
    # pieces[original_end] is the separator expected between the original
    # text and its translation, hence the extra + 2 (separator + first piece).
    translation_end = original_end + 2 + 2 * splitter_count(translation)

    original_text = "".join(pieces[:original_end])
    translation_text = "".join(pieces[original_end + 1 : translation_end])
    if not (original_text and translation_text):
        return None
    # Every second remaining piece is a separator; keep only the text parts.
    return [
        original_text,
        translation_text,
        *pieces[translation_end + 1 :: 2],
    ]

944 

945 

946QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

947QUALIFIERS_RE = re.compile(QUALIFIERS) 

948# (...): ... or (...(...)...): ... 

949 

950 

951def parse_language( 

952 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

953) -> list[WordData]: 

954 """Iterates over the text of the page, returning words (parts-of-speech) 

955 defined on the page one at a time. (Individual word senses for the 

956 same part-of-speech are typically encoded in the same entry.)""" 

957 # imported here to avoid circular import 

958 from .pronunciation import parse_pronunciation 

959 

960 assert isinstance(wxr, WiktextractContext) 

961 assert isinstance(langnode, WikiNode) 

962 assert isinstance(language, str) 

963 assert isinstance(lang_code, str) 

964 # print("parse_language", language) 

965 

966 is_reconstruction = False 

967 word: str = wxr.wtp.title # type: ignore[assignment] 

968 unsupported_prefix = "Unsupported titles/" 

969 if word.startswith(unsupported_prefix): 

970 w = word[len(unsupported_prefix) :] 

971 if w in unsupported_title_map: 971 ↛ 974line 971 didn't jump to line 974 because the condition on line 971 was always true

972 word = unsupported_title_map[w] 

973 else: 

974 wxr.wtp.error( 

975 "Unimplemented unsupported title: {}".format(word), 

976 sortid="page/870", 

977 ) 

978 word = w 

979 elif word.startswith("Reconstruction:"): 

980 word = word[word.find("/") + 1 :] 

981 is_reconstruction = True 

982 elif word.startswith("a/languages"): 982 ↛ 984line 982 didn't jump to line 984 because the condition on line 982 was never true

983 # ATM there's only one "mammoth page" in English wiktionary, 'a' 

984 word = "a" 

985 

986 base_data: WordData = { 

987 "word": word, 

988 "lang": language, 

989 "lang_code": lang_code, 

990 } 

991 if is_reconstruction: 

992 data_append(base_data, "tags", "reconstruction") 

993 sense_data: SenseData = {} 

994 pos_data: WordData = {} # For a current part-of-speech 

995 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

996 etym_data: WordData = {} # For one etymology 

997 sense_datas: list[SenseData] = [] 

998 sense_ordinal = 0 # The recursive sense parsing messes up the ordering 

999 # Never reset, do not use as data 

1000 level_four_datas: list[WordData] = [] 

1001 etym_datas: list[WordData] = [] 

1002 page_datas: list[WordData] = [] 

1003 have_etym = False 

1004 inside_level_four = False # This is for checking if the etymology section 

1005 # or article has a Pronunciation section, for Chinese mostly; because 

1006 # Chinese articles can have three level three sections (two etymology 

1007 # sections and pronunciation sections) one after another, we need a kludge 

1008 # to better keep track of whether we're in a normal "etym" or inside a 

1009 # "level four" (which is what we've turned the level three Pron sections 

1010 # into in the fix_subtitle_hierarchy(); all other sections are demoted by 

1011 # a step. 

1012 stack: list[str] = [] # names of items on the "stack" 

1013 

1014 def merge_base(data: WordData, base: WordData) -> None: 

1015 for k, v in base.items(): 

1016 # Copy the value to ensure that we don't share lists or 

1017 # dicts between structures (even nested ones). 

1018 v = copy.deepcopy(v) 

1019 if k not in data: 

1020 # The list was copied above, so this will not create shared ref 

1021 data[k] = v # type: ignore[literal-required] 

1022 continue 

1023 if data[k] == v: # type: ignore[literal-required] 

1024 continue 

1025 if ( 1025 ↛ 1033line 1025 didn't jump to line 1033 because the condition on line 1025 was always true

1026 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

1027 or isinstance( 

1028 v, 

1029 (list, tuple), # Should this be "and"? 

1030 ) 

1031 ): 

1032 data[k] = list(data[k]) + list(v) # type: ignore 

1033 elif data[k] != v: # type: ignore[literal-required] 

1034 wxr.wtp.warning( 

1035 "conflicting values for {} in merge_base: " 

1036 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1037 sortid="page/904", 

1038 ) 

1039 

1040 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1041 """Remove unnecessary keys from dict values 

1042 in a list comprehension...""" 

1043 if key in pron: 

1044 pron.pop(key) # type: ignore 

1045 return pron 

1046 

1047 def sound_matches_pos(sound: SoundData, pos: str) -> bool: 

1048 if "pos" not in sound: 

1049 return True 

1050 sound_pos = sound["pos"] # type: ignore[typeddict-item] 

1051 return pos in sound_pos 

1052 

1053 def strip_sound_pos(sound: SoundData) -> SoundData: 

1054 complementary_pop(sound, "pos") 

1055 return sound 

1056 

1057 # If the result has sounds, eliminate sounds that have a prefix that 

1058 # does not match "word" or one of "forms" 

1059 if "sounds" in data and "word" in data: 

1060 accepted = [data["word"]] 

1061 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1062 data["sounds"] = list( 

1063 s 

1064 for s in data["sounds"] 

1065 if "form" not in s or s["form"] in accepted 

1066 ) 

1067 # If the result has sounds, eliminate sounds that have a pos that 

1068 # does not match "pos" 

1069 if "sounds" in data and "pos" in data: 

1070 data["sounds"] = list( 

1071 strip_sound_pos(s) 

1072 for s in data["sounds"] 

1073 # "pos" is not a field of SoundData, correctly, so we're 

1074 # removing it here. It's a kludge on a kludge on a kludge. 

1075 if sound_matches_pos(s, data["pos"]) 

1076 ) 

1077 elif "sounds" in data: 1077 ↛ 1078line 1077 didn't jump to line 1078 because the condition on line 1077 was never true

1078 data["sounds"] = [strip_sound_pos(s) for s in data["sounds"]] 

1079 

1080 def push_sense(sorting_ordinal: int | None = None) -> bool: 

1081 """Starts collecting data for a new word sense. This returns True 

1082 if a sense was added.""" 

1083 nonlocal sense_data 

1084 if sorting_ordinal is None: 

1085 sorting_ordinal = sense_ordinal 

1086 tags = sense_data.get("tags", ()) 

1087 if ( 

1088 not sense_data.get("glosses") 

1089 and "translation-hub" not in tags 

1090 and "no-gloss" not in tags 

1091 ): 

1092 return False 

1093 

1094 if ( 1094 ↛ 1104line 1094 didn't jump to line 1104 because the condition on line 1094 was never true

1095 ( 

1096 "participle" in sense_data.get("tags", ()) 

1097 or "infinitive" in sense_data.get("tags", ()) 

1098 ) 

1099 and "alt_of" not in sense_data 

1100 and "form_of" not in sense_data 

1101 and "etymology_text" in etym_data 

1102 and etym_data["etymology_text"] != "" 

1103 ): 

1104 etym = etym_data["etymology_text"] 

1105 etym = etym.split(". ")[0] 

1106 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1107 if ret is not None: 

1108 tags, lst = ret 

1109 assert isinstance(lst, (list, tuple)) 

1110 if "form-of" in tags: 

1111 data_extend(sense_data, "form_of", lst) 

1112 data_extend(sense_data, "tags", tags) 

1113 elif "alt-of" in tags: 

1114 data_extend(sense_data, "alt_of", lst) 

1115 data_extend(sense_data, "tags", tags) 

1116 

1117 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1117 ↛ 1120line 1117 didn't jump to line 1120 because the condition on line 1117 was never true

1118 "tags", () 

1119 ): 

1120 data_append(sense_data, "tags", "no-gloss") 

1121 

1122 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore 

1123 sense_datas.append(sense_data) 

1124 sense_data = {} 

1125 return True 

1126 

1127 def push_pos(sorting_ordinal: int | None = None) -> None: 

1128 """Starts collecting data for a new part-of-speech.""" 

1129 nonlocal pos_data 

1130 nonlocal sense_datas 

1131 push_sense(sorting_ordinal) 

1132 if wxr.wtp.subsection: 

1133 data: WordData = {"senses": sense_datas} 

1134 merge_base(data, pos_data) 

1135 level_four_datas.append(data) 

1136 pos_data = {} 

1137 sense_datas = [] 

1138 wxr.wtp.start_subsection(None) 

1139 

1140 def push_level_four_section(clear_sound_data: bool) -> None: 

1141 """Starts collecting data for a new level four sections, which 

1142 is usually virtual and empty, unless the article has Chinese 

1143 'Pronunciation' sections that are etymology-section-like but 

1144 under etymology, and at the same level in the source. We modify 

1145 the source to demote Pronunciation sections like that to level 

1146 4, and other sections one step lower.""" 

1147 nonlocal level_four_data 

1148 nonlocal level_four_datas 

1149 nonlocal etym_datas 

1150 push_pos() 

1151 # print(f"======\n{etym_data=}") 

1152 # print(f"======\n{etym_datas=}") 

1153 # print(f"======\n{level_four_data=}") 

1154 # print(f"======\n{level_four_datas=}") 

1155 for data in level_four_datas: 

1156 merge_base(data, level_four_data) 

1157 etym_datas.append(data) 

1158 for data in etym_datas: 

1159 merge_base(data, etym_data) 

1160 page_datas.append(data) 

1161 if clear_sound_data: 

1162 level_four_data = {} 

1163 level_four_datas = [] 

1164 etym_datas = [] 

1165 

1166 def push_etym() -> None: 

1167 """Starts collecting data for a new etymology.""" 

1168 nonlocal etym_data 

1169 nonlocal etym_datas 

1170 nonlocal have_etym 

1171 nonlocal inside_level_four 

1172 have_etym = True 

1173 push_level_four_section(False) 

1174 inside_level_four = False 

1175 # etymology section could under pronunciation section 

1176 etym_data = ( 

1177 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {} 

1178 ) 

1179 

def select_data() -> WordData:
    """Return the dict that newly extracted data should be stored in.

    Checked in order: `pos_data` while a subsection (part-of-speech) is
    active, `level_four_data` while inside a level-four section,
    `base_data` when the top of `stack` is the language itself, and
    `etym_data` otherwise.
    """
    if wxr.wtp.subsection is not None:
        return pos_data
    if inside_level_four:
        return level_four_data
    return base_data if stack[-1] == language else etym_data

1192 

def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parse the subsection for a part-of-speech under a language on a
    page.

    The children of `posnode` are split into "heads" (`pre`: text, links
    and templates preceding a sense list) and the sense lists that follow
    each head (`lists`, parallel to `pre`).  Each head is handed to
    process_gloss_header() and each list item to parse_sense_node().  If
    there is no list at all, process_gloss_without_list() is used.  When
    nothing produced a sense, a dummy "no-gloss" sense carrying the head
    tags and topics is pushed.  Finally `sense_datas` is sorted by the
    temporary sorting ordinal, which is then removed.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and (
                (
                    isinstance(x, TemplateNode)
                    and x.template_name in FLOATING_TABLE_TEMPLATES
                )
                or (
                    x.kind == NodeKind.LINK
                    # Need to check for stringiness because some links are
                    # broken; for example, if a template is missing an
                    # argument, a link might look like `[[{{{1}}}...]]`
                    and len(x.largs) > 0
                    and len(x.largs[0]) > 0
                    and isinstance(x.largs[0][0], str)
                    and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                )
            )
        ),
    )
    # Parse the extracted floating tables/files as if they were an
    # "Inflection" section of their own.
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    for node in poschildren:
        if isinstance(node, str):
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                if p.startswith("\n\n") and pre:
                    # Blank line: the first paragraph is over.
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            if len(node.largs[0]) >= 1 and isinstance(
                node.largs[0][0], str
            ):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].append(node)
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # skip these templates; panel_templates is already used
            # to skip certain templates else, but it also applies to
            # head parsing quite well.
            if is_panel_template(wxr, node.template_name):
                continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not. Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists wiktextract issue #314
    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    # Index (into the cleaned lists) of the latest head that was kept
    # without any sense list; a following head-less list is attached to it.
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1 and pairless_pre_index is not None:
            cleaned_lists[pairless_pre_index] = ls
            pairless_pre_index = None
            continue
        if pre1 and not ls:
            # Record the index only for a head that is really appended
            # below.  Recording it before the whitespace-only skip above
            # could leave it pointing at a missing slot (IndexError) or at
            # a different head (overwriting that head's lists).
            pairless_pre_index = len(cleaned_pre)
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []
    header_topics: list[str] = []
    previous_head_had_list = False

    if not any(g for g in lists):
        process_gloss_without_list(
            poschildren, pos, pos_data, header_tags, header_topics
        )
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            # At least one list is non-empty in this branch, so "no lists
            # from here on" can only be true for i > 0; the first head is
            # therefore always processed.
            if all(not sl for sl in lists[i:]):
                # NOTE(review): `node` here is whatever the loops above
                # last bound it to, not necessarily part of this head;
                # it is only used for the debug message.
                if isinstance(node, str):
                    wxr.wtp.debug(
                        "later head without list of senses,"
                        "string: '{}[...]', {}/{}".format(
                            node[:20], word, language
                        ),
                        sortid="page/1708/20221215",
                    )
                elif isinstance(node, WikiNode):
                    # `elif`, so that a string node does not additionally
                    # trigger the generic message below.
                    wxr.wtp.debug(
                        "later head without list of senses,"
                        "template node "
                        "{}, {}/{}".format(
                            node.sarg if node.sarg else node.largs,
                            word,
                            language,
                        ),
                        sortid="page/1713/20221215",
                    )
                else:
                    wxr.wtp.debug(
                        "later head without list of senses, "
                        "{}/{}".format(word, language),
                        sortid="page/1719/20221215",
                    )
                break
            head_group = i + 1 if there_are_many_heads else None

            if previous_head_had_list:
                # We use a boolean flag here because we want to be able
                # let the header_tags data pass through after the loop
                # is over without accidentally emptying it, if there are
                # no pos_datas and we need a dummy data.
                header_tags.clear()
                header_topics.clear()

            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags, header_topics
            )
            for ln in ls:
                # Parse each list associated with this head.
                for node in ln.children:
                    # Parse nodes in l.children recursively.
                    # The recursion function uses push_sense() to
                    # add stuff into sense_datas, and returns True or
                    # False if something is added, which bubbles upward.
                    # If the bubble is "True", then higher levels of
                    # the recursion will not push_sense(), because
                    # the data is already pushed into a sub-gloss
                    # downstream, unless the higher level has examples
                    # that need to be put somewhere.
                    common_data: SenseData = {
                        "tags": list(header_tags),
                        "topics": list(header_topics),
                    }
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]

            previous_head_had_list = len(ls) > 0

    # If there are no senses extracted, add a dummy sense. We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(sense_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_extend(sense_data, "topics", header_topics)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()

    sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))  # type: ignore

    # The ordinal was only needed for sorting; drop it from the output.
    for sd in sense_datas:
        if "__temp_sense_sorting_ordinal" in sd:
            del sd["__temp_sense_sorting_ordinal"]  # type: ignore

1534 

1535 term_label_templates: list[TemplateData] = [] 

1536 normal_label_templates: list[TemplateData] = [] 

1537 

def head_post_template_fn(
    name: str, ht: TemplateArgs, expansion: str
) -> Optional[str]:
    """Post-expansion hook for templates in the head section of a word.

    The head section is the text after the part-of-speech subtitle and
    before the word sense list.  Typically it generates the bold line for
    the word, but it may also contain other useful information that often
    ends up in side boxes; some of that is captured here.

    Returns "" to drop the template's expansion, a replacement string to
    substitute for it, or None to leave the expansion unchanged.
    """
    if is_panel_template(wxr, name):
        # Completely ignore these templates (not even recorded in
        # head_templates)
        return ""

    if name == "head":
        # XXX are these also captured in forms? Should this special case
        # be removed?
        head_kind = ht.get(2, "")
        if head_kind == "pinyin":
            data_append(pos_data, "tags", "Pinyin")
        elif head_kind == "romanization":
            data_append(pos_data, "tags", "romanization")

    is_head_template = (
        HEAD_TAG_RE.search(name) is not None
        or name in PROBLEMATIC_TEMPLATES_CLUMP
    )
    if is_head_template:
        head_entry: TemplateData = {
            "name": name,
            "args": clean_template_args(wxr, ht),
            "expansion": clean_node(wxr, None, expansion),
        }
        target_key = (
            "etymology_templates"
            if name in ETYMOLOGY_TEMPLATES_IN_HEADS
            else "head_templates"
        )
        data_append(pos_data, target_key, head_entry)
        if name in WORD_LEVEL_HEAD_TEMPLATES:
            term_label_templates.append(head_entry)
            # Squash these, their tags are applied to the whole word,
            # and some cause problems like "term-label"
            return ""

    # The following are both captured in head_templates and parsed
    # separately

    if name in wikipedia_templates:
        # Note: various places expect to have content from wikipedia
        # templates, so cannot convert this to empty
        parse_wikipedia_template(wxr, pos_data, ht)
        return None

    # Templates whose expansion is simply dropped from the head text.
    # XXX several of these might be worth extracting; cardinalbox can
    # also occur in top-level under language.
    if name in (
        "number box",
        "enum",
        "cardinalbox",
        "Han simplified forms",
        "picdic",
        "picdicimg",
        "picdiclabel",
        "defdate",
    ):
        return ""

    if name in ("lb", "lbl", "label"):
        label_args = clean_template_args(wxr, ht)
        label_text = clean_node(wxr, None, expansion).strip("()")
        normal_label_templates.append(
            {
                "name": name,
                "args": label_args,
                "expansion": label_text,
            }
        )
        # The parens around __LABEL... below are meaningful: label
        # templates generate text with parens, so the placeholder looks
        # like a normal label and is handled as parenthetical text; the
        # label contents are only looked up when that text is handled.
        label_index = len(normal_label_templates) - 1
        return f"(__LABEL_TEMPLATE_{label_index}__)"

    return None

1635 

def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Process one word head (the nodes preceding a sense list).

    Info templates are split off into pos_data["info_templates"], ruby
    (for "ja") and vi-readings (for "vi") are extracted, and the remaining
    nodes are cleaned into text and passed to parse_word_head().  Tags
    that end up in pos_data["tags"], plus tags/topics decoded from the
    collected term-label templates, are appended to `header_tags` and
    `header_topics`, which are modified in place.  Returns early, leaving
    those lists untouched, if the cleaned head text is empty.
    """
    ruby = []

    # process template parse nodes here
    new_nodes = []
    info_template_data = []
    for node in header_nodes:
        info_data, info_out = parse_info_template_node(wxr, node, "head")
        if info_data or info_out:
            if info_data:
                info_template_data.append(info_data)
            if info_out:  # including just the original node
                new_nodes.append(info_out)
        else:
            new_nodes.append(node)
    header_nodes = new_nodes

    if info_template_data:
        if "info_templates" not in pos_data:
            pos_data["info_templates"] = info_template_data
        else:
            pos_data["info_templates"].extend(info_template_data)

    if lang_code == "ja":
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            exp.children,
            lambda x: (
                isinstance(x, WikiNode)
                and x.kind == NodeKind.HTML
                and x.sarg == "ruby"
            ),
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # we know the lambda above in recursively_extract
                    # returns only WikiNodes in rub
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)
    elif lang_code == "vi":
        # Handle vi-readings templates that have a weird structures for
        # Chu Nom vietnamese characters heads
        # https://en.wiktionary.org/wiki/Template:vi-readings
        new_header_nodes = []
        related_readings: list[LinkageData] = []
        # True once any vi-readings template supplied a reading
        # parameter; decides whether the templates are dropped from the
        # head below.
        saw_reading_arg = False
        for node in header_nodes:
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "vi-readings"
            ):
                for parameter, tag in (
                    ("hanviet", "han-viet-reading"),
                    ("nom", "nom-reading"),
                    # we ignore the fanqie parameter "phienthiet"
                ):
                    arg = node.template_parameters.get(parameter)
                    if arg is None:
                        continue
                    saw_reading_arg = True
                    text = clean_node(wxr, None, arg)
                    for w in text.split(","):
                        # ignore - separated references
                        w = w.partition("-")[0].strip()
                        if not w:
                            # Empty parameter, trailing comma or a bare
                            # reference: no reading to record.
                            continue
                        related_readings.append(
                            LinkageData(word=w, tags=[tag])
                        )
                continue

            # Skip the vi-reading template for the rest of the head parsing
            new_header_nodes.append(node)
        if related_readings:
            data_extend(pos_data, "related", related_readings)
        if saw_reading_arg:
            header_nodes = new_header_nodes

    header_text = clean_node(
        wxr,
        pos_data,
        header_nodes,
        post_template_fn=head_post_template_fn,
        collect_links=True,
        remove_anchors_from_links=True,
    )
    # WordData doesn't use `links`, so we can use `collect_links=True`
    # above without special handling and smuggle link data.
    extracted_links = pos_data.pop("links", None)  # type: ignore

    header_text = re.sub(r"\s+", " ", header_text).strip()

    if not header_text:
        return

    term_label_tags: list[str] = []
    term_label_topics: list[str] = []
    # parse term label templates; if there are other similar kinds
    # of templates in headers that you want to squash and apply as
    # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
    for templ_data in term_label_templates:
        expan = templ_data.get("expansion", "").strip("().,; ")
        if not expan:
            continue
        tlb_tagsets, tlb_topics = decode_tags(expan)
        for tlb_tags in tlb_tagsets:
            # Tagsets containing an "error-" tag are ignored.
            if len(tlb_tags) > 0 and not any(
                t.startswith("error-") for t in tlb_tags
            ):
                term_label_tags.extend(tlb_tags)
        term_label_topics.extend(tlb_topics)

    parse_word_head(
        wxr,
        word,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        header_nodes,
        ruby=ruby,
        links=extracted_links,
        label_templates=normal_label_templates,
    )
    if "tags" in pos_data:
        # pos_data can get "tags" data from some source; type-checkers
        # doesn't like it, so let's ignore it.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    if len(term_label_tags) > 0:
        header_tags.extend(term_label_tags)
    if len(term_label_topics) > 0:
        header_topics.extend(term_label_topics)

1788 

def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Handle a part-of-speech section whose gloss is not inside a list.

    `nodes` are split into head nodes ("head" templates and templates
    whose name starts with "<lang_code>-") and gloss nodes (everything
    else up to the first level heading); soft-redirect templates are
    dropped.  Head nodes go to process_gloss_header(), gloss nodes to
    process_gloss_contents() together with copies of the header tags and
    topics.
    """
    soft_redirects = ("zh-see", "ja-see", "ja-see-kango")
    header_nodes: list[Union[str, WikiNode]] = []
    gloss_nodes: list[Union[str, WikiNode]] = []

    for item in strip_nodes(nodes):
        if isinstance(item, WikiNode):
            if isinstance(item, TemplateNode):
                tname = item.template_name
                if tname in soft_redirects:
                    continue  # soft redirect
                if tname == "head" or tname.startswith(f"{lang_code}-"):
                    header_nodes.append(item)
                    continue
            elif item.kind in LEVEL_KINDS:
                # following nodes are not gloss
                break
        gloss_nodes.append(item)

    if len(header_nodes) > 0:
        process_gloss_header(
            header_nodes,
            pos_type,
            None,
            pos_data,
            header_tags,
            header_topics,
        )
    if len(gloss_nodes) > 0:
        sense_base = {
            "tags": list(header_tags),
            "topics": list(header_topics),
        }
        process_gloss_contents(gloss_nodes, pos_type, sense_base)

1833 

1834 def parse_sense_node( 

1835 node: Union[str, WikiNode], # never receives str 

1836 sense_base: SenseData, 

1837 pos: str, 

1838 ) -> bool: 

1839 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1840 Uses push_sense() to attempt adding data to pos_data in the scope 

1841 of parse_language() when it reaches deep in the recursion. push_sense() 

1842 returns True if it succeeds, and that is bubbled up the stack; if 

1843 a sense was added downstream, the higher levels (whose shared data 

1844 was already added by a subsense) do not push_sense(), unless it 

1845 has examples that need to be put somewhere. 

1846 """ 

1847 assert isinstance(sense_base, dict) # Added to every sense deeper in 

1848 

1849 nonlocal sense_ordinal 

1850 my_ordinal = sense_ordinal # copies, not a reference 

1851 sense_ordinal += 1 # only use for sorting 

1852 

1853 if not isinstance(node, WikiNode): 1853 ↛ 1855line 1853 didn't jump to line 1855 because the condition on line 1853 was never true

1854 # This doesn't seem to ever happen in practice. 

1855 wxr.wtp.debug( 

1856 "{}: parse_sense_node called with" 

1857 "something that isn't a WikiNode".format(pos), 

1858 sortid="page/1287/20230119", 

1859 ) 

1860 return False 

1861 

1862 if node.kind != NodeKind.LIST_ITEM: 1862 ↛ 1863line 1862 didn't jump to line 1863 because the condition on line 1862 was never true

1863 wxr.wtp.debug( 

1864 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1865 ) 

1866 return False 

1867 

1868 if node.sarg == ":": 

1869 # Skip example entries at the highest level, ones without 

1870 # a sense ("...#") above them. 

1871 # If node.sarg is exactly and only ":", then it's at 

1872 # the highest level; lower levels would have more 

1873 # "indentation", like "#:" or "##:" 

1874 return False 

1875 

1876 # If a recursion call succeeds in push_sense(), bubble it up with 

1877 # `added`. 

1878 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1879 added = False 

1880 

1881 gloss_template_args: set[str] = set() 

1882 

1883 # For LISTs and LIST_ITEMS, their argument is something like 

1884 # "##" or "##:", and using that we can rudimentally determine 

1885 # list 'depth' if need be, and also what kind of list or 

1886 # entry it is; # is for normal glosses, : for examples (indent) 

1887 # and * is used for quotations on wiktionary. 

1888 current_depth = node.sarg 

1889 

1890 children = node.children 

1891 

1892 # subentries, (presumably) a list 

1893 # of subglosses below this. The list's 

1894 # argument ends with #, and its depth should 

1895 # be bigger than parent node. 

1896 subentries = [ 

1897 x 

1898 for x in children 

1899 if isinstance(x, WikiNode) 

1900 and x.kind == NodeKind.LIST 

1901 and x.sarg == current_depth + "#" 

1902 ] 

1903 

1904 # sublists of examples and quotations. .sarg 

1905 # does not end with "#". 

1906 others = [ 

1907 x 

1908 for x in children 

1909 if isinstance(x, WikiNode) 

1910 and x.kind == NodeKind.LIST 

1911 and x.sarg != current_depth + "#" 

1912 ] 

1913 

1914 # the actual contents of this particular node. 

1915 # can be a gloss (or a template that expands into 

1916 # many glosses which we can't easily pre-expand) 

1917 # or could be an "outer gloss" with more specific 

1918 # subglosses, or could be a qualifier for the subglosses. 

1919 contents = [ 

1920 x 

1921 for x in children 

1922 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1923 ] 

1924 # If this entry has sublists of entries, we should combine 

1925 # gloss information from both the "outer" and sublist content. 

1926 # Sometimes the outer gloss 

1927 # is more non-gloss or tags, sometimes it is a coarse sense 

1928 # and the inner glosses are more specific. The outer one 

1929 # does not seem to have qualifiers. 

1930 

1931 # If we have one sublist with one element, treat it 

1932 # specially as it may be a Wiktionary error; raise 

1933 # that nested element to the same level. 

1934 # XXX If need be, this block can be easily removed in 

1935 # the current recursive logic, and the result is one sense entry 

1936 # with both glosses in the glosses list, as you would 

1937 # expect. If the higher entry has examples, there will 

1938 # be a higher entry with some duplicated data. 

1939 if len(subentries) == 1: 

1940 slc = subentries[0].children 

1941 if len(slc) == 1: 

1942 # copy current node and modify it so it doesn't 

1943 # loop infinitely. 

1944 cropped_node = copy.copy(node) 

1945 cropped_node.children = [ 

1946 x 

1947 for x in children 

1948 if not ( 

1949 isinstance(x, WikiNode) 

1950 and x.kind == NodeKind.LIST 

1951 and x.sarg == current_depth + "#" 

1952 ) 

1953 ] 

1954 added |= parse_sense_node(cropped_node, sense_base, pos) 

1955 nonlocal sense_data # this kludge causes duplicated raw_ 

1956 # glosses data if this is not done; 

1957 # if the top-level (cropped_node) 

1958 # does not push_sense() properly or 

1959 # parse_sense_node() returns early, 

1960 # sense_data is not reset. This happens 

1961 # for example when you have a no-gloss 

1962 # string like "(intransitive)": 

1963 # no gloss, push_sense() returns early 

1964 # and sense_data has duplicate data with 

1965 # sense_base 

1966 sense_data = {} 

1967 added |= parse_sense_node(slc[0], sense_base, pos) 

1968 return added 

1969 

1970 return process_gloss_contents( 

1971 contents, 

1972 pos, 

1973 sense_base, 

1974 subentries, 

1975 others, 

1976 gloss_template_args, 

1977 added, 

1978 my_ordinal, 

1979 ) 

1980 

def process_gloss_contents(
    contents: list[Union[str, WikiNode]],
    pos: str,
    sense_base: SenseData,
    subentries: Optional[list[WikiNode]] = None,
    others: Optional[list[WikiNode]] = None,
    gloss_template_args: Optional[Set[str]] = None,
    added: bool = False,
    sorting_ordinal: int | None = None,
) -> bool:
    """Turn the non-list contents of one gloss item into sense data.

    The contents are cleaned into gloss text; qualifiers, tags and
    template-derived data are stored in ``sense_base``; every list item
    in ``subentries`` is handed to ``parse_sense_node`` with its own deep
    copy of ``sense_base``; examples are extracted from ``others``; and
    finally one sense per remaining gloss line is pushed with
    ``push_sense``.

    ``contents`` are the children of the gloss item that are not lists.
    ``subentries`` are the sub-lists of more specific glosses and
    ``others`` the remaining sub-lists (examples, quotations).
    ``gloss_template_args`` collects template argument values and link
    texts seen in the gloss; it is passed on to
    ``parse_alt_or_inflection_of``.  ``added`` is the running "a sense
    was already pushed" flag from the caller and ``sorting_ordinal`` is
    forwarded unchanged to ``push_sense``.

    Returns False when the contents clean to nothing, otherwise a truthy
    value if a sense was pushed by this call or by one of the recursive
    calls.
    """
    # Fresh containers per call: the collected template arguments are
    # mutated below, so a shared default would leak data between calls.
    if subentries is None:
        subentries = []
    if others is None:
        others = []
    if gloss_template_args is None:
        gloss_template_args = set()

    def sense_template_fn(
        name: str, ht: TemplateArgs, is_gloss: bool = False
    ) -> Optional[str]:
        """Template hook used while cleaning gloss and sub-list nodes.

        Collects sense metadata from selected templates into
        ``sense_base``.  A returned string is used in place of the
        template; None is returned for templates that are not replaced
        here (presumably they are then expanded normally -- confirm
        against the ``template_fn`` contract of ``clean_node``).
        """
        # print(f"sense_template_fn: {name}, {ht}")
        if name in wikipedia_templates:
            # parse_wikipedia_template(wxr, pos_data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        if name in INFO_TEMPLATE_FUNCS:
            info_data, info_exp = parse_info_template_arguments(
                wxr, name, ht, "sense"
            )
            if info_data or info_exp:
                if info_data:
                    data_append(sense_base, "info_templates", info_data)
                if info_exp and isinstance(info_exp, str):
                    return info_exp
                return ""
        if name in ("defdate",):
            date = clean_node(wxr, None, ht.get(1, ()))
            if part_two := ht.get(2):
                # Unicode en dash (U+2013), not '-'
                date += "–" + clean_node(wxr, None, part_two)
            refs: dict[str, ReferenceData] = {}
            # ref, refn, ref2, ref2n, ref3, ref3n
            # ref1 not valid
            for k, v in sorted(
                (k, v) for k, v in ht.items() if isinstance(k, str)
            ):
                if m := re.match(r"ref(\d?)(n?)", k):
                    ref_v = clean_node(wxr, None, v)
                    if m.group(1) not in refs:  # empty string or digit
                        refs[m.group(1)] = ReferenceData()
                    if m.group(2):
                        refs[m.group(1)]["refn"] = ref_v
                    else:
                        refs[m.group(1)]["text"] = ref_v
            data_append(
                sense_base,
                "attestations",
                AttestationData(date=date, references=list(refs.values())),
            )
            return ""
        if name == "senseid":
            langid = clean_node(wxr, None, ht.get(1, ()))
            arg = clean_node(wxr, sense_base, ht.get(2, ()))
            if re.match(r"Q\d+$", arg):
                data_append(sense_base, "wikidata", arg)
            data_append(sense_base, "senseid", langid + ":" + arg)
            # No return: the arguments are still collected below.
        if name in sense_linkage_templates:
            # print(f"SENSE_TEMPLATE_FN: {name}")
            parse_sense_linkage(wxr, sense_base, name, ht, pos)
            return ""
        if name == "†" or name == "zh-obsolete":
            data_append(sense_base, "tags", "obsolete")
            return ""
        if name in {
            "ux",
            "uxi",
            "usex",
            "afex",
            "prefixusex",
            "ko-usex",
            "ko-x",
            "hi-x",
            "ja-usex-inline",
            "ja-x",
            "quotei",
            "he-x",
            "km-x",
            "ne-x",
            "shn-x",
            "th-x",
            "ur-x",
        }:
            # Usage examples are captured separately below.  We don't
            # want to expand them into glosses even when unusual coding
            # is used in the entry.
            # These templates may slip through inside another item, but
            # currently we're separating out example entries (..#:)
            # well enough that there seems to very little contamination.
            if is_gloss:
                wxr.wtp.wiki_notice(
                    "Example template is used for gloss text",
                    sortid="extractor.en.page.sense_template_fn/1415",
                )
            else:
                return ""
        if name == "w":
            if ht.get(2) == "Wp":
                return ""
        # Remember plain argument values (no markup) of everything that
        # was not handled above.
        for v in ht.values():
            v = v.strip()
            if v and "<" not in v:
                gloss_template_args.add(v)
        return None

    def extract_link_texts(item: GeneralNode) -> None:
        """Recursively extracts link texts from the gloss source.  This
        information is used to select whether to remove final "." from
        form_of/alt_of (e.g., ihm/Hunsrik)."""
        if isinstance(item, (list, tuple)):
            for x in item:
                extract_link_texts(x)
            return
        if isinstance(item, str):
            # There seem to be HTML sections that may further contain
            # unparsed links.
            for m in re.finditer(r"\[\[([^]]*)\]\]", item):
                v = m.group(1).split("|")[-1].strip()
                if v:
                    gloss_template_args.add(v)
            return
        if not isinstance(item, WikiNode):
            return
        if item.kind == NodeKind.LINK:
            v = item.largs[-1]
            if (
                isinstance(v, list)
                and len(v) == 1
                and isinstance(v[0], str)
            ):
                gloss_template_args.add(v[0].strip())
        for x in item.children:
            extract_link_texts(x)

    extract_link_texts(contents)

    # get the raw text of non-list contents of this node, and other stuff
    # like tag and category data added to sense_base
    # cast = no-op type-setter for the type-checker
    partial_template_fn = cast(
        TemplateFnCallable,
        partial(sense_template_fn, is_gloss=True),
    )
    rawgloss = clean_node(
        wxr,
        sense_base,
        contents,
        template_fn=partial_template_fn,
        collect_links=True,
    )

    if not rawgloss:
        return False

    # remove manually typed ordered list text at the start("1. ")
    rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()

    # get stuff like synonyms and categories from "others",
    # maybe examples and quotations
    clean_node(wxr, sense_base, others, template_fn=sense_template_fn)

    # The gloss could contain templates that produce more list items.
    # This happens commonly with, e.g., {{inflection of|...}}.  Split
    # to parts.  However, e.g. Interlingua generates multiple glosses
    # in HTML directly without Wikitext markup, so we must also split
    # by just newlines.
    subglosses = rawgloss.splitlines()

    if not subglosses:
        return False

    if any(s.startswith("#") for s in subglosses):
        # A template generated list markup: re-parse the cleaned text and
        # sort its top-level nodes the same way the caller does.
        subtree = wxr.wtp.parse(rawgloss)
        # from wikitextprocessor.parser import print_tree
        # print("SUBTREE GENERATED BY TEMPLATE:")
        # print_tree(subtree)
        new_subentries = [
            x
            for x in subtree.children
            if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
        ]

        new_others = [
            x
            for x in subtree.children
            if isinstance(x, WikiNode)
            and x.kind == NodeKind.LIST
            and not x.sarg.endswith("#")
        ]

        new_contents = [
            clean_node(wxr, [], x)
            for x in subtree.children
            if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
        ]

        # Lists supplied by the caller win over generated ones.
        subentries = subentries or new_subentries
        others = others or new_others
        subglosses = new_contents
        rawgloss = "".join(subglosses)

    # Generate no gloss for translation hub pages, but add the
    # "translation-hub" tag for them
    if rawgloss == "(This entry is a translation hub.)":
        data_append(sense_data, "tags", "translation-hub")
        return push_sense(sorting_ordinal)

    # Remove certain substrings specific to outer glosses
    strip_ends = [", particularly:"]
    for x in strip_ends:
        if rawgloss.endswith(x):
            rawgloss = rawgloss[: -len(x)].strip()
            break

    # A single gloss, or possibly an outer gloss.
    # Check if the possible outer gloss starts with
    # parenthesized tags/topics

    if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
        data_append(sense_base, "raw_glosses", subglosses[0].strip())
    m = QUALIFIERS_RE.match(rawgloss)
    # (...): ... or (...(...)...): ...
    if m:
        q = m.group(1)
        rawgloss = rawgloss[m.end() :].strip()
        parse_sense_qualifier(wxr, q, sense_base)
    if rawgloss == "A pejorative:":
        data_append(sense_base, "tags", "pejorative")
        rawgloss = ""
    elif rawgloss == "Short forms.":
        data_append(sense_base, "tags", "abbreviation")
        rawgloss = ""
    elif rawgloss == "Technical or specialized senses.":
        rawgloss = ""
    elif rawgloss.startswith("inflection of "):
        parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
        if parsed is not None:
            tags, origins = parsed
            if origins is not None:
                data_extend(sense_base, "form_of", origins)
            if tags is not None:
                data_extend(sense_base, "tags", tags)
            else:
                data_append(sense_base, "tags", "form-of")
        else:
            data_append(sense_base, "tags", "form-of")
    if rawgloss:
        # Code duplicating a lot of clean-up operations from later in
        # this block.  We want to clean up the "supergloss" as much as
        # possible, in almost the same way as a normal gloss.
        supergloss = rawgloss

        if supergloss.startswith("; "):
            supergloss = supergloss[1:].strip()

        if supergloss.startswith(("^†", "†")):
            data_append(sense_base, "tags", "obsolete")
            # Drop exactly the marker that matched: "^†" is two
            # characters long, a bare "†" only one.
            marker_len = 2 if supergloss.startswith("^") else 1
            supergloss = supergloss[marker_len:].strip()
        elif supergloss.startswith("^‡"):
            data_extend(sense_base, "tags", ["obsolete", "historical"])
            supergloss = supergloss[2:].strip()

        # remove [14th century...] style brackets at the end
        supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)

        if supergloss.startswith((",", ":")):
            supergloss = supergloss[1:]
        supergloss = supergloss.strip()
        if supergloss.startswith("N. of "):
            # Same expansion as for individual glosses further down.
            supergloss = "Name of " + supergloss[6:]
        data_append(sense_base, "glosses", supergloss)
        if supergloss in ("A person:",):
            data_append(sense_base, "tags", "g-person")

    # The main recursive call (except for the exceptions at the
    # start of this function).
    for sublist in subentries:
        if not (
            isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
        ):
            wxr.wtp.debug(
                f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
                f"with items that are not LISTs",
                sortid="page/1511/20230119",
            )
            continue
        for item in sublist.children:
            if not (
                isinstance(item, WikiNode)
                and item.kind == NodeKind.LIST_ITEM
            ):
                continue
            # copy sense_base to prevent cross-contamination between
            # subglosses and other subglosses and superglosses
            sense_base2 = copy.deepcopy(sense_base)
            if parse_sense_node(item, sense_base2, pos):
                added = True

    # Capture examples.
    # This is called after the recursive calls above so that
    # sense_base is not contaminated with meta-data from
    # example entries for *this* gloss.
    examples = []
    if wxr.config.capture_examples:
        examples = extract_examples(others, sense_base)

    # push_sense() succeeded somewhere down-river, so skip this level
    if added:
        if examples:
            # this higher-up gloss has examples that we do not want to skip
            wxr.wtp.debug(
                "'{}[...]' gloss has examples we want to keep, "
                "but there are subglosses.".format(repr(rawgloss[:30])),
                sortid="page/1498/20230118",
            )
        else:
            return True

    # Some entries, e.g., "iacebam", have weird sentences in quotes
    # after the gloss, but these sentences don't seem to be intended
    # as glosses.  Skip them.
    indexed_subglosses = [
        (i, gl)
        for i, gl in enumerate(subglosses)
        if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
    ]

    if len(indexed_subglosses) > 1 and "form_of" not in sense_base:
        gl = indexed_subglosses[0][1].strip()
        if gl.endswith(":"):
            gl = gl[:-1].strip()
        parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args)
        if parsed is not None:
            infl_tags, infl_dts = parsed
            if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
                # Interpret others as a particular form under
                # "inflection of"
                data_extend(sense_base, "tags", infl_tags)
                data_extend(sense_base, "form_of", infl_dts)
                indexed_subglosses = indexed_subglosses[1:]
            elif not infl_dts:
                data_extend(sense_base, "tags", infl_tags)
                indexed_subglosses = indexed_subglosses[1:]

    # Create senses for remaining subglosses.
    # NOTE: sense_data is looked up afresh on every use on purpose; it is
    # rebound elsewhere (see the `nonlocal sense_data` reset in
    # parse_sense_node), so it must not be cached in a local here.
    for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
        gloss = gloss.strip()
        if not gloss and len(indexed_subglosses) > 1:
            continue
        # Push a new sense (if the last one is not empty)
        if push_sense(sorting_ordinal):
            added = True
        # if gloss not in sense_data.get("raw_glosses", ()):
        #     data_append(sense_data, "raw_glosses", gloss)
        if i == 0 and examples:
            # In a multi-line gloss, associate examples
            # with only one of them.
            # XXX or you could use gloss_i == len(indexed_subglosses)
            # to associate examples with the *last* one.
            data_extend(sense_data, "examples", examples)
        if gloss.startswith("; ") and gloss_i > 0:
            gloss = gloss[1:].strip()
        # If the gloss starts with †, mark as obsolete
        if gloss.startswith("^†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[2:].strip()
        elif gloss.startswith("^‡"):
            data_extend(sense_data, "tags", ["obsolete", "historical"])
            gloss = gloss[2:].strip()
        # Copy data for all senses to this sense
        for k, v in sense_base.items():
            if isinstance(v, (list, tuple)):
                if k != "tags":
                    # Tags handled below (countable/uncountable special)
                    data_extend(sense_data, k, v)
            else:
                assert k not in ("tags", "categories", "topics")
                sense_data[k] = v  # type:ignore[literal-required]
        # Parse the gloss for this particular sense
        m = QUALIFIERS_RE.match(gloss)
        # (...): ... or (...(...)...): ...
        if m:
            parse_sense_qualifier(wxr, m.group(1), sense_data)
            gloss = gloss[m.end() :].strip()

        # Remove common suffix "[from 14th c.]" and similar
        gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)

        # Check to make sure we don't have unhandled list items in gloss
        ofs = max(gloss.find("#"), gloss.find("* "))
        if ofs > 10 and "(#)" not in gloss:
            wxr.wtp.debug(
                "gloss may contain unhandled list items: {}".format(gloss),
                sortid="page/1412",
            )
        elif "\n" in gloss:
            wxr.wtp.debug(
                "gloss contains newline: {}".format(gloss),
                sortid="page/1416",
            )

        # Kludge, some glosses have a comma after initial qualifiers in
        # parentheses
        if gloss.startswith((",", ":")):
            gloss = gloss[1:]
        gloss = gloss.strip()
        if gloss.endswith(":"):
            gloss = gloss[:-1].strip()
        if gloss.startswith("N. of "):
            gloss = "Name of " + gloss[6:]
        if gloss.startswith("†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[1:]
        elif gloss.startswith("^†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[2:]

        # Copy tags from sense_base if any.  This will not copy
        # countable/uncountable if either was specified in the sense,
        # as sometimes both are specified in word head but only one
        # in individual senses.
        countability_tags = []
        base_tags = sense_base.get("tags", ())
        sense_tags = sense_data.get("tags", ())
        for tag in base_tags:
            if tag in ("countable", "uncountable"):
                if tag not in countability_tags:
                    countability_tags.append(tag)
                continue
            if tag not in sense_tags:
                data_append(sense_data, "tags", tag)
        if countability_tags:
            if (
                "countable" not in sense_tags
                and "uncountable" not in sense_tags
            ):
                data_extend(sense_data, "tags", countability_tags)

        # If outer gloss specifies a form-of ("inflection of", see
        # aquamarine/German), try to parse the inner glosses as
        # tags for an inflected form.
        if "form-of" in sense_base.get("tags", ()):
            parsed = parse_alt_or_inflection_of(
                wxr, gloss, gloss_template_args
            )
            if parsed is not None:
                infl_tags, infl_dts = parsed
                if not infl_dts and infl_tags:
                    # Interpret as a particular form under "inflection of"
                    data_extend(sense_data, "tags", infl_tags)

        if not gloss:
            data_append(sense_data, "tags", "empty-gloss")
        elif gloss != "-" and gloss not in sense_data.get("glosses", []):
            if (
                gloss_i == 0
                and len(sense_data.get("glosses", tuple())) >= 1
            ):
                # If we added a "high-level gloss" from rawgloss, but this
                # is that same gloss_i, add this instead of the raw_gloss
                # from before if they're different: the rawgloss was not
                # cleaned exactly the same as this later gloss
                sense_data["glosses"][-1] = gloss
            else:
                # Add the gloss for the sense.
                data_append(sense_data, "glosses", gloss)

        # Kludge: there are cases (e.g., etc./Swedish) where there are
        # two abbreviations in the same sense, both generated by the
        # {{abbreviation of|...}} template.  Handle these with some magic.
        position = 0
        split_glosses = []
        for m in re.finditer(r"Abbreviation of ", gloss):
            if m.start() != position:
                split_glosses.append(gloss[position : m.start()])
                position = m.start()
        split_glosses.append(gloss[position:])
        for gloss in split_glosses:
            # Check if this gloss describes an alt-of or inflection-of
            if (
                lang_code != "en"
                and " " not in gloss
                and distw([word], gloss) < 0.3
            ):
                # Don't try to parse gloss if it is one word
                # that is close to the word itself for non-English words
                # (probable translations of a tag/form name)
                continue
            parsed = parse_alt_or_inflection_of(
                wxr, gloss, gloss_template_args
            )
            if parsed is None:
                continue
            tags, dts = parsed
            if not dts and tags:
                data_extend(sense_data, "tags", tags)
                continue
            for dt in dts:  # type:ignore[union-attr]
                ftags = [tag for tag in tags if tag != "form-of"]
                if "alt-of" in tags:
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "alt_of", dt)
                elif "compound-of" in tags:
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "compound_of", dt)
                elif "synonym-of" in tags:
                    data_extend(dt, "tags", ftags)
                    data_append(sense_data, "synonyms", dt)
                elif tags and dt.get("word", "").startswith("of "):
                    dt["word"] = dt["word"][3:]
                    data_append(sense_data, "tags", "form-of")
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "form_of", dt)
                elif "form-of" in tags:
                    data_extend(sense_data, "tags", tags)
                    data_append(sense_data, "form_of", dt)

    if len(sense_data) == 0:
        # Nothing was collected for a sense of its own: fall back to the
        # base data, dropping an empty tag list first.  pop() instead of
        # del so that a missing "tags" key is not an error.
        if not sense_base.get("tags"):
            sense_base.pop("tags", None)
        sense_data.update(sense_base)
    if push_sense(sorting_ordinal):
        # push_sense succeded in adding a sense to pos_data
        added = True
        # print("PARSE_SENSE DONE:", pos_datas[-1])
    return added

2512 

def parse_inflection(
    node: WikiNode, section: str, pos: Optional[str]
) -> None:
    """Parses inflection data (declension, conjugation) from the given
    page.  This retrieves the actual inflection template
    parameters, which are very useful for applications that need
    to learn the inflection classes and generate inflected
    forms.

    The wikitext of ``node``'s children is split into one piece per
    top-level template or table.  Each non-blank piece is expanded and
    parsed; matching template calls are recorded in ``pos_data`` under
    "inflection_templates" and, when inflection capture is enabled, the
    parsed tree is passed to ``parse_inflection_section``.  Nothing is
    done when ``pos`` is None."""
    assert isinstance(node, WikiNode)
    assert isinstance(section, str)
    assert pos is None or isinstance(pos, str)
    # print("parse_inflection:", node)

    if pos is None:
        wxr.wtp.debug(
            "inflection table outside part-of-speech", sortid="page/1812"
        )
        return

    def inflection_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        """Record inflection-style templates while the text is expanded.

        Returns "" for panel templates and None for everything else."""
        # print("decl_conj_template_fn", name, ht)
        if is_panel_template(wxr, name):
            return ""
        if name in ("is-u-mutation",):
            # These are not to be captured as an exception to the
            # generic code below
            return None
        m = re.search(
            r"-(conj|decl|ndecl|adecl|infl|conjugation|"
            r"declension|inflection|mut|mutation)($|-)",
            name,
        )
        if m:
            args_ht = clean_template_args(wxr, ht)
            dt = {"name": name, "args": args_ht}
            data_append(pos_data, "inflection_templates", dt)

        return None

    # Convert the subtree back to Wikitext, then expand all and parse,
    # capturing templates in the process
    text = wxr.wtp.node_to_wikitext(node.children)

    # Split text into separate sections for each top-level template
    brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
    # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
    # The (?:...) creates a non-capturing regex group; if it was capturing,
    # like the group around it, it would create elements in brace_matches,
    # including None if it doesn't match.
    # 20250114: Added {| and |} into the regex because tables were being
    # cut into pieces by this code.  Issue #973, introduction of two-part
    # book-end templates similar to trans-top and tran-bottom.
    template_sections: list[list[str]] = []
    template_nesting = 0  # depth of SINGLE BRACES { { nesting } }
    # Because there is the possibility of triple curly braces
    # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
    # count nesting depth using pairs of two brackets, but
    # instead use singular braces ("{ }").
    # Because template delimiters should be balanced, regardless
    # of whether {{ or {{{ is used, and because we only care
    # about the outer-most delimiters (the highest level template)
    # we can just count the single braces when those single
    # braces are part of a group.
    table_nesting = 0
    # However, if we have a stray table ({| ... |}) that should always
    # be its own section, and should prevent templates from cutting it
    # into sections.
    split_failed = False  # set when the delimiters turn out unbalanced

    # print(f"Parse inflection: {text=}")
    # print(f"Brace matches: {repr('///'.join(brace_matches))}")
    if len(brace_matches) > 1:
        tsection: list[str] = []
        after_templates = False  # kludge to keep any text
        # before first template
        # with the first template;
        # otherwise, text
        # goes with preceding template
        for piece in brace_matches:
            if piece.startswith("\n; ") and after_templates:
                after_templates = False
                template_sections.append(tsection)
                tsection = [piece]
            elif piece.startswith("{{") or piece.endswith("{|"):
                if (
                    template_nesting == 0
                    and after_templates
                    and table_nesting == 0
                ):
                    # start new section
                    template_sections.append(tsection)
                    tsection = []
                after_templates = True
                if piece.startswith("{{"):
                    # Count every brace of the group, so that e.g. the
                    # single group "}}}}" closes two "{{" openers.
                    template_nesting += len(piece)
                else:
                    # piece.endswith("{|")
                    table_nesting += 1
                tsection.append(piece)
            elif piece.startswith("}}") or piece.endswith("|}"):
                if piece.startswith("}}"):
                    template_nesting -= len(piece)
                    if template_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested braces, "
                            "couldn't split inflection templates, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/1871",
                        )
                        split_failed = True
                        break
                else:
                    table_nesting -= 1
                    if table_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested table braces, "
                            "couldn't split inflection section, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/20250114",
                        )
                        split_failed = True
                        break
                tsection.append(piece)
            else:
                tsection.append(piece)
        if split_failed:
            # use whole text; a partially collected section must not be
            # mistaken for a successful split
            template_sections = []
        elif tsection:  # dangling tsection
            template_sections.append(tsection)
            # Why do it this way around? The parser has a preference
            # to associate bits outside of tables with the preceding
            # table (`after`-variable), so a new tsection begins
            # at {{ and everything before it belongs to the previous
            # template.

    if template_sections:
        texts = ["".join(tsection) for tsection in template_sections]
    else:
        texts = [text]
    if template_nesting != 0:
        wxr.wtp.error(
            "Template nesting error: "
            "template_nesting = {} "
            "couldn't split inflection templates, "
            "{}/{} section {}".format(
                template_nesting, word, language, section
            ),
            sortid="page/1896",
        )
        texts = [text]
    for section_text in texts:
        # Blank pieces carry no templates or tables; skip them before
        # doing any parsing work.
        if not section_text.strip():
            continue

        tree = wxr.wtp.parse(
            section_text,
            expand_all=True,
            template_fn=inflection_template_fn,
        )

        # Parse inflection tables from the section.  The data is stored
        # under "forms".
        if wxr.config.capture_inflections:
            tablecontext = None
            name_match = re.search(r"{{([^}{|]+)\|?", section_text)
            if name_match:
                template_name = name_match.group(1).strip()
                tablecontext = TableContext(template_name)

            parse_inflection_section(
                wxr,
                pos_data,
                word,
                language,
                pos,
                section,
                tree,
                tablecontext=tablecontext,
            )

2696 

def get_subpage_section(
    title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
) -> Optional[Union[WikiNode, str]]:
    """Loads a subpage of the current word and finds a section in it.

    The page ``word + "/" + subtitle`` is loaded and parsed.  Each entry
    of ``seqs`` is a sequence of section titles (e.g. language,
    part-of-speech, section title) that must be matched, in order and
    case-insensitively, by headings on the way down the parse tree;
    headings that do not match are simply descended through.  The
    candidates are tried in the order given and the node reached by the
    first one that matches is returned.  This is used for finding
    translations and other sections on subpages.

    ``title`` is only used in the debug message.  Returns ``None`` when
    the subpage does not exist or when no candidate matches."""
    assert isinstance(language, str)
    assert isinstance(title, str)
    assert isinstance(subtitle, str)
    assert isinstance(seqs, (list, tuple))
    for seq in seqs:
        for x in seq:
            assert isinstance(x, str)
    subpage_title = word + "/" + subtitle
    subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
    if subpage_content is None:
        wxr.wtp.error(
            "/translations not found despite "
            "{{see translation subpage|...}}",
            sortid="page/1934",
        )
        return None

    def recurse(
        node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
    ) -> Optional[Union[str, WikiNode]]:
        """Depth-first search for the node at which all titles in ``seq``
        have been consumed."""
        if not seq:
            return node
        if not isinstance(node, WikiNode):
            return None
        if node.kind in LEVEL_KINDS:
            t = clean_node(wxr, None, node.largs[0])
            if t.lower() == seq[0].lower():
                # This heading matches the next wanted title; keep
                # looking for the remaining titles below it.
                seq = seq[1:]
                if not seq:
                    return node
        for n in node.children:
            ret = recurse(n, seq)
            if ret is not None:
                return ret
        return None

    tree = wxr.wtp.parse(
        subpage_content,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    assert tree.kind == NodeKind.ROOT
    # Return the first candidate that matches.  Previously the result of
    # an earlier candidate was overwritten by later ones (so a match could
    # be lost when a later candidate failed), and an empty ``seqs`` left
    # the result variable unbound.
    # NOTE(review): assumes ``seqs`` is ordered from most to least
    # preferred -- confirm against the callers.
    for seq in seqs:
        ret = recurse(tree, seq)
        if ret is not None:
            return ret
        wxr.wtp.debug(
            "Failed to find subpage section {}/{} seq {}".format(
                title, subtitle, seq
            ),
            sortid="page/1963",
        )
    return None

2759 

def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
    """Parses translations for a word.  This may also pull in translations
    from separate translation subpages.

    The children of ``xlatnode`` are scanned; translation list items are
    handed, together with ``data``, to parse_translation_item_text(), and
    templates are expanded to track the current sense and to follow
    ``{{see translation subpage|...}}``.  Does nothing when
    ``wxr.config.capture_translations`` is false."""
    assert isinstance(data, dict)
    assert isinstance(xlatnode, WikiNode)
    if not wxr.config.capture_translations:
        return
    # Nodes and strings collected since the last sense reset.  They are
    # converted to the sense text lazily, when the next item is parsed.
    sense_parts: list[Union[WikiNode, str]] = []
    # Current sense text; None means "recompute from sense_parts".
    sense: Optional[str] = None

    def parse_translation_item(
        contents: list[Union[WikiNode, str]], lang: Optional[str] = None
    ) -> None:
        """Parses one translation list item and, recursively, its
        sublists.  ``lang`` is the value returned for the enclosing item
        when this item comes from a sublist."""
        nonlocal sense
        assert isinstance(contents, list)
        assert lang is None or isinstance(lang, str)

        langcode: Optional[str] = None
        if sense is None:
            sense = clean_node(wxr, data, sense_parts).strip()
            idx = sense.find("See also translations at")
            if idx > 0:
                wxr.wtp.debug(
                    "Skipping translation see also: {}".format(sense),
                    sortid="page/2361",
                )
                sense = sense[:idx].strip()
            if sense.endswith(":"):
                sense = sense[:-1].strip()
            if sense.endswith("—"):
                sense = sense[:-1].strip()
        translations_from_template: list[str] = []

        def translation_item_template_fn(
            name: str, ht: TemplateArgs
        ) -> Optional[str]:
            """Records the language code and translation text of
            translation templates while the item is being cleaned."""
            nonlocal langcode
            if is_panel_template(wxr, name):
                return ""
            if name in ("t+check", "t-check", "t-needed"):
                # We ignore these templates.  They seem to have outright
                # garbage in some entries, and very varying formatting in
                # others.  These should be transitory and unreliable
                # anyway.
                return "__IGNORE__"
            if name in ("t", "t+", "t-simple", "tt", "tt+"):
                code = ht.get(1)
                if code:
                    if langcode and code != langcode:
                        wxr.wtp.debug(
                            "inconsistent language codes {} vs "
                            "{} in translation item: {!r} {}".format(
                                langcode, code, name, ht
                            ),
                            sortid="page/2386",
                        )
                    langcode = code
                tr = ht.get(2)
                if tr:
                    tr = clean_node(wxr, None, [tr])
                    translations_from_template.append(tr)
                return None
            if name == "t-egy":
                langcode = "egy"
                return None
            if name == "ttbc":
                code = ht.get(1)
                if code:
                    langcode = code
                return None
            if name == "trans-see":
                wxr.wtp.error(
                    "UNIMPLEMENTED trans-see template", sortid="page/2405"
                )
                return ""
            if name.endswith(("-top", "-bottom", "-mid")):
                return ""
            # Everything else is expanded in the default way.
            return None

        # Nested lists are handled separately below, after the item's own
        # text has been parsed.
        sublists = [
            x
            for x in contents
            if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
        ]
        contents = [
            x
            for x in contents
            if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
        ]

        item = clean_node(
            wxr, data, contents, template_fn=translation_item_template_fn
        )

        # Parse the translation item.
        if item:
            lang = parse_translation_item_text(
                wxr,
                word,
                data,
                item,
                sense,
                lang,
                langcode,
                translations_from_template,
                is_reconstruction,
            )

            # Handle sublists.  They are frequently used for different
            # scripts for the language and different variants of the
            # language.  We will include the lower-level header as a
            # tag in those cases.
            for listnode in sublists:
                assert listnode.kind == NodeKind.LIST
                for node in listnode.children:
                    if not isinstance(node, WikiNode):
                        continue
                    if node.kind == NodeKind.LIST_ITEM:
                        parse_translation_item(node.children, lang=lang)

    def parse_translation_template(node: WikiNode) -> None:
        """Expands a template found directly in the translations section,
        updating the current sense and following subpage references."""
        assert isinstance(node, WikiNode)

        def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            nonlocal sense_parts
            nonlocal sense
            if is_panel_template(wxr, name):
                return ""
            if name == "see also":
                # XXX capture
                # XXX for example, "/" has top-level list containing
                # see also items.  So also should parse those.
                return ""
            if name == "trans-see":
                # XXX capture
                return ""
            if name == "see translation subpage":
                sense_parts = []
                sense = None
                sub = ht.get(1, "")
                # The argument may look like "Etymology 2: Noun"; split it
                # into the (numbered) etymology title and the POS title.
                m = (
                    re.match(r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub)
                    if sub
                    else None
                )
                etym = ""
                etym_numbered = ""
                pos = ""
                if m:
                    etym_numbered = m.group(1)
                    etym = m.group(2)
                    pos = m.group(3)
                if not sub:
                    wxr.wtp.debug(
                        "no part-of-speech in "
                        "{{see translation subpage|...}}, "
                        "defaulting to just wxr.wtp.section "
                        "(= language)",
                        sortid="page/2468",
                    )
                    # seq sent to get_subpage_section without sub and pos
                    seq = [
                        language,
                        TRANSLATIONS_TITLE,
                    ]
                elif (
                    m
                    and etym.lower().strip() in ETYMOLOGY_TITLES
                    and pos.lower() in POS_TITLES
                ):
                    seq = [
                        language,
                        etym_numbered,
                        pos,
                        TRANSLATIONS_TITLE,
                    ]
                elif sub.lower() in POS_TITLES:
                    # seq with sub but not pos
                    seq = [
                        language,
                        sub,
                        TRANSLATIONS_TITLE,
                    ]
                else:
                    # seq with sub and pos
                    pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if pos.lower() not in POS_TITLES:
                        wxr.wtp.debug(
                            "unhandled see translation subpage: "
                            "language={} sub={} "
                            "wxr.wtp.subsection={}".format(
                                language, sub, wxr.wtp.subsection
                            ),
                            sortid="page/2478",
                        )
                    seq = [language, sub, pos, TRANSLATIONS_TITLE]
                subnode = get_subpage_section(
                    wxr.wtp.title or "MISSING_TITLE",
                    TRANSLATIONS_TITLE,
                    [seq],
                )
                if subnode is None or not isinstance(subnode, WikiNode):
                    # Failed to find the normal subpage section;
                    # retry with looser candidates.
                    pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    seqs: list[list[str] | tuple[str, ...]] = [
                        [TRANSLATIONS_TITLE],
                        [language, pos],
                    ]
                    subnode = get_subpage_section(
                        wxr.wtp.title or "MISSING_TITLE",
                        TRANSLATIONS_TITLE,
                        seqs,
                    )
                if subnode is not None and isinstance(subnode, WikiNode):
                    parse_translations(data, subnode)
                return ""
            if name in (
                "c",
                "C",
                "categorize",
                "cat",
                "catlangname",
                "topics",
                "top",
                "qualifier",
                "cln",
            ):
                # These are expanded in the default way
                return None
            if name in (
                "trans-top",
                "trans-top-see",
            ):
                # XXX capture id from trans-top?  Capture sense here
                # instead of trying to parse it from expanded content?
                sense_parts = []
                # An empty/missing first argument means the sense has to
                # be recomputed from the text that follows.
                sense = ht.get(1) or None
                return None
            if name in (
                "trans-bottom",
                "trans-mid",
                "checktrans-mid",
                "checktrans-bottom",
            ):
                return None
            if name in ("checktrans-top", "trans-top-also"):
                # XXX capture trans-top-also?
                sense_parts = []
                sense = None
                return ""
            wxr.wtp.error(
                "UNIMPLEMENTED parse_translation_template: {} {}".format(
                    name, ht
                ),
                sortid="page/2517",
            )
            return ""

        # Only the side effects of template_fn matter; the expanded text
        # is discarded.
        wxr.wtp.expand(
            wxr.wtp.node_to_wikitext(node), template_fn=template_fn
        )

    def parse_translation_recurse(xlatnode: WikiNode) -> None:
        """Walks the children of ``xlatnode``, dispatching on node kind."""
        nonlocal sense
        nonlocal sense_parts
        for node in xlatnode.children:
            if isinstance(node, str):
                if sense:
                    if not node.isspace():
                        wxr.wtp.debug(
                            "skipping string in the middle of "
                            "translations: {}".format(node),
                            sortid="page/2530",
                        )
                    continue
                # Add a part to the sense
                sense_parts.append(node)
                sense = None
                continue
            assert isinstance(node, WikiNode)
            kind = node.kind
            if kind == NodeKind.LIST:
                for item in node.children:
                    if not isinstance(item, WikiNode):
                        continue
                    if item.kind != NodeKind.LIST_ITEM:
                        continue
                    if item.sarg == ":":
                        continue
                    parse_translation_item(item.children)
            elif kind == NodeKind.LIST_ITEM and node.sarg == ":":
                # Silently skip list items that are just indented; these
                # are used for text between translations, such as
                # indicating translations that need to be checked.
                pass
            elif kind == NodeKind.TEMPLATE:
                parse_translation_template(node)
            elif kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
                NodeKind.TABLE_CELL,
            ):
                parse_translation_recurse(node)
            elif kind == NodeKind.HTML:
                if node.attrs.get("class") == "NavFrame":
                    # Reset ``sense_parts`` (and force recomputing
                    # by clearing ``sense``) as each NavFrame specifies
                    # its own sense.  This helps eliminate garbage coming
                    # from text at the beginning at the translations
                    # section.
                    sense_parts = []
                    sense = None
                parse_translation_recurse(node)
            elif kind in LEVEL_KINDS:
                # Sub-levels will be recursed elsewhere
                pass
            elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
                parse_translation_recurse(node)
            elif kind == NodeKind.PREFORMATTED:
                # Report through the normal debug channel instead of
                # printing to stdout.
                wxr.wtp.debug(
                    "parse_translation_recurse: PREFORMATTED: {}".format(
                        node
                    )
                )
            elif kind == NodeKind.LINK:
                arg0 = node.largs[0]
                # Kludge: I've seen occasional normal links to translation
                # subpages from main pages (e.g., language/English/Noun
                # in July 2021) instead of the normal
                # {{see translation subpage|...}} template.  This should
                # handle them.  Note: must be careful not to read other
                # links, particularly things like in "human being":
                # "a human being -- see [[man/translations]]" (group title)
                if (
                    isinstance(arg0, (list, tuple))
                    and arg0
                    and isinstance(arg0[0], str)
                    and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
                    and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
                    == wxr.wtp.title
                ):
                    wxr.wtp.debug(
                        "translations subpage link found on main "
                        "page instead "
                        "of normal {{see translation subpage|...}}",
                        sortid="page/2595",
                    )
                    sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if sub.lower() in POS_TITLES:
                        seq = [
                            language,
                            sub,
                            TRANSLATIONS_TITLE,
                        ]
                        subnode = get_subpage_section(
                            wxr.wtp.title,
                            TRANSLATIONS_TITLE,
                            [seq],
                        )
                        if subnode is not None and isinstance(
                            subnode, WikiNode
                        ):
                            parse_translations(data, subnode)
                    else:
                        wxr.wtp.error(
                            "/translations link outside part-of-speech"
                        )

                # The visible text of non-category links contributes to
                # the sense.
                if (
                    len(arg0) >= 1
                    and isinstance(arg0[0], str)
                    and not arg0[0].lower().startswith("category:")
                ):
                    for x in node.largs[-1]:
                        if isinstance(x, str):
                            sense_parts.append(x)
                        else:
                            parse_translation_recurse(x)
            elif not sense:
                sense_parts.append(node)
            else:
                wxr.wtp.debug(
                    "skipping text between translation items/senses: "
                    "{}".format(node),
                    sortid="page/2621",
                )

    # Main code of parse_translation().  We want ``sense`` to be assigned
    # regardless of recursion levels, and thus the code is structured
    # to define at this level and recurse in parse_translation_recurse().
    parse_translation_recurse(xlatnode)

3177 

def parse_etymology(data: WordData, node: LevelNode) -> None:
    """Parses an etymology section.

    Stores the cleaned section text under ``data["etymology_text"]``, the
    templates captured while cleaning under ``data["etymology_templates"]``
    and examples extracted from ``zh-x``/``zh-q`` templates under
    ``data["etymology_examples"]``.  Subsections of ``node`` are not
    included in the text."""
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)

    captured: list[TemplateData] = []

    # Nesting depth of ignored templates currently being expanded; while
    # it is non-zero, templates are not captured.
    ignore_count = 0

    def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        """Suppresses panel and zh-x/zh-q templates and notes when an
        ignored template starts expanding."""
        nonlocal ignore_count
        if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]:
            return ""
        if re.match(ignored_etymology_templates_re, name):
            ignore_count += 1
        return None

    def etym_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> None:
        """Records each expanded template unless it is a wikipedia
        template, an ignored template, or nested inside an ignored one."""
        nonlocal ignore_count
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
        elif re.match(ignored_etymology_templates_re, name):
            ignore_count -= 1
        elif ignore_count == 0:
            cleaned_args = clean_template_args(wxr, ht)
            cleaned_expansion = clean_node(wxr, None, expansion)
            captured.append(
                {
                    "name": name,
                    "args": cleaned_args,
                    "expansion": cleaned_expansion,
                }
            )

    # Leave out any subsections.
    top_level = [
        child
        for child in node.children
        if not (isinstance(child, WikiNode) and child.kind in LEVEL_KINDS)
    ]
    # Convert to text; templates are captured by the post_template_fn.
    raw_text = clean_node(
        wxr,
        None,
        top_level,
        template_fn=etym_template_fn,
        post_template_fn=etym_post_template_fn,
    )
    # Also removes the ":" indent wikitext before a zh-x template.
    text = raw_text.strip(": \n")

    # Save the collected information.
    if text:
        data["etymology_text"] = text
    if captured:
        # Some etymology templates, like Template:root do not generate
        # text, so they should be added here.  Elsewhere, we check
        # for Template:root and add some text to the expansion to please
        # the validation.
        data["etymology_templates"] = captured

    # Collect examples from zh-x/zh-q templates that appear before the
    # first sub-level.
    for child_node in node.find_child_recursively(
        LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
    ):
        if child_node.kind in LEVEL_KIND_FLAGS:
            break
        if not isinstance(child_node, TemplateNode):
            continue
        if child_node.template_name not in ["zh-x", "zh-q"]:
            continue
        examples = data.setdefault("etymology_examples", [])
        examples.extend(
            extract_template_zh_x(
                wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
            )
        )

3257 

3258 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3259 """This recurses into a subtree in the parse tree for a page.""" 

3260 nonlocal etym_data 

3261 nonlocal pos_data 

3262 nonlocal inside_level_four 

3263 

3264 redirect_list: list[str] = [] # for `zh-see` template 

3265 

3266 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3267 """This is called for otherwise unprocessed parts of the page. 

3268 We still expand them so that e.g. Category links get captured.""" 

3269 if name in wikipedia_templates: 

3270 data = select_data() 

3271 parse_wikipedia_template(wxr, data, ht) 

3272 return None 

3273 if is_panel_template(wxr, name): 

3274 return "" 

3275 return None 

3276 

3277 for node in treenode.children: 

3278 if not isinstance(node, WikiNode): 

3279 # print(" X{}".format(repr(node)[:40])) 

3280 continue 

3281 if isinstance(node, TemplateNode): 

3282 if process_soft_redirect_template(wxr, node, redirect_list): 

3283 continue 

3284 elif node.template_name == "zh-forms": 

3285 extract_zh_forms_template(wxr, node, select_data()) 

3286 elif ( 

3287 node.template_name.endswith("-kanjitab") 

3288 or node.template_name == "ja-kt" 

3289 ): 

3290 extract_ja_kanjitab_template(wxr, node, select_data()) 

3291 

3292 if not isinstance(node, LevelNode): 

3293 # XXX handle e.g. wikipedia links at the top of a language 

3294 # XXX should at least capture "also" at top of page 

3295 if node.kind in ( 

3296 NodeKind.HLINE, 

3297 NodeKind.LIST, 

3298 NodeKind.LIST_ITEM, 

3299 ): 

3300 continue 

3301 # print(" UNEXPECTED: {}".format(node)) 

3302 # Clean the node to collect category links 

3303 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3304 continue 

3305 t = clean_node( 

3306 wxr, etym_data, node.sarg if node.sarg else node.largs 

3307 ) 

3308 t = t.lower() 

3309 # XXX these counts were never implemented fully, and even this 

3310 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3311 wxr.config.section_counts[t] += 1 

3312 # print("PROCESS_CHILDREN: T:", repr(t)) 

3313 if t in IGNORED_TITLES: 

3314 pass 

3315 elif t.startswith(PRONUNCIATION_TITLE): 

3316 # Chinese Pronunciation section kludge; we demote these to 

3317 # be level 4 instead of 3 so that they're part of a larger 

3318 # etymology hierarchy; usually the data here is empty and 

3319 # acts as an inbetween between POS and Etymology data 

3320 if lang_code in ("zh",): 

3321 inside_level_four = True 

3322 if t.startswith(PRONUNCIATION_TITLE + " "): 

3323 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3324 # and each of them may have senses under Definition 

3325 push_level_four_section(True) 

3326 wxr.wtp.start_subsection(None) 

3327 if wxr.config.capture_pronunciation: 3327 ↛ 3435line 3327 didn't jump to line 3435 because the condition on line 3327 was always true

3328 data = select_data() 

3329 parse_pronunciation( 

3330 wxr, 

3331 node, 

3332 data, 

3333 etym_data, 

3334 have_etym, 

3335 base_data, 

3336 lang_code, 

3337 ) 

3338 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3339 push_etym() 

3340 wxr.wtp.start_subsection(None) 

3341 if wxr.config.capture_etymologies: 3341 ↛ 3435line 3341 didn't jump to line 3435 because the condition on line 3341 was always true

3342 m = re.search(r"\s(\d+(\.\d+)?)$", t) 

3343 if m: 

3344 etym_data["etymology_number"] = m.group(1) 

3345 parse_etymology(etym_data, node) 

3346 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3347 data = select_data() 

3348 extract_descendant_section(wxr, data, node, False) 

3349 elif ( 

3350 t in PROTO_ROOT_DERIVED_TITLES 

3351 and pos == "root" 

3352 and is_reconstruction 

3353 and wxr.config.capture_descendants 

3354 ): 

3355 data = select_data() 

3356 extract_descendant_section(wxr, data, node, True) 

3357 elif t == TRANSLATIONS_TITLE: 

3358 data = select_data() 

3359 parse_translations(data, node) 

3360 elif t in INFLECTION_TITLES: 

3361 parse_inflection(node, t, pos) 

3362 elif t == "alternative forms": 

3363 extract_alt_form_section(wxr, select_data(), node) 

3364 else: 

3365 lst = t.split() 

3366 while len(lst) > 1 and lst[-1].isdigit(): 

3367 lst = lst[:-1] 

3368 t_no_number = " ".join(lst).lower() 

3369 if t_no_number in POS_TITLES: 

3370 push_pos() 

3371 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3372 pos = dt["pos"] or "MISSING_POS" 

3373 wxr.wtp.start_subsection(t) 

3374 if "debug" in dt: 

3375 wxr.wtp.debug( 

3376 "{} in section {}".format(dt["debug"], t), 

3377 sortid="page/2755", 

3378 ) 

3379 if "warning" in dt: 3379 ↛ 3380line 3379 didn't jump to line 3380 because the condition on line 3379 was never true

3380 wxr.wtp.wiki_notice( 

3381 "{} in section {}".format(dt["warning"], t), 

3382 sortid="page/2759", 

3383 ) 

3384 if "error" in dt: 3384 ↛ 3385line 3384 didn't jump to line 3385 because the condition on line 3384 was never true

3385 wxr.wtp.error( 

3386 "{} in section {}".format(dt["error"], t), 

3387 sortid="page/2763", 

3388 ) 

3389 if "note" in dt: 3389 ↛ 3390line 3389 didn't jump to line 3390 because the condition on line 3389 was never true

3390 wxr.wtp.note( 

3391 "{} in section {}".format(dt["note"], t), 

3392 sortid="page/20251017a", 

3393 ) 

3394 if "wiki_notice" in dt: 3394 ↛ 3395line 3394 didn't jump to line 3395 because the condition on line 3394 was never true

3395 wxr.wtp.wiki_notice( 

3396 "{} in section {}".format(dt["wiki_notices"], t), 

3397 sortid="page/20251017b", 

3398 ) 

3399 # Parse word senses for the part-of-speech 

3400 parse_part_of_speech(node, pos) 

3401 if "tags" in dt: 

3402 for pdata in sense_datas: 

3403 data_extend(pdata, "tags", dt["tags"]) 

3404 elif t_no_number in LINKAGE_TITLES: 

3405 # print(f"LINKAGE_TITLES NODE {node=}") 

3406 rel = LINKAGE_TITLES[t_no_number] 

3407 data = select_data() 

3408 parse_linkage( 

3409 wxr, 

3410 data, 

3411 rel, 

3412 node, 

3413 word, 

3414 sense_datas, 

3415 is_reconstruction, 

3416 ) 

3417 elif t_no_number == COMPOUNDS_TITLE: 

3418 data = select_data() 

3419 if wxr.config.capture_compounds: 3419 ↛ 3435line 3419 didn't jump to line 3435 because the condition on line 3419 was always true

3420 parse_linkage( 

3421 wxr, 

3422 data, 

3423 "derived", 

3424 node, 

3425 word, 

3426 sense_datas, 

3427 is_reconstruction, 

3428 ) 

3429 

3430 # XXX parse interesting templates also from other sections. E.g., 

3431 # {{Letter|...}} in ===See also=== 

3432 # Also <gallery> 

3433 

3434 # Recurse to children of this node, processing subtitles therein 

3435 stack.append(t) 

3436 process_children(node, pos) 

3437 stack.pop() 

3438 

3439 if len(redirect_list) > 0: 

3440 if len(pos_data) > 0: 

3441 pos_data["redirects"] = redirect_list 

3442 if "pos" not in pos_data: 3442 ↛ 3443line 3442 didn't jump to line 3443 because the condition on line 3442 was never true

3443 pos_data["pos"] = "soft-redirect" 

3444 else: 

3445 new_page_data = copy.deepcopy(base_data) 

3446 new_page_data["redirects"] = redirect_list 

3447 if "pos" not in new_page_data: 3447 ↛ 3449line 3447 didn't jump to line 3449 because the condition on line 3447 was always true

3448 new_page_data["pos"] = "soft-redirect" 

3449 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3450 page_datas.append(new_page_data) 

3451 

3452 def extract_examples( 

3453 others: list[WikiNode], sense_base: SenseData 

3454 ) -> list[ExampleData]: 

3455 """Parses through a list of definitions and quotes to find examples. 

3456 Returns a list of example dicts to be added to sense data. Adds 

3457 meta-data, mostly categories, into sense_base.""" 

3458 assert isinstance(others, list) 

3459 examples: list[ExampleData] = [] 

3460 

3461 for sub in others: 

3462 if not sub.sarg.endswith((":", "*")): 3462 ↛ 3463line 3462 didn't jump to line 3463 because the condition on line 3462 was never true

3463 continue 

3464 for item in sub.children: 

3465 if not isinstance(item, WikiNode): 3465 ↛ 3466line 3465 didn't jump to line 3466 because the condition on line 3465 was never true

3466 continue 

3467 if item.kind != NodeKind.LIST_ITEM: 3467 ↛ 3468line 3467 didn't jump to line 3468 because the condition on line 3467 was never true

3468 continue 

3469 usex_type = None 

3470 example_template_args = [] 

3471 example_template_names = [] 

3472 taxons = set() 

3473 

3474 # Bypass this function when parsing Chinese, Japanese and 

3475 # quotation templates. 

3476 new_example_lists = extract_example_list_item( 

3477 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3478 ) 

3479 if len(new_example_lists) > 0: 

3480 examples.extend(new_example_lists) 

3481 continue 

3482 

3483 def usex_template_fn( 

3484 name: str, ht: TemplateArgs 

3485 ) -> Optional[str]: 

3486 nonlocal usex_type 

3487 if is_panel_template(wxr, name): 

3488 return "" 

3489 if name in usex_templates: 

3490 usex_type = "example" 

3491 example_template_args.append(ht) 

3492 example_template_names.append(name) 

3493 elif name in quotation_templates: 

3494 usex_type = "quotation" 

3495 elif name in taxonomy_templates: 3495 ↛ 3496line 3495 didn't jump to line 3496 because the condition on line 3495 was never true

3496 taxons.update(ht.get(1, "").split()) 

3497 for prefix in template_linkages_to_ignore_in_examples: 

3498 if re.search( 

3499 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3500 ): 

3501 return "" 

3502 return None 

3503 

3504 # bookmark 

3505 ruby: list[tuple[str, str]] = [] 

3506 contents = item.children 

3507 if lang_code == "ja": 

3508 # Capture ruby contents if this is a Japanese language 

3509 # example. 

3510 # print(contents) 

3511 if ( 3511 ↛ 3516line 3511 didn't jump to line 3516 because the condition on line 3511 was never true

3512 contents 

3513 and isinstance(contents, str) 

3514 and re.match(r"\s*$", contents[0]) 

3515 ): 

3516 contents = contents[1:] 

3517 exp = wxr.wtp.parse( 

3518 wxr.wtp.node_to_wikitext(contents), 

3519 # post_template_fn=head_post_template_fn, 

3520 expand_all=True, 

3521 ) 

3522 rub, rest = extract_ruby(wxr, exp.children) 

3523 if rub: 

3524 for rtup in rub: 

3525 ruby.append(rtup) 

3526 contents = rest 

3527 subtext = clean_node( 

3528 wxr, sense_base, contents, template_fn=usex_template_fn 

3529 ) 

3530 

3531 frozen_taxons = frozenset(taxons) 

3532 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3533 

3534 # print(f"{subtext=}") 

3535 subtext = re.sub( 

3536 r"\s*\(please add an English " 

3537 r"translation of this " 

3538 r"(example|usage example|quote)\)", 

3539 "", 

3540 subtext, 

3541 ).strip() 

3542 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3543 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3544 # print("subtext:", repr(subtext)) 

3545 

3546 lines = subtext.splitlines() 

3547 # print(lines) 

3548 

3549 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3550 lines = list( 

3551 x 

3552 for x in lines 

3553 if not re.match( 

3554 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3555 r"Synonym: |Antonym: |Hyponym: |" 

3556 r"Hypernyms: |Derived terms: |" 

3557 r"Related terms: |" 

3558 r"Hypernym: |Derived term: |" 

3559 r"Coordinate terms:|" 

3560 r"Related term: |" 

3561 r"For more quotations using )", 

3562 x, 

3563 ) 

3564 ) 

3565 tr = "" 

3566 ref = "" 

3567 roman = "" 

3568 # for line in lines: 

3569 # print("LINE:", repr(line)) 

3570 # print(classify_desc(line)) 

3571 if len(lines) == 1 and lang_code != "en": 

3572 parts = example_splitter_re.split(lines[0]) 

3573 if ( 3573 ↛ 3581line 3573 didn't jump to line 3581 because the condition on line 3573 was never true

3574 len(parts) > 2 

3575 and len(example_template_args) == 1 

3576 and any( 

3577 ("―" in s) or ("—" in s) 

3578 for s in example_template_args[0].values() 

3579 ) 

3580 ): 

3581 if nparts := synch_splits_with_args( 

3582 lines[0], example_template_args[0] 

3583 ): 

3584 parts = nparts 

3585 if ( 3585 ↛ 3590line 3585 didn't jump to line 3590 because the condition on line 3585 was never true

3586 len(example_template_args) == 1 

3587 and "lit" in example_template_args[0] 

3588 ): 

3589 # ugly brute-force kludge in case there's a lit= arg 

3590 literally = example_template_args[0].get("lit", "") 

3591 if literally: 

3592 literally = ( 

3593 " (literally, “" 

3594 + clean_value(wxr, literally) 

3595 + "”)" 

3596 ) 

3597 else: 

3598 literally = "" 

3599 if ( 3599 ↛ 3638line 3599 didn't jump to line 3638 because the condition on line 3599 was never true

3600 len(example_template_args) == 1 

3601 and len(parts) == 2 

3602 and len(example_template_args[0]) 

3603 - ( 

3604 # horrible kludge to ignore these arguments 

3605 # when calculating how many there are 

3606 sum( 

3607 s in example_template_args[0] 

3608 for s in ( 

3609 "lit", # generates text, but we handle it 

3610 "inline", 

3611 "noenum", 

3612 "nocat", 

3613 "sort", 

3614 ) 

3615 ) 

3616 ) 

3617 == 3 

3618 and clean_value( 

3619 wxr, example_template_args[0].get(2, "") 

3620 ) 

3621 == parts[0].strip() 

3622 and clean_value( 

3623 wxr, 

3624 ( 

3625 example_template_args[0].get(3) 

3626 or example_template_args[0].get("translation") 

3627 or example_template_args[0].get("t", "") 

3628 ) 

3629 + literally, # in case there's a lit= argument 

3630 ) 

3631 == parts[1].strip() 

3632 ): 

3633 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3634 # is a pretty reliable 'heuristic', so we use it here 

3635 # before the others. To be extra sure the template 

3636 # doesn't do anything weird, we compare the arguments 

3637 # and the output to each other. 

3638 lines = [parts[0].strip()] 

3639 tr = parts[1].strip() 

3640 elif ( 

3641 len(parts) == 2 

3642 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3643 ): 

3644 # These other branches just do some simple heuristics w/ 

3645 # the expanded output of the template (if applicable). 

3646 lines = [parts[0].strip()] 

3647 tr = parts[1].strip() 

3648 elif ( 3648 ↛ 3654line 3648 didn't jump to line 3654 because the condition on line 3648 was never true

3649 len(parts) == 3 

3650 and classify_desc2(parts[1]) 

3651 in ("romanization", "english") 

3652 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3653 ): 

3654 lines = [parts[0].strip()] 

3655 roman = parts[1].strip() 

3656 tr = parts[2].strip() 

3657 else: 

3658 parts = re.split(r"\s+-\s+", lines[0]) 

3659 if ( 3659 ↛ 3663line 3659 didn't jump to line 3663 because the condition on line 3659 was never true

3660 len(parts) == 2 

3661 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3662 ): 

3663 lines = [parts[0].strip()] 

3664 tr = parts[1].strip() 

3665 elif len(lines) > 1: 

3666 if any( 

3667 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3668 ) and not (len(example_template_names) == 1): 

3669 refs: list[str] = [] 

3670 for i in range(len(lines)): 3670 ↛ 3676line 3670 didn't jump to line 3676 because the loop on line 3670 didn't complete

3671 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3671 ↛ 3672line 3671 didn't jump to line 3672 because the condition on line 3671 was never true

3672 break 

3673 refs.append(lines[i].strip()) 

3674 if re.search(r"[]\d:)]\s*$", lines[i]): 

3675 break 

3676 ref = " ".join(refs) 

3677 lines = lines[i + 1 :] 

3678 if ( 

3679 lang_code != "en" 

3680 and len(lines) >= 2 

3681 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3682 ): 

3683 i = len(lines) - 1 

3684 while ( 3684 ↛ 3689line 3684 didn't jump to line 3689 because the condition on line 3684 was never true

3685 i > 1 

3686 and classify_desc2(lines[i - 1]) 

3687 in ENGLISH_TEXTS 

3688 ): 

3689 i -= 1 

3690 tr = "\n".join(lines[i:]) 

3691 lines = lines[:i] 

3692 if len(lines) >= 2: 

3693 if classify_desc2(lines[-1]) == "romanization": 

3694 roman = lines[-1].strip() 

3695 lines = lines[:-1] 

3696 

3697 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

3698 ref = lines[0] 

3699 lines = lines[1:] 

3700 elif lang_code != "en" and len(lines) == 2: 

3701 cls1 = classify_desc2(lines[0]) 

3702 cls2 = classify_desc2(lines[1]) 

3703 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3704 tr = lines[1] 

3705 lines = [lines[0]] 

3706 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3706 ↛ 3707line 3706 didn't jump to line 3707 because the condition on line 3706 was never true

3707 tr = lines[0] 

3708 lines = [lines[1]] 

3709 elif ( 3709 ↛ 3716line 3709 didn't jump to line 3716 because the condition on line 3709 was never true

3710 re.match(r"^[#*]*:+", lines[1]) 

3711 and classify_desc2( 

3712 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3713 ) 

3714 in ENGLISH_TEXTS 

3715 ): 

3716 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3717 lines = [lines[0]] 

3718 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3719 # Both were classified as English, but 

3720 # presumably one is not. Assume first is 

3721 # non-English, as that seems more common. 

3722 tr = lines[1] 

3723 lines = [lines[0]] 

3724 elif ( 

3725 usex_type != "quotation" 

3726 and lang_code != "en" 

3727 and len(lines) == 3 

3728 ): 

3729 cls1 = classify_desc2(lines[0]) 

3730 cls2 = classify_desc2(lines[1]) 

3731 cls3 = classify_desc2(lines[2]) 

3732 if ( 

3733 cls3 == "english" 

3734 and cls2 in ("english", "romanization") 

3735 and cls1 != "english" 

3736 ): 

3737 tr = lines[2].strip() 

3738 roman = lines[1].strip() 

3739 lines = [lines[0].strip()] 

3740 elif ( 3740 ↛ 3748line 3740 didn't jump to line 3748 because the condition on line 3740 was never true

3741 usex_type == "quotation" 

3742 and lang_code != "en" 

3743 and len(lines) > 2 

3744 ): 

3745 # for x in lines: 

3746 # print(" LINE: {}: {}" 

3747 # .format(classify_desc2(x), x)) 

3748 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3749 ref = lines[0] 

3750 lines = lines[2:] 

3751 cls1 = classify_desc2(lines[-1]) 

3752 if cls1 == "english": 

3753 i = len(lines) - 1 

3754 while ( 

3755 i > 1 

3756 and classify_desc2(lines[i - 1]) 

3757 == ENGLISH_TEXTS 

3758 ): 

3759 i -= 1 

3760 tr = "\n".join(lines[i:]) 

3761 lines = lines[:i] 

3762 

3763 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3764 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3765 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3766 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3767 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3768 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3769 ref = re.sub( 

3770 r", (volume |number |page )?“?" 

3771 r"\(please specify ([^)]|\(s\))*\)”?|" 

3772 ", text here$", 

3773 "", 

3774 ref, 

3775 ) 

3776 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3777 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3778 subtext = "\n".join(x for x in lines if x) 

3779 if not tr and lang_code != "en": 

3780 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3781 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3781 ↛ 3782line 3781 didn't jump to line 3782 because the condition on line 3781 was never true

3782 tr = m.group(2) 

3783 subtext = subtext[: m.start()] + m.group(1) 

3784 elif lines: 

3785 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

3786 if ( 3786 ↛ 3790line 3786 didn't jump to line 3790 because the condition on line 3786 was never true

3787 len(parts) == 2 

3788 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3789 ): 

3790 subtext = parts[0].strip() 

3791 tr = parts[1].strip() 

3792 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

3793 subtext = re.sub( 

3794 r"(please add an English translation of " 

3795 r"this (quote|usage example))", 

3796 "", 

3797 subtext, 

3798 ) 

3799 subtext = re.sub( 

3800 r"\s*→New International Version " "translation$", 

3801 "", 

3802 subtext, 

3803 ) # e.g. pis/Tok Pisin (Bible) 

3804 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

3805 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

3806 note = None 

3807 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

3808 if ( 3808 ↛ 3816line 3808 didn't jump to line 3816 because the condition on line 3808 was never true

3809 m is not None 

3810 and lang_code != "en" 

3811 and ( 

3812 m.group(1).startswith("with ") 

3813 or classify_desc2(m.group(1)) == "english" 

3814 ) 

3815 ): 

3816 note = m.group(1) 

3817 subtext = subtext[m.end() :] 

3818 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

3819 ref = re.sub(r",\s*→ISBN", "", ref) 

3820 ref = ref.strip() 

3821 if ref.endswith(":") or ref.endswith(","): 

3822 ref = ref[:-1].strip() 

3823 ref = re.sub(r"\s+,\s+", ", ", ref) 

3824 ref = re.sub(r"\s+", " ", ref) 

3825 if ref and not subtext: 3825 ↛ 3826line 3825 didn't jump to line 3826 because the condition on line 3825 was never true

3826 subtext = ref 

3827 ref = "" 

3828 if subtext: 

3829 dt: ExampleData = {"text": subtext} 

3830 if ref: 

3831 dt["ref"] = ref 

3832 if tr: 

3833 dt["english"] = tr # DEPRECATED for "translation" 

3834 dt["translation"] = tr 

3835 if usex_type: 

3836 dt["type"] = usex_type 

3837 if note: 3837 ↛ 3838line 3837 didn't jump to line 3838 because the condition on line 3837 was never true

3838 dt["note"] = note 

3839 if roman: 

3840 dt["roman"] = roman 

3841 if ruby: 

3842 dt["ruby"] = ruby 

3843 examples.append(dt) 

3844 

3845 return examples 

3846 

3847 # Main code of parse_language() 

3848 # Process the section 

3849 stack.append(language) 

3850 process_children(langnode, None) 

3851 stack.pop() 

3852 

3853 # Finalize word entires 

3854 push_etym() 

3855 ret = [] 

3856 for data in page_datas: 

3857 merge_base(data, base_data) 

3858 ret.append(data) 

3859 

3860 # Copy all tags to word senses 

3861 for data in ret: 

3862 if "senses" not in data: 3862 ↛ 3863line 3862 didn't jump to line 3863 because the condition on line 3862 was never true

3863 continue 

3864 # WordData should not have a 'tags' field, but if it does, it's 

3865 # deleted and its contents removed and placed in each sense; 

3866 # that's why the type ignores. 

3867 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

3868 if "tags" in data: 

3869 del data["tags"] # type: ignore[typeddict-item] 

3870 for sense in data["senses"]: 

3871 data_extend(sense, "tags", tags) 

3872 

3873 return ret 

3874 

3875 

def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Record the target of a {{wikipedia|...}}-style template in `data`.

    The cleaned first positional argument is used as the page name; when
    it is empty the current page title is used, and failing that the
    placeholder "MISSING_PAGE_TITLE". If a ``lang`` argument is present,
    the stored value is prefixed with it as ``lang:page``. The result is
    appended to ``data["wikipedia"]``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)

    lang_prefix = clean_node(wxr, data, ht.get("lang", ()))
    target = clean_node(wxr, data, ht.get(1, ()))
    if not target:
        # No explicit page argument: fall back to the page being parsed.
        target = wxr.wtp.title or "MISSING_PAGE_TITLE"

    entry = f"{lang_prefix}:{target}" if lang_prefix else target
    data_append(data, "wikipedia", entry)

3893 

3894 

def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Handle a template found at the top level of a page, i.e. before
    any language subtitle.

    Wikipedia-style templates and {{wikidata}} store information in
    `data`; a fixed set of other known templates is discarded; anything
    else is discarded too, but reported through a debug message.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Templates that are recognised but whose content is currently thrown
    # away. XXX most of these should eventually be captured:
    #   - "see also", "cardinalbox", "character info": capture
    #   - "commonscat": capture link to Wikimedia commons
    #   - "wrongtitle": should replace the page title with the correct
    #     title. E.g. ⿰亻革家
    discarded_names = frozenset(
        (
            "reconstruction",
            "see also",
            "cardinalbox",
            "character info",
            "commonscat",
            "wrongtitle",
        )
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        if name in discarded_names:
            return ""
        lowered = name.lower()
        if lowered == "also" or lowered.startswith("also/"):
            # XXX shows related words that might really have been the
            # intended word, capture them
            return ""
        if name == "wikidata":
            arg = clean_node(wxr, data, ht.get(1, ()))
            # Only item ids ("Q...") and lexeme ids are stored.
            if arg.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", arg)
            return ""
        wxr.wtp.debug(
            f"UNIMPLEMENTED top-level template: {name} {ht}",
            sortid="page/2870",
        )
        return ""

    # The cleaned text is not needed; the call is made for the side
    # effects of `top_template_fn`.
    clean_node(wxr, None, [node], template_fn=top_template_fn)

3944 

3945 

def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Pronunciation -> Part-of-Speech -> Translation/Linkage. Also merge
    Etymology sections that are next to each other.

    The page text is split on heading lines and every heading is
    re-emitted with a normalized number of ``=`` signs:

    * level 2: language names (as recognised by `name_to_code`)
    * level 3: titles starting with one of ETYMOLOGY_TITLES
    * level 4: titles starting with PRONUNCIATION_TITLE
    * level 5: titles in POS_TITLES
    * level 6: everything else (translations, linkages, compounds,
      inflections, descendants, proto-root derived sections, ignored
      titles and unrecognised titles)

    Returns the rewritten page text.
    """

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    old = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
    )

    npar = 4  # Number of parentheses in above expression
    # `old` is: text before the first heading, then for every heading the
    # npar captured groups followed by the text up to the next heading.
    parts = [old[0]]
    # Position in `parts` of the most recently emitted heading. A merged
    # section appends only its body, so after a merge the heading is no
    # longer at parts[-2]; it has to be tracked explicitly.
    last_title_index: Optional[int] = None
    prev_level = None
    level = None
    for i in range(1, len(old), npar + 1):
        left = old[i]
        right = old[i + npar - 1]
        # remove Wikilinks in title
        title = re.sub(r"^\[\[", "", old[i + 1])
        title = re.sub(r"\]\]$", "", title)
        prev_level = level
        level = len(left)
        part = old[i + npar]
        merge_into_previous = False  # When combining etymology sections
        if level != len(right):
            wxr.wtp.debug(
                "subtitle has unbalanced levels: "
                "{!r} has {} on the left and {} on the right".format(
                    title, left, right
                ),
                sortid="page/2904",
            )
        lc = title.lower()
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    "subtitle has language name {} at level {}".format(
                        title, level
                    ),
                    sortid="page/2911",
                )
            level = 2
        elif lc.startswith(tuple(ETYMOLOGY_TITLES)):
            if level > 3:
                wxr.wtp.debug(
                    "etymology section {} at level {}".format(title, level),
                    sortid="page/2917",
                )
            level = 3
            # Only the etymology branch produces level 3, so a previous
            # level of 3 means two etymology-type sections (e.g. Glyph
            # Origin + Etymology) are cheek-to-cheek.
            if prev_level == 3 and last_title_index is not None:
                merge_into_previous = True
                # Replace the title of the previous section, in case this
                # one is more meaningful (like "Etymology 1"). The heading
                # is written at the normalized level, like every other
                # etymology heading.
                parts[last_title_index] = "{0}{1}{0}".format(
                    "=" * level, title
                )
        elif lc.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lc in POS_TITLES:
            level = 5
        else:
            # Translations, linkages, compounds, inflections, descendants,
            # proto-root derived terms, ignored titles and anything that
            # is not recognised all go to the deepest level.
            level = 6
        if not merge_into_previous:
            last_title_index = len(parts)
            parts.append("{0}{1}{0}".format("=" * level, title))
        parts.append(part)

    return "".join(parts)

4050 

4051 

def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse the wikitext of the page titled `word` into word entries.

    The text is cleaned of inclusion-control tags, its heading levels are
    normalized, and each level-2 (language) section is handed to
    `parse_language`. Data collected from templates that precede the
    first language section is copied into every resulting entry.
    Returns the entries grouped by language, in order of first
    appearance of each language.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages. They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    for tag_name in ("noinclude", "onlyinclude", "includeonly"):
        text = re.sub(rf"(?si)<(/)?{tag_name}\s*>", "", text)

    # Fix up the subtitle hierarchy. There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun. Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )

    # Filled in by templates that appear before any language section.
    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for top_node in tree.children:
        if not isinstance(top_node, WikiNode):
            continue
        node_kind = top_node.kind
        if node_kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, top_node, top_data)
            continue
        if node_kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if node_kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {top_node}", sortid="page/3014"
            )
            continue

        lang_name = clean_node(wxr, None, top_node.sarg or top_node.largs)
        lang_code = name_to_code(lang_name, "en")
        if lang_code == "":
            # Reported only; the section is still processed below unless
            # the capture filter rejects the empty code.
            wxr.wtp.debug(
                f"unrecognized language name: {lang_name}", sortid="page/3019"
            )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang_name)

        # Collect all words from the page, and propagate fields resulting
        # from top-level templates to each of them.
        for entry in parse_language(wxr, top_node, lang_name, lang_code):
            if "lang" not in entry:
                wxr.wtp.debug(
                    f"internal error -- no lang in data: {entry}",
                    sortid="page/3034",
                )
                continue
            for key, values in top_data.items():
                assert isinstance(values, (list, tuple))
                data_extend(entry, key, values)
            by_lang[entry["lang"]].append(entry)

    # XXX this code is clearly out of date. There is no longer a "conjugation"
    # field. FIX OR REMOVE.
    # Do some post-processing on the words. For example, we may distribute
    # conjugation information to all the words.
    ret = [entry for group in by_lang.values() for entry in group]

    for entry in ret:
        if entry["word"] != word:
            if word.startswith("Unsupported titles/"):
                wxr.wtp.debug(
                    f"UNSUPPORTED TITLE: '{word}' -> '{entry['word']}'",
                    sortid="20231101/3578page.py",
                )
            else:
                wxr.wtp.debug(
                    f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{entry['word']}'",
                    sortid="20231101/3582page.py",
                )
            entry["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, entry)  # type:ignore[arg-type]
    return ret

4163 

4164 

def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Split ``data["tags"]`` into valid tags and raw tags, in place.

    Tags found in `valid_tags` stay under "tags"; all others are moved to
    "raw_tags" (appended to an existing "raw_tags" list if there is one).
    A "tags" key left without any valid tag is removed. The same
    treatment is applied to every item of any list value whose first
    element is a dict.

    If `data` itself is not a dict, an error is logged through `wxr` and
    nothing else happens.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return

    kept: list[str] = []
    demoted: list[str] = data.get("raw_tags", [])
    for key, value in data.items():
        if key == "tags":
            for tag in value:
                (kept if tag in valid_tags else demoted).append(tag)
        # Only the first element is inspected to decide whether a list
        # holds nested records.
        if isinstance(value, list) and value and isinstance(value[0], dict):
            for child in value:
                recursively_separate_raw_tags(wxr, child)

    if kept:
        data["tags"] = kept
    elif "tags" in data:
        del data["tags"]
    if demoted:
        data["raw_tags"] = demoted

4195 

4196 

def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect the targets of a soft-redirect template.

    Returns `True` if `template_node` is a soft redirect template
    ("zh-see", "ja-see" or "ja-see-kango"), `False` otherwise. Non-empty
    cleaned target titles are appended to `redirect_pages`.
    """
    name = template_node.template_name
    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        # Only the first positional argument is read.
        raw_targets = [template_node.template_parameters.get(1, "")]
    elif name in ("ja-see", "ja-see-kango"):
        # https://en.wiktionary.org/wiki/Template:ja-see
        # Every positional (integer-keyed) argument is read.
        raw_targets = [
            value
            for key, value in template_node.template_parameters.items()
            if isinstance(key, int)
        ]
    else:
        return False

    for raw_target in raw_targets:
        title = clean_node(wxr, None, raw_target)
        if title != "":
            redirect_pages.append(title)
    return True

4220 

4221 

# Maps row-header labels looked up by the zh-forms extraction functions
# to the tag stored on a form. Labels not listed here are stored as raw
# tags by those functions.
ZH_FORMS_TAGS: dict[str, str] = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "alternative forms": "alternative",
    "2nd round simp.": "Second-Round-Simplified-Chinese",
}

4228 

4229 

def extract_zh_forms_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract data from a {{zh-forms}} template into `base_data`.

    The ``lit`` argument, when non-empty, is stored as
    ``base_data["literal_meaning"]``. The template is then expanded and
    every table row is walked cell by cell: header cells are handled by
    `extract_zh_forms_header_cell`, data cells by
    `extract_zh_forms_data_cell`.
    """
    # https://en.wiktionary.org/wiki/Template:zh-forms
    literal = clean_node(wxr, None, t_node.template_parameters.get("lit", ""))
    if literal != "":
        base_data["literal_meaning"] = literal

    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    table_rows = (
        row
        for table in expanded.find_child(NodeKind.TABLE)
        for row in table.find_child(NodeKind.TABLE_ROW)
    )
    for row in table_rows:
        # Header state is per row and is refreshed by each header cell.
        header_text = ""
        header_tags: list[str] = []
        header_has_span = False
        for cell in row.find_child(cell_kinds):
            if cell.kind == NodeKind.TABLE_HEADER_CELL:
                header_text, header_tags, header_has_span = (
                    extract_zh_forms_header_cell(wxr, base_data, cell)
                )
            elif not header_has_span:
                # Data cells are skipped when the preceding header cell
                # contained a span of its own.
                extract_zh_forms_data_cell(
                    wxr, base_data, cell, header_text, header_tags
                )

    # Do not leave an empty "forms" list behind.
    if "forms" in base_data and len(base_data["forms"]) == 0:
        del base_data["forms"]

4261 

4262 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process one header cell of an expanded {{zh-forms}} table.

    Returns a tuple of:
      * the cleaned text of the children preceding the first direct
        ``<span>`` child (the whole cell when there is none),
      * that text split on " and " into non-empty, stripped labels,
      * whether the cell has a direct ``<span>`` child.

    As a side effect, forms found in ``zh-Hant``/``zh-Hans`` spans
    anywhere inside the cell are appended to ``base_data["forms"]``.
    """
    span_indexes = [
        index for index, _ in header_cell.find_html("span", with_index=True)
    ]
    header_has_span = len(span_indexes) > 0
    first_span_index = min([len(header_cell.children), *span_indexes])

    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_header_tags = [
        label
        for label in (piece.strip() for piece in row_header.split(" and "))
        if label != ""
    ]

    for span_tag in header_cell.find_html_recursively("span"):
        if span_tag.attrs.get("lang", "") not in ("zh-Hant", "zh-Hans"):
            continue
        # Separate <sup> children from the nodes that make up the form
        # text; the title of the last span inside a <sup> is kept.
        form_nodes = []
        sup_title = ""
        for child in span_tag.children:
            if isinstance(child, HTMLNode) and child.tag == "sup":
                for sup_span in child.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(child)

        for word in clean_node(wxr, None, form_nodes).split("/"):
            if word in (wxr.wtp.title, ""):
                continue
            form = {"form": word}
            for label in row_header_tags:
                if label in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[label])
                else:
                    data_append(form, "raw_tags", label)
            if sup_title != "":
                data_append(form, "raw_tags", sup_title)
            data_append(base_data, "forms", form)

    return row_header, row_header_tags, header_has_span

4302 

4303 

# The two keys under which tag lists are stored; the tuple allows
# iterating over them while keeping the literal key type.
TagLiteral = Literal["tags", "raw_tags"]
TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags")

4306 

4307 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordData,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
) -> None:
    """Process one data cell of an expanded {{zh-forms}} table.

    Direct ``<span>`` children of `cell` are handled by kind:
      * style exactly "white-space:nowrap;": this function is called
        again on the span itself;
      * style containing "font-size:80%": the cleaned text is a label
        applied to every form collected so far in this cell;
      * lang "zh-Hant", "zh-Hans" or "zh": the cleaned text is a form.

    Collected forms are added to ``base_data["forms"]``, or, when
    `row_header` is "anagram", to ``base_data["anagrams"]`` as linkage
    entries.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    is_anagram_row = row_header == "anagram"
    forms: list[FormData] = []
    for span in cell.find_html("span"):
        style = span.attrs.get("style", "")
        lang = span.attrs.get("lang", "")

        if style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, span, row_header, row_header_tags
            )
            continue

        if "font-size:80%" in style:
            label = clean_node(wxr, None, span)
            if label == "":
                continue
            for form in forms:
                if label in ZH_PRON_TAGS:
                    mapped = ZH_PRON_TAGS[label]
                    if isinstance(mapped, list):
                        data_extend(form, "tags", mapped)
                    elif isinstance(mapped, str):
                        data_append(form, "tags", mapped)
                elif label in valid_tags:
                    data_append(form, "tags", label)
                else:
                    data_append(form, "raw_tags", label)
            continue

        if lang not in ("zh-Hant", "zh-Hans", "zh"):
            continue
        word = clean_node(wxr, None, span)
        if word in ("", "/", wxr.wtp.title):
            continue
        form = {"form": word}
        if not is_anagram_row:
            # Header labels are not applied to anagram rows.
            for header_label in row_header_tags:
                if header_label in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[header_label])
                else:
                    data_append(form, "raw_tags", header_label)
        if lang == "zh-Hant":
            data_append(form, "tags", "Traditional-Chinese")
        elif lang == "zh-Hans":
            data_append(form, "tags", "Simplified-Chinese")
        forms.append(form)

    if not is_anagram_row:
        data_extend(base_data, "forms", forms)
        return

    for form in forms:
        l_data: LinkageData = {"word": form["form"]}
        for key in TAG_LITERALS_TUPLE:
            if key in form:
                l_data[key] = form[key]
        data_append(base_data, "anagrams", l_data)

4364 

4365 

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract alternative kanji spellings from a ``ja-kanjitab`` template.

    The template is converted back to wikitext and re-parsed with full
    expansion. Only tables that have a header cell whose cleaned text
    starts with "Alternative spelling" are used. In those tables, each
    ``<span>`` directly under a table cell becomes a form tagged
    ``alternative`` and ``kanji``; a following ``<small>`` element is
    treated as a qualifier for the most recently collected form.

    The collected forms are added to ``base_data["forms"]``. Link nodes
    found by ``find_child`` on the expanded root are passed through
    ``clean_node`` together with ``base_data``.
    """
    # https://en.wiktionary.org/wiki/Template:ja-kanjitab
    template_text = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(template_text, expand_all=True)

    for table in root.find_child(NodeKind.TABLE):
        # Every header cell is cleaned (no short-circuit) before deciding
        # whether this table lists alternative spellings.
        header_hits = [
            clean_node(wxr, None, header).startswith("Alternative spelling")
            for row in table.find_child(NodeKind.TABLE_ROW)
            for header in row.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if not any(header_hits):
            continue

        alt_forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for child in cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        spelling = clean_node(wxr, None, child)
                        if spelling:
                            alt_forms.append(
                                {
                                    "form": spelling,
                                    "tags": ["alternative", "kanji"],
                                }
                            )
                    elif child.tag == "small":
                        # Qualifier text is wrapped in parentheses.
                        qualifier = clean_node(wxr, None, child).strip("()")
                        if qualifier and alt_forms:
                            # Known tags go to "tags", the rest is kept
                            # verbatim under "raw_tags".
                            key = (
                                "tags"
                                if qualifier in valid_tags
                                else "raw_tags"
                            )
                            data_append(alt_forms[-1], key, qualifier)
        data_extend(base_data, "forms", alt_forms)

    # NOTE(review): presumably this lets clean_node record category links
    # into base_data - confirm against clean_node's implementation.
    for link in root.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)