Coverage for src/wiktextract/extractor/en/page.py: 79%

1834 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 06:55 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8from collections import defaultdict 

9from functools import partial 

10from typing import ( 

11 TYPE_CHECKING, 

12 Any, 

13 Iterable, 

14 Literal, 

15 Optional, 

16 Set, 

17 Union, 

18 cast, 

19) 

20 

21from mediawiki_langcodes import get_all_names, name_to_code 

22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

23from wikitextprocessor.parser import ( 

24 LEVEL_KIND_FLAGS, 

25 GeneralNode, 

26 HTMLNode, 

27 LevelNode, 

28 NodeKind, 

29 TemplateNode, 

30 WikiNode, 

31) 

32 

33from ...clean import clean_template_args, clean_value 

34from ...datautils import ( 

35 data_append, 

36 data_extend, 

37 ns_title_prefix_tuple, 

38) 

39from ...page import ( 

40 LEVEL_KINDS, 

41 clean_node, 

42 is_panel_template, 

43 recursively_extract, 

44) 

45from ...tags import valid_tags 

46from ...wxr_context import WiktextractContext 

47from ...wxr_logging import logger 

48from ..ruby import extract_ruby, parse_ruby 

49from ..share import strip_nodes 

50from .descendant import extract_descendant_section 

51from .example import extract_example_list_item, extract_template_zh_x 

52from .form_descriptions import ( 

53 classify_desc, 

54 decode_tags, 

55 distw, 

56 parse_alt_or_inflection_of, 

57 parse_sense_qualifier, 

58 parse_word_head, 

59) 

60from .inflection import TableContext, parse_inflection_section 

61from .info_templates import ( 

62 INFO_TEMPLATE_FUNCS, 

63 parse_info_template_arguments, 

64 parse_info_template_node, 

65) 

66from .linkages import ( 

67 extract_alt_form_section, 

68 parse_linkage, 

69) 

70from .parts_of_speech import PARTS_OF_SPEECH 

71from .section_titles import ( 

72 COMPOUNDS_TITLE, 

73 DESCENDANTS_TITLE, 

74 ETYMOLOGY_TITLES, 

75 IGNORED_TITLES, 

76 INFLECTION_TITLES, 

77 LINKAGE_TITLES, 

78 POS_TITLES, 

79 PRONUNCIATION_TITLE, 

80 PROTO_ROOT_DERIVED_TITLES, 

81 TRANSLATIONS_TITLE, 

82) 

83from .translations import parse_translation_item_text 

84from .type_utils import ( 

85 AttestationData, 

86 ExampleData, 

87 FormData, 

88 LinkageData, 

89 ReferenceData, 

90 SenseData, 

91 SoundData, 

92 TemplateData, 

93 WordData, 

94) 

95from .unsupported_titles import unsupported_title_map 

96 

# classify_desc() may answer "taxonomic" for a string; such text is English
# 99% of the time, so both labels count when deciding whether a string is
# 'english'.
ENGLISH_TEXTS: tuple[str, ...] = (
    "english",
    "taxonomic",
)

100 

# Part-of-speech style suffixes that may follow "<language code>-" (or
# "latin-") in the name of a head template.  The order is kept as-is because
# it is the order of the alternatives in the compiled pattern.
_HEAD_TAG_POS_NAMES: tuple[str, ...] = (
    "abbr", "adj", "adjective", "adjective form", "adjective-form",
    "adv", "adverb", "affix", "animal command", "art", "article", "aux",
    "bound pronoun", "bound-pronoun", "Buyla",
    "card num", "card-num", "cardinal", "chunom", "classifier", "clitic",
    "cls", "cmene", "cmavo", "colloq-verb", "colverbform",
    "combining form", "combining-form", "comparative", "con", "concord",
    "conj", "conjunction", "conjug", "cont", "contr", "converb",
    "daybox", "decl", "decl noun", "def", "dem", "det", "determ", "Deva",
    "ending", "entry", "form", "fuhivla", "gerund", "gismu",
    "hanja", "hantu", "hanzi", "head", "ideophone", "idiom",
    "inf", "indef", "infixed pronoun", "infixed-pronoun", "infl",
    "inflection", "initialism", "int", "interfix", "interj", "interjection",
    "jyut", "latin", "letter", "locative", "lujvo", "monthbox", "mutverb",
    "name", "nisba", "nom", "noun", "noun form", "noun-form",
    "noun plural", "noun-plural", "nounprefix", "num", "number", "numeral",
    "ord", "ordinal", "par", "part", "part form", "part-form",
    "participle", "particle", "past", "past neg", "past-neg",
    "past participle", "past-participle",
    "perfect participle", "perfect-participle",
    "personal pronoun", "personal-pronoun",
    "pref", "prefix", "phrase", "pinyin", "plural noun", "plural-noun",
    "pos", "poss-noun", "post", "postp", "postposition",
    "PP", "pp", "ppron", "pred", "predicative",
    "prep", "prep phrase", "prep-phrase", "preposition",
    "present participle", "present-participle",
    "pron", "prondem", "pronindef", "pronoun", "prop",
    "proper noun", "proper-noun",
    "proper noun form", "proper-noun form",
    "proper noun-form", "proper-noun-form",
    "prov", "proverb", "prpn", "prpr",
    "punctuation mark", "punctuation-mark",
    "regnoun", "rel", "rom", "romanji", "root", "sign", "suff", "suffix",
    "syllable", "symbol", "verb", "verb form", "verb-form",
    "verbal noun", "verbal-noun", "verbnec", "vform",
)

# Language codes known for "en", joined into one regex alternation.
_HEAD_TAG_LANG_CODES = "|".join(
    lang_code for lang_code, *_ in get_all_names("en")
)

# Matches head tag: either one of a few fixed template names, or
# "<latin|language code>-<part of speech>" followed by "-", "/", "+" or the
# end of the name.
HEAD_TAG_RE = re.compile(
    r"^(head|Han char|arabic-noun|arabic-noun-form|"
    r"hangul-symbol|syllable-hangul)$|"
    r"^(latin|"
    + _HEAD_TAG_LANG_CODES
    + r")-("
    + "|".join(_HEAD_TAG_POS_NAMES)
    + r")(-|/|\+|$)"
)

274 

# Head templates whose output is troublesome (newlines, for example); the
# template handler can squash them into an empty string while keeping their
# template data for later use.
WORD_LEVEL_HEAD_TEMPLATES = {"tlb", "term-label"}

# az-suffix-form creates a style=floatright div that is otherwise deleted.
# Left unexpanded, the template itself can be intercepted, so these names
# are added to the do-not-pre-expand set and handled in parse_part_of_speech.
FLOATING_TABLE_TEMPLATES: set[str] = {
    "az-inf-p",
    "az-suffix-forms",
    "kk-suffix-forms",
    "ky-suffix-forms",
    "tr-inf-p",
    "tr-suffix-forms",
    "tt-suffix-forms",
    "uz-suffix-forms",
}

# The following two sets name templates that are either always pre-expanded
# when the tree is *first* processed, or never pre-expanded so that the
# template stays in place with its identifying name intact for later
# filtering.

# Starts out as an independent copy of the floating-table template names.
DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set(FLOATING_TABLE_TEMPLATES)

301 

# Additional templates to be expanded in the pre-expand phase
ADDITIONAL_EXPAND_TEMPLATES: set[str] = {
    # translation tables
    "multitrans", "multitrans-nowiki",
    "trans-top", "trans-top-also", "trans-bottom",
    "checktrans-top", "checktrans-bottom",
    # column templates
    "col", "col1", "col2", "col3", "col4", "col5",
    "col1-u", "col2-u", "col3-u", "col4-u", "col5-u",
    # deprecation helpers
    "check deprecated lang param usage",
    "deprecated code",
    "no deprecated lang param usage",
    # Russian "alt-ё" templates
    "ru-verb-alt-ё", "ru-noun-alt-ё", "ru-adj-alt-ё",
    "ru-proper noun-alt-ё", "ru-pos-alt-ё", "ru-alt-ё",
    "inflection of",
    # these produce sense entries (or other lists)
    "transclude", "tcl",
}

335 

# Inverse linkage for those that have them
# XXX this is not currently used, move to post-processing
linkage_inverses: dict[str, str] = dict(
    synonyms="synonyms",
    hypernyms="hyponyms",
    hyponyms="hypernyms",
    holonyms="meronyms",
    meronyms="holonyms",
    derived="derived_from",
    coordinate_terms="coordinate_terms",
    troponyms="hypernyms",
    antonyms="antonyms",
    instances="instance_of",
    related="related",
)

351 

# Names of templates that build panels on pages; they should be ignored in
# various positions.
PANEL_TEMPLATES: set[str] = {
    "Character info", "CJKV",
    "French personal pronouns",
    "French possessive adjectives",
    "French possessive pronouns",
    "Han etym", "Japanese demonstratives", "Latn-script", "LDL",
    "MW1913Abbr", "Number-encoding", "Nuttall",
    "Spanish possessive adjectives",
    "Spanish possessive pronouns",
    "USRegionDisputed", "Webster 1913",
    "ase-rfr", "attention", "attn", "beer", "broken ref", "ca-compass",
    "character info", "character info/var", "checksense", "compass-fi",
    "copyvio suspected", "delete",
    # Currently ignore these, but could be useful in Chinese/Korean
    "dial syn",
    "etystub", "examples", "hu-corr", "hu-suff-pron", "interwiktionary",
    "ja-kanjitab", "ja-kt", "ko-hanja-search", "look",
    "maintenance box", "maintenance line", "mediagenic terms", "merge",
    "missing template", "morse links", "move", "multiple images",
    "no inline", "picdic", "picdicimg", "picdiclabel", "polyominoes",
    "predidential nomics",
    # This actually gets pre-expanded
    "punctuation",
    "reconstructed", "request box", "rf-sound example",
    "rfaccents", "rfap", "rfaspect",
    "rfc", "rfc-auto", "rfc-header", "rfc-level", "rfc-pron-n", "rfc-sense",
    "rfclarify", "rfd", "rfd-redundant", "rfd-sense", "rfdate", "rfdatek",
    "rfdef", "rfe", "rfe/dowork", "rfex", "rfexp", "rfform", "rfgender",
    "rfi", "rfinfl", "rfm", "rfm-sense", "rfp", "rfp-old",
    "rfquote", "rfquote-sense", "rfquotek", "rfref", "rfscript", "rft2",
    "rftaxon", "rftone", "rftranslit",
    "rfv", "rfv-etym", "rfv-pron", "rfv-quote", "rfv-sense",
    "selfref", "split",
    # XXX consider capturing this?
    "stroke order",
    "stub entry", "t-needed", "tbot entry", "tea room", "tea room sense",
    # "ttbc", - XXX needed in at least on/Preposition/Translation page
    "unblock", "unsupportedpage", "video frames", "was wotd", "wrongtitle",
    "zh-forms", "zh-hanzi-box", "no entry",
}

471 

# Template name prefixes used for language-specific panel templates (i.e.,
# templates that create side boxes or notice boxes or that should generally
# be ignored).
PANEL_PREFIXES: set[str] = {
    "list:compass points/",
    "list:Gregorian calendar months/",
    "RQ:",
}

# Templates used for wikipedia links.
wikipedia_templates: set[str] = {
    "wikipedia",
    "slim-wikipedia",
    "w",
    "W",
    "swp",
    "wiki",
    "Wikipedia",
    "wtorw",
}

# Import-time sanity check: warn about any Wikipedia-link template name that
# is also listed as a panel template.  Exact names live in PANEL_TEMPLATES;
# PANEL_PREFIXES only holds name prefixes, so intersecting with it alone (as
# this check used to do) could only ever catch a name identical to a prefix.
# Both the exact names and the prefixes are tested here, which still covers
# the old check.
for x in sorted(wikipedia_templates):
    if x in PANEL_TEMPLATES or x.startswith(tuple(PANEL_PREFIXES)):
        print(
            "WARNING: {!r} in both panel_templates and "
            "wikipedia_templates".format(x)
        )

498 

# Mapping from a template name (without language prefix) for the main word
# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which
# it could validly occur.  This is used as just a sanity check to give
# warnings about probably incorrect coding in Wiktionary.
template_allowed_pos_map: dict[str, list[str]] = {
    "abbr": ["abbrev"],
    "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"],
    "plural noun": ["noun", "name"],
    "plural-noun": ["noun", "name"],
    "proper noun": ["noun", "name"],
    "proper-noun": ["name", "noun"],
    "prop": ["name", "noun"],
    "verb": ["verb", "phrase"],
    "gerund": ["verb"],
    "particle": ["adv", "particle"],
    "adj": ["adj", "adj_noun"],
    "pron": ["pron", "noun"],
    "name": ["name", "noun"],
    "adv": ["adv", "intj", "conj", "particle"],
    "phrase": ["phrase", "prep_phrase"],
    "noun phrase": ["phrase"],
    "ordinal": ["num"],
    "number": ["num"],
    "pos": ["affix", "name", "num"],
    "suffix": ["suffix", "affix"],
    "character": ["character"],
    "letter": ["character"],
    "kanji": ["character"],
    "cont": ["abbrev"],
    "interj": ["intj"],
    "con": ["conj"],
    "part": ["particle"],
    "prep": ["prep", "postp"],
    "postp": ["postp"],
    "misspelling": ["noun", "adj", "verb", "adv"],
    "part-form": ["verb"],
}

# Import-time validation: every part-of-speech named above must be a key of
# PARTS_OF_SPEECH.  The error is raised explicitly rather than through
# `assert False`, because assert statements are removed under `python -O`
# and the check would then only print and carry on.
for k, v in template_allowed_pos_map.items():
    for x in v:
        if x not in PARTS_OF_SPEECH:
            raise AssertionError(
                "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}"
                "".format(x, k, v)
            )

544 

545 

# Templates skipped during etymology extraction: they never show up in the
# list of extracted etymology templates.
ignored_etymology_templates: list[str] = [
    "...", "IPAchar", "ipachar", "ISBN", "isValidPageName",
    "redlink category", "deprecated code",
    "check deprecated lang param usage",
    "para", "p",
    "cite", "Cite news", "Cite newsgroup", "cite paper", "cite MLLM 1976",
    "cite journal", "cite news/documentation", "cite paper/documentation",
    "cite video game", "cite video game/documentation",
    "cite newsgroup", "cite newsgroup/documentation",
    "cite web/documentation", "cite news",
    "Cite book", "Cite-book", "cite book", "cite web",
    "cite-usenet", "cite-video/documentation", "Cite-journal",
    "rfe", "catlangname", "cln", "langname-lite",
    "no deprecated lang param usage",
    "mention", "m", "m-self", "link", "l", "ll", "l-self",
]

# Regexp for matching ignored etymology template names: anything starting
# with "cite-", "R:" or "RQ:", or exactly one of the names listed above
# (escaped so they are matched literally).
_IGNORED_ETYMOLOGY_ALTERNATION = "|".join(
    map(re.escape, ignored_etymology_templates)
)
ignored_etymology_templates_re = re.compile(
    "^((cite-|R:|RQ:).*|" + _IGNORED_ETYMOLOGY_ALTERNATION + ")$"
)

# Regexp for matching ignored descendants template names.  Right now this
# is simply the same object as the etymology one.
ignored_descendants_templates_re = ignored_etymology_templates_re

604 

# Set of template names that are used to define usage examples.  If the
# usage example contains one of these templates, then its type is set to
# "example".
usex_templates: set[str] = {
    "afex", "affixusex",
    # {{collocation}} acts like an example template, specifically for
    # combinations of words that occur together more often than you'd
    # expect by chance; hlavní#Czech
    "co", "coi", "collocation",
    "el-example", "el-x",
    "example", "examples",
    "he-usex", "he-x",
    "hi-usex", "hi-x",
    "ja-usex-inline", "ja-usex", "ja-x",
    "jbo-example", "jbo-x",
    "km-usex", "km-x",
    "ko-usex", "ko-x",
    "lo-usex", "lo-x",
    "ne-x", "ne-usex",
    "prefixusex",
    "ryu-usex", "ryu-x",
    "shn-usex", "shn-x",
    "suffixusex",
    "th-usex", "th-x",
    "ur-usex", "ur-x",
    "usex", "usex-suffix", "ux", "uxi",
}

652 

# Template names at which head processing stops (per the variable name);
# all of them look like category/topic templates.
stop_head_at_these_templates: set[str] = {
    "c", "C", "cat", "category",
    "catlangname", "cln",
    "top", "topics",
}

663 

# Set of template names that are used to define quotation examples.  A usage
# example containing one of these templates gets the type "quotation".
quotation_templates: set[str] = {
    "collapse-quote",
    "quote", "quotei", "quotelite",
    "quote-av", "quote-book", "quote-GYLD", "quote-hansard",
    "quote-journal", "quote-mailing list", "quote-meta",
    "quote-newsgroup", "quote-song", "quote-text", "quote-us-patent",
    "quote-video game", "quote-web", "quote-wikipedia",
    "wikiquote", "Wikiquote",
    "Q",
}

690 

# Taxonomy templates; argument 1 should be the taxonomic name,
# frex. "Lupus lupus"
taxonomy_templates: set[str] = {
    "taxfmt", "taxlink", "taxlink2", "taxlinknew", "taxlook",
}

699 

# Template names; these were extracted from template_linkage_mappings,
# because the code using template_linkage_mappings was actually not used
# (but not removed).
template_linkages_to_ignore_in_examples: set[str] = {
    "syn", "synonyms", "syndiff", "synsee",
    "ant", "antonyms",
    "hyp", "hyponyms",
    "hyper", "hypernyms",
    "holo", "holonyms",
    "mero", "meronyms",
    "comeronyms", "troponyms",
    "der", "derived terms",
    "cot", "coordinate terms",
    "rel", "col",
    "alti", "inline alt forms",
    "pf", "perfectives",
    "impf", "imperfectives",
    # not linkage nor example templates
    "sense", "s",
    "color panel", "colour panel",
}

738 

# Maps template name used in a word sense to a linkage field that it adds.
sense_linkage_templates: dict[str, str] = {
    # synonyms
    "syn": "synonyms", "synonyms": "synonyms",
    "synsee": "synonyms", "syndiff": "synonyms",
    # hyponyms / antonyms
    "hyp": "hyponyms", "hyponyms": "hyponyms",
    "ant": "antonyms", "antonyms": "antonyms",
    # alternative forms are stored under "related"
    "alti": "related", "inline alt forms": "related",
    "coordinate terms": "coordinate_terms", "cot": "coordinate_terms",
    "comeronyms": "related",
    "holonyms": "holonyms", "holo": "holonyms",
    "hypernyms": "hypernyms", "hyper": "hypernyms",
    "meronyms": "meronyms", "mero": "meronyms",
    "troponyms": "troponyms",
    # aspect pairs are stored under "related"
    "perfectives": "related", "pf": "related",
    "imperfectives": "related", "impf": "related",
    # near- and para-synonyms are stored as synonyms
    "parasynonyms": "synonyms", "par": "synonyms", "parasyn": "synonyms",
    "nearsyn": "synonyms", "near-syn": "synonyms",
}

# Extra tags for linkages produced by some of the templates above, keyed by
# template name.
sense_linkage_templates_tags: dict[str, list[str]] = {
    "alti": ["alternative"],
    "inline alt forms": ["alternative"],
    "comeronyms": ["comeronym"],
    "perfectives": ["perfective"],
    "pf": ["perfective"],
    "imperfectives": ["imperfective"],
    "impf": ["imperfective"],
}

781 

782 

def decode_html_entities(v: Union[str, int]) -> str:
    """Return ``v`` as text with HTML entities turned into the characters
    they stand for.

    An integer cannot contain entity escapes (``&xx;``), so it is simply
    converted with ``str()``; a string goes through ``html.unescape``.
    """
    return str(v) if isinstance(v, int) else html.unescape(v)

793 

794 

def parse_sense_linkage(
    wxr: WiktextractContext,
    data: SenseData,
    name: str,
    ht: TemplateArgs,
    pos: str,
) -> None:
    """Parses a linkage (synonym, etc) specified in a word sense.

    ``name`` is the template name; it selects the target field through
    ``sense_linkage_templates`` and optional extra tags through
    ``sense_linkage_templates_tags``.  ``ht`` holds the template arguments:
    positional arguments 2..19 are the linked terms (processing stops at the
    first missing index), argument 1 is read as the language code when a
    Thesaurus page is looked up, and for the term in argument ``i`` the
    arguments ``q<i-1>`` and ``t<i-1>`` give an optional qualifier and an
    optional translation.  ``pos`` is passed on to the thesaurus search.

    Each accepted term is added to ``data`` under the selected field with
    ``data_append``; nothing is returned.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(name, str)
    assert isinstance(ht, dict)
    field = sense_linkage_templates[name]
    field_tags = sense_linkage_templates_tags.get(name, [])
    for i in range(2, 20):
        if i not in ht:
            break
        # Keep only the text before a "#" if the term contains one.
        w = clean_node(wxr, data, ht[i]).partition("#")[0]
        if w in ("", "<"):  # `<` used in "hypernyms" template
            continue
        # Skip separator words (only after the first term) and
        # "see also" notes.
        if (i > 2 and w in (",", "or", ";")) or w.startswith(
            ("see also", "See also")
        ):
            continue

        # A term that starts with a Thesaurus namespace prefix is not stored
        # itself; instead the terms found by search_thesaurus() for it are
        # added (unless it points back at the current page).
        is_thesaurus = False
        for alias in ns_title_prefix_tuple(wxr, "Thesaurus"):
            if not w.startswith(alias):
                continue
            is_thesaurus = True
            w = w[len(alias) :]
            if w != wxr.wtp.title:
                from ...thesaurus import search_thesaurus

                lang_code = clean_node(wxr, None, ht.get(1, ""))
                for t_data in search_thesaurus(
                    wxr.thesaurus_db_conn,  # type: ignore
                    w,
                    lang_code,
                    pos,
                    "synonyms",  # GH issue #1570
                ):
                    l_data: LinkageData = {
                        "word": t_data.term,
                        "source": "Thesaurus:" + w,
                    }
                    if len(t_data.tags) > 0:
                        l_data["tags"] = t_data.tags
                    if len(t_data.raw_tags) > 0:
                        l_data["raw_tags"] = t_data.raw_tags
                    data_append(data, field, l_data)
            break
        if is_thesaurus:
            continue

        tags: list[str] = []
        topics: list[str] = []
        english: Optional[str] = None
        # Try to find qualifiers for this synonym
        q = ht.get("q{}".format(i - 1))
        if q:
            cls = classify_desc(q)
            if cls == "tags":
                tagsets1, topics1 = decode_tags(q)
                for ts in tagsets1:
                    tags.extend(ts)
                topics.extend(topics1)
            elif cls == "english":
                # `english` is still None at this point, so there is nothing
                # to append to; the qualifier becomes the text as-is.
                english = q
        # Try to find English translation for this synonym
        t = ht.get("t{}".format(i - 1))
        if t:
            english = (english + "; " + t) if english else t

        # See if the linkage contains a parenthesized alt
        alt = None
        m = re.search(r"\(([^)]+)\)$", w)
        if m:
            w = w[: m.start()].strip()
            alt = m.group(1)

        dt = {"word": w}
        if field_tags:
            data_extend(dt, "tags", field_tags)
        if tags:
            data_extend(dt, "tags", tags)
        if topics:
            data_extend(dt, "topics", topics)
        if english:
            dt["english"] = english  # DEPRECATED for "translation"
            dt["translation"] = english
        if alt:
            dt["alt"] = alt
        data_append(data, field, dt)

896 

897 

# One or more horizontal-bar / em-dash characters with optional surrounding
# whitespace: the separator between the parts of an example line.
EXAMPLE_SPLITTERS = r"\s*[―—]+\s*"
example_splitter_re = re.compile(EXAMPLE_SPLITTERS)
# Same separator inside a capturing group, so that re.split() keeps it.
captured_splitters_re = re.compile("({})".format(EXAMPLE_SPLITTERS))


def synch_splits_with_args(
    line: str, targs: TemplateArgs
) -> Optional[list[str]]:
    """Re-split an example line using the template arguments as a guide.

    For lines whose naive split looks wrong: the separators occurring inside
    the original-language argument (``2``) and inside the translation
    argument (``3``, ``translation`` or ``t``) are counted, and that many
    separators of ``line`` are kept inside the first and the second part.

    Returns ``[original, translation, *remaining parts]``, or ``None`` when
    either of the first two parts comes out empty.
    """

    def separator_count(text: str) -> int:
        return len(example_splitter_re.findall(text))

    # The separators are captured, so the pieces alternate:
    # ["First", " — ", "second", " — ", "third..."]
    pieces = captured_splitters_re.split(line)

    # Original-language text: one piece, plus a (separator, text) pair for
    # every separator found in the argument itself.
    source_end = 1 + 2 * separator_count(targs.get(2, ""))
    source_text = "".join(pieces[:source_end])

    # Translation text.  +2 = +1 to step over the "expected" separator
    # between the two parts, +1 for the first piece as above.
    tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "")
    translation_end = source_end + 2 + 2 * separator_count(tr_arg)
    translation_text = "".join(pieces[source_end + 1 : translation_end])

    if not (source_text and translation_text):
        return None
    # Everything after that is taken piece by piece, skipping separators.
    return [source_text, translation_text, *pieces[translation_end + 1 :: 2]]

930 

931 

# Leading parenthesized qualifier, with at most one level of nested
# parentheses inside it:
# (...): ... or (...(...)...): ...
QUALIFIERS = (
    r"^\("  # opening parenthesis at the very start
    r"(([^()]|\([^()]*\))*)"  # group 1: the text between the parentheses
    r"\):?\s*"  # closing parenthesis, optional colon, then whitespace
)
QUALIFIERS_RE = re.compile(QUALIFIERS)

935 

936 

937def parse_language( 

938 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

939) -> list[WordData]: 

940 """Iterates over the text of the page, returning words (parts-of-speech) 

941 defined on the page one at a time. (Individual word senses for the 

942 same part-of-speech are typically encoded in the same entry.)""" 

943 # imported here to avoid circular import 

944 from .pronunciation import parse_pronunciation 

945 

946 assert isinstance(wxr, WiktextractContext) 

947 assert isinstance(langnode, WikiNode) 

948 assert isinstance(language, str) 

949 assert isinstance(lang_code, str) 

950 # print("parse_language", language) 

951 

952 is_reconstruction = False 

953 word: str = wxr.wtp.title # type: ignore[assignment] 

954 unsupported_prefix = "Unsupported titles/" 

955 if word.startswith(unsupported_prefix): 

956 w = word[len(unsupported_prefix) :] 

957 if w in unsupported_title_map: 957 ↛ 960line 957 didn't jump to line 960 because the condition on line 957 was always true

958 word = unsupported_title_map[w] 

959 else: 

960 wxr.wtp.error( 

961 "Unimplemented unsupported title: {}".format(word), 

962 sortid="page/870", 

963 ) 

964 word = w 

965 elif word.startswith("Reconstruction:"): 

966 word = word[word.find("/") + 1 :] 

967 is_reconstruction = True 

968 

969 base_data: WordData = { 

970 "word": word, 

971 "lang": language, 

972 "lang_code": lang_code, 

973 } 

974 if is_reconstruction: 

975 data_append(base_data, "tags", "reconstruction") 

976 sense_data: SenseData = {} 

977 pos_data: WordData = {} # For a current part-of-speech 

978 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

979 etym_data: WordData = {} # For one etymology 

980 sense_datas: list[SenseData] = [] 

981 sense_ordinal = 0 # The recursive sense parsing messes up the ordering 

982 # Never reset, do not use as data 

983 level_four_datas: list[WordData] = [] 

984 etym_datas: list[WordData] = [] 

985 page_datas: list[WordData] = [] 

986 have_etym = False 

987 inside_level_four = False # This is for checking if the etymology section 

988 # or article has a Pronunciation section, for Chinese mostly; because 

989 # Chinese articles can have three level three sections (two etymology 

990 # sections and pronunciation sections) one after another, we need a kludge 

991 # to better keep track of whether we're in a normal "etym" or inside a 

992 # "level four" (which is what we've turned the level three Pron sections 

993 # into in the fix_subtitle_hierarchy(); all other sections are demoted by 

994 # a step. 

995 stack: list[str] = [] # names of items on the "stack" 

996 

997 def merge_base(data: WordData, base: WordData) -> None: 

998 for k, v in base.items(): 

999 # Copy the value to ensure that we don't share lists or 

1000 # dicts between structures (even nested ones). 

1001 v = copy.deepcopy(v) 

1002 if k not in data: 

1003 # The list was copied above, so this will not create shared ref 

1004 data[k] = v # type: ignore[literal-required] 

1005 continue 

1006 if data[k] == v: # type: ignore[literal-required] 

1007 continue 

1008 if ( 1008 ↛ 1016line 1008 didn't jump to line 1016 because the condition on line 1008 was always true

1009 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

1010 or isinstance( 

1011 v, 

1012 (list, tuple), # Should this be "and"? 

1013 ) 

1014 ): 

1015 data[k] = list(data[k]) + list(v) # type: ignore 

1016 elif data[k] != v: # type: ignore[literal-required] 

1017 wxr.wtp.warning( 

1018 "conflicting values for {} in merge_base: " 

1019 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1020 sortid="page/904", 

1021 ) 

1022 

1023 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1024 """Remove unnecessary keys from dict values 

1025 in a list comprehension...""" 

1026 if key in pron: 

1027 pron.pop(key) # type: ignore 

1028 return pron 

1029 

1030 def sound_matches_pos(sound: SoundData, pos: str) -> bool: 

1031 if "pos" not in sound: 

1032 return True 

1033 sound_pos = sound["pos"] # type: ignore[typeddict-item] 

1034 return pos in sound_pos 

1035 

1036 def strip_sound_pos(sound: SoundData) -> SoundData: 

1037 complementary_pop(sound, "pos") 

1038 return sound 

1039 

1040 # If the result has sounds, eliminate sounds that have a prefix that 

1041 # does not match "word" or one of "forms" 

1042 if "sounds" in data and "word" in data: 

1043 accepted = [data["word"]] 

1044 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1045 data["sounds"] = list( 

1046 s 

1047 for s in data["sounds"] 

1048 if "form" not in s or s["form"] in accepted 

1049 ) 

1050 # If the result has sounds, eliminate sounds that have a pos that 

1051 # does not match "pos" 

1052 if "sounds" in data and "pos" in data: 

1053 data["sounds"] = list( 

1054 strip_sound_pos(s) 

1055 for s in data["sounds"] 

1056 # "pos" is not a field of SoundData, correctly, so we're 

1057 # removing it here. It's a kludge on a kludge on a kludge. 

1058 if sound_matches_pos(s, data["pos"]) 

1059 ) 

1060 elif "sounds" in data: 1060 ↛ 1061line 1060 didn't jump to line 1061 because the condition on line 1060 was never true

1061 data["sounds"] = [strip_sound_pos(s) for s in data["sounds"]] 

1062 

1063 def push_sense(sorting_ordinal: int | None = None) -> bool: 

1064 """Starts collecting data for a new word sense. This returns True 

1065 if a sense was added.""" 

1066 nonlocal sense_data 

1067 if sorting_ordinal is None: 

1068 sorting_ordinal = sense_ordinal 

1069 tags = sense_data.get("tags", ()) 

1070 if ( 

1071 not sense_data.get("glosses") 

1072 and "translation-hub" not in tags 

1073 and "no-gloss" not in tags 

1074 ): 

1075 return False 

1076 

1077 if ( 1077 ↛ 1087line 1077 didn't jump to line 1087 because the condition on line 1077 was never true

1078 ( 

1079 "participle" in sense_data.get("tags", ()) 

1080 or "infinitive" in sense_data.get("tags", ()) 

1081 ) 

1082 and "alt_of" not in sense_data 

1083 and "form_of" not in sense_data 

1084 and "etymology_text" in etym_data 

1085 and etym_data["etymology_text"] != "" 

1086 ): 

1087 etym = etym_data["etymology_text"] 

1088 etym = etym.split(". ")[0] 

1089 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1090 if ret is not None: 

1091 tags, lst = ret 

1092 assert isinstance(lst, (list, tuple)) 

1093 if "form-of" in tags: 

1094 data_extend(sense_data, "form_of", lst) 

1095 data_extend(sense_data, "tags", tags) 

1096 elif "alt-of" in tags: 

1097 data_extend(sense_data, "alt_of", lst) 

1098 data_extend(sense_data, "tags", tags) 

1099 

1100 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get( 1100 ↛ 1103line 1100 didn't jump to line 1103 because the condition on line 1100 was never true

1101 "tags", () 

1102 ): 

1103 data_append(sense_data, "tags", "no-gloss") 

1104 

1105 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore 

1106 sense_datas.append(sense_data) 

1107 sense_data = {} 

1108 return True 

1109 

1110 def push_pos(sorting_ordinal: int | None = None) -> None: 

1111 """Starts collecting data for a new part-of-speech.""" 

1112 nonlocal pos_data 

1113 nonlocal sense_datas 

1114 push_sense(sorting_ordinal) 

1115 if wxr.wtp.subsection: 

1116 data: WordData = {"senses": sense_datas} 

1117 merge_base(data, pos_data) 

1118 level_four_datas.append(data) 

1119 pos_data = {} 

1120 sense_datas = [] 

1121 wxr.wtp.start_subsection(None) 

1122 

1123 def push_level_four_section(clear_sound_data: bool) -> None: 

1124 """Starts collecting data for a new level four sections, which 

1125 is usually virtual and empty, unless the article has Chinese 

1126 'Pronunciation' sections that are etymology-section-like but 

1127 under etymology, and at the same level in the source. We modify 

1128 the source to demote Pronunciation sections like that to level 

1129 4, and other sections one step lower.""" 

1130 nonlocal level_four_data 

1131 nonlocal level_four_datas 

1132 nonlocal etym_datas 

1133 push_pos() 

1134 # print(f"======\n{etym_data=}") 

1135 # print(f"======\n{etym_datas=}") 

1136 # print(f"======\n{level_four_data=}") 

1137 # print(f"======\n{level_four_datas=}") 

1138 for data in level_four_datas: 

1139 merge_base(data, level_four_data) 

1140 etym_datas.append(data) 

1141 for data in etym_datas: 

1142 merge_base(data, etym_data) 

1143 page_datas.append(data) 

1144 if clear_sound_data: 

1145 level_four_data = {} 

1146 level_four_datas = [] 

1147 etym_datas = [] 

1148 

1149 def push_etym() -> None: 

1150 """Starts collecting data for a new etymology.""" 

1151 nonlocal etym_data 

1152 nonlocal etym_datas 

1153 nonlocal have_etym 

1154 nonlocal inside_level_four 

1155 have_etym = True 

1156 push_level_four_section(False) 

1157 inside_level_four = False 

1158 # etymology section could under pronunciation section 

1159 etym_data = ( 

1160 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {} 

1161 ) 

1162 

def select_data() -> WordData:
    """Return the dict that newly parsed data should be stored in.

    Checked in order: ``pos_data`` when a subsection (part-of-speech) is
    active, ``level_four_data`` when inside a level-four section,
    ``base_data`` when the top of ``stack`` is the language itself, and
    ``etym_data`` otherwise.
    """
    if wxr.wtp.subsection is not None:
        target = pos_data
    elif inside_level_four:
        target = level_four_data
    elif stack[-1] == language:
        target = base_data
    else:
        target = etym_data
    return target

1175 

def parse_part_of_speech(posnode: WikiNode, pos: str) -> None:
    """Parses the subsection for a part-of-speech under a language on
    a page.

    Steps, in order:

    1. Floating-table templates and ``file:`` links are extracted from
       ``posnode`` and handed to ``parse_inflection``.
    2. The remaining children are split into parallel lists ``pre`` (head
       material) and ``lists`` (sense lists following each head).
    3. Empty or whitespace-only pairs are dropped, and a list without a
       head is attached to the most recent kept head that had no list
       (wiktextract issue #314).
    4. Each head is processed with ``process_gloss_header`` and each list
       item with ``parse_sense_node``; if there are no lists at all,
       ``process_gloss_without_list`` is used instead.
    5. If no sense was produced, a dummy "no-gloss" sense carrying the
       head tags and topics is pushed.  Finally senses are sorted by their
       temporary ordinal, which is then removed.

    Results are written into the enclosing scope's ``pos_data``,
    ``sense_data`` and ``sense_datas``; nothing is returned.
    """
    assert isinstance(posnode, WikiNode)
    assert isinstance(pos, str)
    pos_data["pos"] = pos
    pre: list[list[Union[str, WikiNode]]] = [[]]  # list of lists
    lists: list[list[WikiNode]] = [[]]  # list of lists
    first_para = True
    first_head_tmplt = True
    collecting_head = True
    start_of_paragraph = True

    # XXX extract templates from posnode with recursively_extract
    # that break stuff, like ja-kanji or az-suffix-form.
    # Do the extraction with a list of template names, combined from
    # different lists, then separate out them into different lists
    # that are handled at different points of the POS section.
    # First, extract az-suffix-form, put it in `inflection`,
    # and parse `inflection`'s content when appropriate later.
    # The contents of az-suffix-form (and ja-kanji) that generate
    # divs with "floatright" in their style gets deleted by
    # clean_value, so templates that slip through from here won't
    # break anything.
    floaters, poschildren = recursively_extract(
        posnode.children,
        lambda x: (
            isinstance(x, WikiNode)
            and (
                (
                    isinstance(x, TemplateNode)
                    and x.template_name in FLOATING_TABLE_TEMPLATES
                )
                or (
                    x.kind == NodeKind.LINK
                    # Need to check for stringiness because some links are
                    # broken; for example, if a template is missing an
                    # argument, a link might look like `[[{{{1}}}...]]`
                    and len(x.largs) > 0
                    and len(x.largs[0]) > 0
                    and isinstance(x.largs[0][0], str)
                    and x.largs[0][0].lower().startswith("file:")  # type:ignore[union-attr]
                )
            )
        ),
    )
    # Wrap the extracted floaters in a synthetic section node so that they
    # can be parsed like an inflection section.
    tempnode = WikiNode(NodeKind.LEVEL6, 0)
    tempnode.largs = [["Inflection"]]
    tempnode.children = floaters
    parse_inflection(tempnode, "Floating Div", pos)

    if not poschildren:
        if not floaters:
            wxr.wtp.debug(
                "PoS section without contents",
                sortid="en/page/1051/20230612",
            )
        else:
            wxr.wtp.debug(
                "PoS section without contents except for a floating table",
                sortid="en/page/1056/20230612",
            )
        return

    for node in poschildren:
        if isinstance(node, str):
            for m in re.finditer(r"\n+|[^\n]+", node):
                p = m.group(0)
                if p.startswith("\n\n") and pre:
                    # A blank line ends the first paragraph; the rest of
                    # this string is not collected.
                    first_para = False
                    start_of_paragraph = True
                    break
                if p and collecting_head:
                    pre[-1].append(p)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if kind == NodeKind.LIST:
            lists[-1].append(node)
            collecting_head = False
            start_of_paragraph = True
            continue
        elif kind in LEVEL_KINDS:
            # Stop parsing section if encountering any kind of
            # level header (like ===Noun=== or ====Further Reading====).
            # At a quick glance, this should be the default behavior,
            # but if some kinds of source articles have sub-sub-sections
            # that should be parsed XXX it should be handled by changing
            # this break.
            break
        elif collecting_head and kind == NodeKind.LINK:
            # We might collect relevant links as they are often pictures
            # relating to the word
            if len(node.largs[0]) >= 1 and isinstance(
                node.largs[0][0], str
            ):
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "Category")
                ):
                    # [[Category:...]]
                    # We're at the end of the file, probably, so stop
                    # here. Otherwise the head will get garbage.
                    break
                if node.largs[0][0].startswith(
                    ns_title_prefix_tuple(wxr, "File")
                ):
                    # Skips file links
                    continue
            start_of_paragraph = False
            pre[-1].append(node)
        elif kind == NodeKind.HTML:
            if node.sarg == "br":
                if pre[-1]:
                    pre.append([])  # Switch to next head
                    lists.append([])  # Lists parallels pre
                    collecting_head = True
                    start_of_paragraph = True
            elif collecting_head and node.sarg not in (
                "gallery",
                "ref",
                "cite",
                "caption",
            ):
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                start_of_paragraph = False
        elif isinstance(node, TemplateNode):
            # XXX Insert code here that disambiguates between
            # templates that generate word heads and templates
            # that don't.

            # ignore {{category}}, {{cat}}... etc.
            if node.template_name in stop_head_at_these_templates:
                # we've reached a template that should be at the end,
                continue

            # skip these templates; panel_templates is already used
            # to skip certain templates else, but it also applies to
            # head parsing quite well.
            if is_panel_template(wxr, node.template_name):
                continue

            if first_head_tmplt and pre[-1]:
                first_head_tmplt = False
                start_of_paragraph = False
                pre[-1].append(node)
            elif pre[-1] and start_of_paragraph:
                pre.append([])  # Switch to the next head
                lists.append([])  # lists parallel pre
                collecting_head = True
                start_of_paragraph = False
                pre[-1].append(node)
            else:
                pre[-1].append(node)
        elif first_para:
            start_of_paragraph = False
            if collecting_head:
                pre[-1].append(node)
    # XXX use template_fn in clean_node to check that the head macro
    # is compatible with the current part-of-speech and generate warning
    # if not.  Use template_allowed_pos_map.

    # Clean up empty pairs, and fix messes with extra newlines that
    # separate templates that are followed by lists wiktextract issue #314

    cleaned_pre: list[list[Union[str, WikiNode]]] = []
    cleaned_lists: list[list[WikiNode]] = []
    pairless_pre_index = None

    for pre1, ls in zip(pre, lists):
        if not pre1 and not ls:
            # skip [] + []
            continue
        if not ls and all(
            (isinstance(x, str) and not x.strip()) for x in pre1
        ):
            # skip ["\n", " "] + []
            continue
        if ls and not pre1:
            if pairless_pre_index is not None:
                # Attach this headless list to the earlier listless head.
                cleaned_lists[pairless_pre_index] = ls
                pairless_pre_index = None
                continue
        if pre1 and not ls:
            # Remember a head without a list only once we know it is kept:
            # recording it before the skip checks above could leave an
            # index that was never appended, which then raised IndexError
            # or overwrote the list of an unrelated head.
            pairless_pre_index = len(cleaned_pre)
        cleaned_pre.append(pre1)
        cleaned_lists.append(ls)

    pre = cleaned_pre
    lists = cleaned_lists

    there_are_many_heads = len(pre) > 1
    header_tags: list[str] = []
    header_topics: list[str] = []
    previous_head_had_list = False

    if not any(g for g in lists):
        process_gloss_without_list(
            poschildren, pos, pos_data, header_tags, header_topics
        )
    else:
        for i, (pre1, ls) in enumerate(zip(pre, lists)):
            # NOTE: `node` in the diagnostics below is whatever the most
            # recently executed loop over nodes left behind (the scan of
            # poschildren above, or the inner loop over list children
            # further down); it is only used to add detail to messages.
            if all(not sl for sl in lists[i:]):
                if i == 0:
                    # elif: exactly one message per head (a string node
                    # used to get the generic message as well).
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "first head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1689/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        if (
                            node.largs
                            and node.largs[0]
                            and node.largs[0][0]
                            in [
                                "Han char",
                            ]
                        ):
                            # just ignore these templates
                            pass
                        else:
                            wxr.wtp.debug(
                                "first head without "
                                "list of senses, "
                                "template node "
                                "{}, {}/{}".format(
                                    node.largs, word, language
                                ),
                                sortid="page/1694/20221215",
                            )
                    else:
                        wxr.wtp.debug(
                            "first head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1700/20221215",
                        )
                    # no break here so that the first head always
                    # gets processed.
                else:
                    if isinstance(node, str):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "string: '{}[...]', {}/{}".format(
                                node[:20], word, language
                            ),
                            sortid="page/1708/20221215",
                        )
                    elif isinstance(node, WikiNode):
                        wxr.wtp.debug(
                            "later head without list of senses,"
                            "template node "
                            "{}, {}/{}".format(
                                node.sarg if node.sarg else node.largs,
                                word,
                                language,
                            ),
                            sortid="page/1713/20221215",
                        )
                    else:
                        wxr.wtp.debug(
                            "later head without list of senses, "
                            "{}/{}".format(word, language),
                            sortid="page/1719/20221215",
                        )
                    break
            head_group = i + 1 if there_are_many_heads else None

            if previous_head_had_list:
                # We use a boolean flag here because we want to be able
                # let the header_tags data pass through after the loop
                # is over without accidentally emptying it, if there are
                # no pos_datas and we need a dummy data.
                header_tags.clear()
                header_topics.clear()

            process_gloss_header(
                pre1, pos, head_group, pos_data, header_tags, header_topics
            )
            for ln in ls:
                # Parse each list associated with this head.
                for node in ln.children:
                    # Parse nodes in l.children recursively.
                    # The recursion function uses push_sense() to
                    # add stuff into sense_datas, and returns True or
                    # False if something is added, which bubbles upward.
                    # If the bubble is "True", then higher levels of
                    # the recursion will not push_sense(), because
                    # the data is already pushed into a sub-gloss
                    # downstream, unless the higher level has examples
                    # that need to be put somewhere.
                    common_data: SenseData = {
                        "tags": list(header_tags),
                        "topics": list(header_topics),
                    }
                    if head_group:
                        common_data["head_nr"] = head_group
                    parse_sense_node(node, common_data, pos)  # type: ignore[arg-type]

            if len(ls) > 0:
                previous_head_had_list = True
            else:
                previous_head_had_list = False

    # If there are no senses extracted, add a dummy sense.  We want to
    # keep tags extracted from the head for the dummy sense.
    push_sense()  # Make sure unfinished data pushed, and start clean sense
    if len(sense_datas) == 0:
        data_extend(sense_data, "tags", header_tags)
        data_extend(sense_data, "topics", header_topics)
        data_append(sense_data, "tags", "no-gloss")
        push_sense()

    sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0))  # type: ignore

    # The ordinal was only needed for sorting; drop it from the output.
    for sd in sense_datas:
        if "__temp_sense_sorting_ordinal" in sd:
            del sd["__temp_sense_sorting_ordinal"]  # type: ignore

1517 

# Head templates listed in WORD_LEVEL_HEAD_TEMPLATES, recorded by
# head_post_template_fn() below.
term_label_templates: list[TemplateData] = []
# Label templates (lb/lbl/label) seen in heads; the placeholder text that
# head_post_template_fn() returns for them carries an index into this list.
normal_label_templates: list[TemplateData] = []

def head_post_template_fn(
    name: str, ht: TemplateArgs, expansion: str
) -> Optional[str]:
    """Post-expansion hook for templates in the head section of a word.

    The head section is the text after the part-of-speech subtitle and
    before the word sense list.  Typically it generates the bold line for
    the word, but it may also contain other useful information that often
    ends up in side boxes; some of that is captured here.

    Returns "" to drop the template's expansion, a replacement string to
    substitute for it, or None to leave the expansion as it is.
    """
    # Panel templates are ignored completely (not even recorded in
    # head_templates).
    if is_panel_template(wxr, name):
        return ""

    if name == "head":
        # XXX are these also captured in forms?  Should this special case
        # be removed?
        head_kind = ht.get(2, "")
        if head_kind == "pinyin":
            data_append(pos_data, "tags", "Pinyin")
        elif head_kind == "romanization":
            data_append(pos_data, "tags", "romanization")

    word_level = name in WORD_LEVEL_HEAD_TEMPLATES
    if HEAD_TAG_RE.search(name) is not None or word_level:
        head_record: TemplateData = {
            "name": name,
            "args": clean_template_args(wxr, ht),
            "expansion": clean_node(wxr, None, expansion),
        }
        data_append(pos_data, "head_templates", head_record)
        if word_level:
            term_label_templates.append(head_record)
            # Squash these, their tags are applied to the whole word,
            # and some cause problems like "term-label"
            return ""

    # The following are both captured in head_templates and parsed
    # separately

    if name in wikipedia_templates:
        # Note: various places expect to have content from wikipedia
        # templates, so cannot convert this to empty
        parse_wikipedia_template(wxr, pos_data, ht)
        return None

    # Templates whose expansion is simply dropped from the head.
    # XXX several of these could be extracted instead; "cardinalbox" can
    # also occur at top level under a language.
    if name in (
        "number box",
        "enum",
        "cardinalbox",
        "Han simplified forms",
        "picdic",
        "picdicimg",
        "picdiclabel",
        "defdate",
    ):
        return ""

    if name in ("lb", "lbl", "label"):
        label_record: TemplateData = {
            "name": name,
            "args": clean_template_args(wxr, ht),
            "expansion": clean_node(wxr, None, expansion).strip("()"),
        }
        normal_label_templates.append(label_record)
        label_index = len(normal_label_templates) - 1
        # The parens around __LABEL... below are meaningful: label
        # templates generate text with parens, so the magical phrase looks
        # like a normal label that is handled as parenthetical text; the
        # label's contents are only looked up when that text is handled.
        return f"(__LABEL_TEMPLATE_{label_index}__)"

    return None

1615 

def process_gloss_header(
    header_nodes: list[Union[WikiNode, str]],
    pos_type: str,
    header_group: Optional[int],
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Parse the head nodes of one head of a part-of-speech section.

    ``header_nodes`` are first filtered through
    ``parse_info_template_node``; collected info-template data is stored
    in ``pos_data["info_templates"]``.  For Japanese (``lang_code ==
    "ja"``) ruby annotations are extracted from the expanded head; for
    Vietnamese ("vi") ``vi-readings`` templates are converted to
    ``related`` entries.  The cleaned head text is then passed to
    ``parse_word_head``.

    ``pos_data`` is updated in place.  Tags that end up in
    ``pos_data["tags"]`` are moved into ``header_tags``; tags and topics
    decoded from ``term_label_templates`` are appended to ``header_tags``
    and ``header_topics``.  Nothing is returned; if the cleaned head text
    is empty the function returns before ``parse_word_head`` is called.
    """
    ruby = []

    # process template parse nodes here
    new_nodes = []
    info_template_data = []
    for node in header_nodes:
        info_data, info_out = parse_info_template_node(wxr, node, "head")
        if info_data or info_out:
            if info_data:
                info_template_data.append(info_data)
            if info_out:  # including just the original node
                new_nodes.append(info_out)
        else:
            new_nodes.append(node)
    header_nodes = new_nodes

    if info_template_data:
        if "info_templates" not in pos_data:
            pos_data["info_templates"] = info_template_data
        else:
            pos_data["info_templates"].extend(info_template_data)

    if lang_code == "ja":
        exp = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(header_nodes), expand_all=True
        )
        rub, _ = recursively_extract(
            exp.children,
            lambda x: (
                isinstance(x, WikiNode)
                and x.kind == NodeKind.HTML
                and x.sarg == "ruby"
            ),
        )
        if rub is not None:
            for r in rub:
                if TYPE_CHECKING:
                    # we know the lambda above in recursively_extract
                    # returns only WikiNodes in rub
                    assert isinstance(r, WikiNode)
                rt = parse_ruby(wxr, r)
                if rt is not None:
                    ruby.append(rt)
    elif lang_code == "vi":
        # Handle vi-readings templates that have a weird structures for
        # Chu Nom vietnamese characters heads
        # https://en.wiktionary.org/wiki/Template:vi-readings
        new_header_nodes = []
        related_readings: list[LinkageData] = []
        for node in header_nodes:
            if (
                isinstance(node, TemplateNode)
                and node.template_name == "vi-readings"
            ):
                # (A leftover debugging print of the template parameters
                # used to be here; it wrote to stdout from library code.)
                for parameter, tag in (
                    ("hanviet", "han-viet-reading"),
                    ("nom", "nom-reading"),
                    # we ignore the fanqie parameter "phienthiet"
                ):
                    arg = node.template_parameters.get(parameter)
                    if arg is not None:
                        text = clean_node(wxr, None, arg)
                        for w in text.split(","):
                            # ignore - separated references
                            if "-" in w:
                                w = w[: w.index("-")]
                            w = w.strip()
                            related_readings.append(
                                LinkageData(word=w, tags=[tag])
                            )
                # Skip the vi-reading template for the rest of the head
                # parsing
                continue

            new_header_nodes.append(node)
        if len(related_readings) > 0:
            data_extend(pos_data, "related", related_readings)
            # The filtered node list only replaces the head nodes when
            # readings were actually extracted.
            header_nodes = new_header_nodes

    header_text = clean_node(
        wxr,
        pos_data,
        header_nodes,
        post_template_fn=head_post_template_fn,
        collect_links=True,
        remove_anchors_from_links=True,
    )
    if "links" in pos_data:
        # WordData doesn't use `links`, so we can use `collect_links=True`
        # above without special handling and smuggle link data.
        extracted_links = pos_data["links"]  # type: ignore
        del pos_data["links"]  # type: ignore
    else:
        extracted_links = None

    header_text = re.sub(r"\s+", " ", header_text).strip()

    if not header_text:
        return

    term_label_tags: list[str] = []
    term_label_topics: list[str] = []
    if len(term_label_templates) > 0:
        # parse term label templates; if there are other similar kinds
        # of templates in headers that you want to squash and apply as
        # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES
        for templ_data in term_label_templates:
            expan = templ_data.get("expansion", "").strip("().,; ")
            if not expan:
                continue
            tlb_tagsets, tlb_topics = decode_tags(expan)
            for tlb_tags in tlb_tagsets:
                # Tag sets containing an "error-..." tag are discarded.
                if len(tlb_tags) > 0 and not any(
                    t.startswith("error-") for t in tlb_tags
                ):
                    term_label_tags.extend(tlb_tags)
            term_label_topics.extend(tlb_topics)

    parse_word_head(
        wxr,
        word,
        pos_type,
        header_text,
        pos_data,
        is_reconstruction,
        header_group,
        header_nodes,
        ruby=ruby,
        links=extracted_links,
        label_templates=normal_label_templates,
    )
    if "tags" in pos_data:
        # pos_data can get "tags" data from some source; type-checkers
        # doesn't like it, so let's ignore it.
        header_tags.extend(pos_data["tags"])  # type: ignore[typeddict-item]
        del pos_data["tags"]  # type: ignore[typeddict-item]
    if len(term_label_tags) > 0:
        header_tags.extend(term_label_tags)
    if len(term_label_topics) > 0:
        header_topics.extend(term_label_topics)

1769 

def process_gloss_without_list(
    nodes: list[Union[WikiNode, str]],
    pos_type: str,
    pos_data: WordData,
    header_tags: list[str],
    header_topics: list[str],
) -> None:
    """Handle a part-of-speech section whose gloss text is not in a list.

    ``nodes`` (after ``strip_nodes``) are split into head nodes and gloss
    nodes: "head" templates and templates whose name starts with
    ``"<lang_code>-"`` count as head; the soft-redirect templates
    zh-see/ja-see/ja-see-kango are dropped; scanning stops at the first
    level (heading) node.  Everything else is gloss.  The head nodes go
    to ``process_gloss_header`` and the gloss nodes to
    ``process_gloss_contents``.
    """
    soft_redirects = ("zh-see", "ja-see", "ja-see-kango")
    lang_prefix = f"{lang_code}-"
    header_nodes: list[Union[str, WikiNode]] = []
    gloss_nodes: list[Union[str, WikiNode]] = []

    for item in strip_nodes(nodes):
        if isinstance(item, WikiNode):
            if not isinstance(item, TemplateNode):
                if item.kind in LEVEL_KINDS:
                    # following nodes are not gloss
                    break
            else:
                template_name = item.template_name
                if template_name in soft_redirects:
                    continue  # soft redirect
                if template_name == "head" or template_name.startswith(
                    lang_prefix
                ):
                    header_nodes.append(item)
                    continue
        gloss_nodes.append(item)

    if header_nodes:
        process_gloss_header(
            header_nodes,
            pos_type,
            None,
            pos_data,
            header_tags,
            header_topics,
        )
    if gloss_nodes:
        # The header call above may have filled header_tags/header_topics;
        # copies are passed on as the base data of the sense.
        sense_base = {
            "tags": list(header_tags),
            "topics": list(header_topics),
        }
        process_gloss_contents(gloss_nodes, pos_type, sense_base)

1814 

1815 def parse_sense_node( 

1816 node: Union[str, WikiNode], # never receives str 

1817 sense_base: SenseData, 

1818 pos: str, 

1819 ) -> bool: 

1820 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1821 Uses push_sense() to attempt adding data to pos_data in the scope 

1822 of parse_language() when it reaches deep in the recursion. push_sense() 

1823 returns True if it succeeds, and that is bubbled up the stack; if 

1824 a sense was added downstream, the higher levels (whose shared data 

1825 was already added by a subsense) do not push_sense(), unless it 

1826 has examples that need to be put somewhere. 

1827 """ 

1828 assert isinstance(sense_base, dict) # Added to every sense deeper in 

1829 

1830 nonlocal sense_ordinal 

1831 my_ordinal = sense_ordinal # copies, not a reference 

1832 sense_ordinal += 1 # only use for sorting 

1833 

1834 if not isinstance(node, WikiNode): 1834 ↛ 1836line 1834 didn't jump to line 1836 because the condition on line 1834 was never true

1835 # This doesn't seem to ever happen in practice. 

1836 wxr.wtp.debug( 

1837 "{}: parse_sense_node called with" 

1838 "something that isn't a WikiNode".format(pos), 

1839 sortid="page/1287/20230119", 

1840 ) 

1841 return False 

1842 

1843 if node.kind != NodeKind.LIST_ITEM: 1843 ↛ 1844line 1843 didn't jump to line 1844 because the condition on line 1843 was never true

1844 wxr.wtp.debug( 

1845 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1846 ) 

1847 return False 

1848 

1849 if node.sarg == ":": 

1850 # Skip example entries at the highest level, ones without 

1851 # a sense ("...#") above them. 

1852 # If node.sarg is exactly and only ":", then it's at 

1853 # the highest level; lower levels would have more 

1854 # "indentation", like "#:" or "##:" 

1855 return False 

1856 

1857 # If a recursion call succeeds in push_sense(), bubble it up with 

1858 # `added`. 

1859 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1860 added = False 

1861 

1862 gloss_template_args: set[str] = set() 

1863 

1864 # For LISTs and LIST_ITEMS, their argument is something like 

1865 # "##" or "##:", and using that we can rudimentally determine 

1866 # list 'depth' if need be, and also what kind of list or 

1867 # entry it is; # is for normal glosses, : for examples (indent) 

1868 # and * is used for quotations on wiktionary. 

1869 current_depth = node.sarg 

1870 

1871 children = node.children 

1872 

1873 # subentries, (presumably) a list 

1874 # of subglosses below this. The list's 

1875 # argument ends with #, and its depth should 

1876 # be bigger than parent node. 

1877 subentries = [ 

1878 x 

1879 for x in children 

1880 if isinstance(x, WikiNode) 

1881 and x.kind == NodeKind.LIST 

1882 and x.sarg == current_depth + "#" 

1883 ] 

1884 

1885 # sublists of examples and quotations. .sarg 

1886 # does not end with "#". 

1887 others = [ 

1888 x 

1889 for x in children 

1890 if isinstance(x, WikiNode) 

1891 and x.kind == NodeKind.LIST 

1892 and x.sarg != current_depth + "#" 

1893 ] 

1894 

1895 # the actual contents of this particular node. 

1896 # can be a gloss (or a template that expands into 

1897 # many glosses which we can't easily pre-expand) 

1898 # or could be an "outer gloss" with more specific 

1899 # subglosses, or could be a qualfier for the subglosses. 

1900 contents = [ 

1901 x 

1902 for x in children 

1903 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1904 ] 

1905 # If this entry has sublists of entries, we should combine 

1906 # gloss information from both the "outer" and sublist content. 

1907 # Sometimes the outer gloss 

1908 # is more non-gloss or tags, sometimes it is a coarse sense 

1909 # and the inner glosses are more specific. The outer one 

1910 # does not seem to have qualifiers. 

1911 

1912 # If we have one sublist with one element, treat it 

1913 # specially as it may be a Wiktionary error; raise 

1914 # that nested element to the same level. 

1915 # XXX If need be, this block can be easily removed in 

1916 # the current recursive logicand the result is one sense entry 

1917 # with both glosses in the glosses list, as you would 

1918 # expect. If the higher entry has examples, there will 

1919 # be a higher entry with some duplicated data. 

1920 if len(subentries) == 1: 

1921 slc = subentries[0].children 

1922 if len(slc) == 1: 

1923 # copy current node and modify it so it doesn't 

1924 # loop infinitely. 

1925 cropped_node = copy.copy(node) 

1926 cropped_node.children = [ 

1927 x 

1928 for x in children 

1929 if not ( 

1930 isinstance(x, WikiNode) 

1931 and x.kind == NodeKind.LIST 

1932 and x.sarg == current_depth + "#" 

1933 ) 

1934 ] 

1935 added |= parse_sense_node(cropped_node, sense_base, pos) 

1936 nonlocal sense_data # this kludge causes duplicated raw_ 

1937 # glosses data if this is not done; 

1938 # if the top-level (cropped_node) 

1939 # does not push_sense() properly or 

1940 # parse_sense_node() returns early, 

1941 # sense_data is not reset. This happens 

1942 # for example when you have a no-gloss 

1943 # string like "(intransitive)": 

1944 # no gloss, push_sense() returns early 

1945 # and sense_data has duplicate data with 

1946 # sense_base 

1947 sense_data = {} 

1948 added |= parse_sense_node(slc[0], sense_base, pos) 

1949 return added 

1950 

1951 return process_gloss_contents( 

1952 contents, 

1953 pos, 

1954 sense_base, 

1955 subentries, 

1956 others, 

1957 gloss_template_args, 

1958 added, 

1959 my_ordinal, 

1960 ) 

1961 

def process_gloss_contents(
    contents: list[Union[str, WikiNode]],
    pos: str,
    sense_base: SenseData,
    subentries: Optional[list[WikiNode]] = None,
    others: Optional[list[WikiNode]] = None,
    gloss_template_args: Optional[Set[str]] = None,
    added: bool = False,
    sorting_ordinal: int | None = None,
) -> bool:
    """Turn the non-list contents of one gloss list item into senses.

    The contents are cleaned into a raw gloss string (collecting tags,
    categories and template data into `sense_base` on the way).  An outer
    gloss is stored in `sense_base`, every sub-list in `subentries` is
    handed to parse_sense_node() with a deep copy of `sense_base`, and
    finally one sense is pushed with push_sense() for each remaining
    gloss line.

    `contents` are the children of the list item that are not lists,
    `subentries` the nested gloss lists, and `others` the remaining
    nested lists (examples, quotations, ...).  `gloss_template_args`
    collects link texts and template argument values; they are passed on
    to parse_alt_or_inflection_of().  `added` is the "a sense was already
    pushed" state inherited from the caller, and `sorting_ordinal` is
    forwarded unchanged to push_sense().

    Returns True if at least one sense was pushed (or `added` was already
    true), False otherwise.

    Relies on names from the enclosing scope: wxr, sense_data,
    push_sense, parse_sense_node, extract_examples, word and lang_code.
    """
    # Never share containers between calls: the argument set is mutated
    # below, so a module-lifetime default object would leak link texts
    # from one gloss into the next.  Containers passed in by the caller
    # are used as-is (and mutated in place, as before).
    sub_lists: list[WikiNode] = [] if subentries is None else subentries
    other_lists: list[WikiNode] = [] if others is None else others
    template_args: Set[str] = (
        set() if gloss_template_args is None else gloss_template_args
    )

    def sense_template_fn(
        name: str, ht: TemplateArgs, is_gloss: bool = False
    ) -> Optional[str]:
        """Template callback used while cleaning gloss-related nodes.

        Returning a string replaces the template expansion with that
        string; returning None lets the template expand normally.  Side
        data (info templates, attestations, sense ids, linkages, tags) is
        stored in `sense_base`.
        """
        # print(f"sense_template_fn: {name}, {ht}")
        if name in wikipedia_templates:
            # parse_wikipedia_template(wxr, pos_data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        if name in INFO_TEMPLATE_FUNCS:
            info_data, info_exp = parse_info_template_arguments(
                wxr, name, ht, "sense"
            )
            if info_data or info_exp:
                if info_data:
                    data_append(sense_base, "info_templates", info_data)
                if info_exp and isinstance(info_exp, str):
                    return info_exp
                return ""
        if name in ("defdate",):
            date = clean_node(wxr, None, ht.get(1, ()))
            if part_two := ht.get(2):
                # Joined with a Unicode dash character, not ASCII '-'
                date += "–" + clean_node(wxr, None, part_two)
            refs: dict[str, ReferenceData] = {}
            # ref, refn, ref2, ref2n, ref3, ref3n
            # ref1 not valid
            for k, v in sorted(
                (k, v) for k, v in ht.items() if isinstance(k, str)
            ):
                if m := re.match(r"ref(\d?)(n?)", k):
                    ref_v = clean_node(wxr, None, v)
                    if m.group(1) not in refs:  # empty string or digit
                        refs[m.group(1)] = ReferenceData()
                    if m.group(2):
                        refs[m.group(1)]["refn"] = ref_v
                    else:
                        refs[m.group(1)]["text"] = ref_v
            data_append(
                sense_base,
                "attestations",
                AttestationData(date=date, references=list(refs.values())),
            )
            return ""
        if name == "senseid":
            langid = clean_node(wxr, None, ht.get(1, ()))
            arg = clean_node(wxr, sense_base, ht.get(2, ()))
            if re.match(r"Q\d+$", arg):
                data_append(sense_base, "wikidata", arg)
            data_append(sense_base, "senseid", langid + ":" + arg)
            # No return here: control falls through to the generic
            # handling below, exactly as before.
        if name in sense_linkage_templates:
            # print(f"SENSE_TEMPLATE_FN: {name}")
            parse_sense_linkage(wxr, sense_base, name, ht, pos)
            return ""
        if name == "†" or name == "zh-obsolete":
            data_append(sense_base, "tags", "obsolete")
            return ""
        if name in {
            "ux",
            "uxi",
            "usex",
            "afex",
            "prefixusex",
            "ko-usex",
            "ko-x",
            "hi-x",
            "ja-usex-inline",
            "ja-x",
            "quotei",
            "he-x",
            "km-x",
            "ne-x",
            "shn-x",
            "th-x",
            "ur-x",
        }:
            # Usage examples are captured separately below.  We don't
            # want to expand them into glosses even when unusual coding
            # is used in the entry.
            # These templates may slip through inside another item, but
            # currently we're separating out example entries (..#:)
            # well enough that there seems to very little contamination.
            if is_gloss:
                wxr.wtp.wiki_notice(
                    "Example template is used for gloss text",
                    sortid="extractor.en.page.sense_template_fn/1415",
                )
            else:
                return ""
        if name == "w":
            if ht.get(2) == "Wp":
                return ""
        # Remember plain argument values (no HTML) of every other template
        for v in ht.values():
            v = v.strip()
            if v and "<" not in v:
                template_args.add(v)
        return None

    def extract_link_texts(item: GeneralNode) -> None:
        """Recursively extracts link texts from the gloss source.  This
        information is used to select whether to remove final "." from
        form_of/alt_of (e.g., ihm/Hunsrik)."""
        if isinstance(item, (list, tuple)):
            for x in item:
                extract_link_texts(x)
            return
        if isinstance(item, str):
            # There seem to be HTML sections that may futher contain
            # unparsed links.
            for m in re.finditer(r"\[\[([^]]*)\]\]", item):
                v = m.group(1).split("|")[-1].strip()
                if v:
                    template_args.add(v)
            return
        if not isinstance(item, WikiNode):
            return
        if item.kind == NodeKind.LINK:
            v = item.largs[-1]
            if (
                isinstance(v, list)
                and len(v) == 1
                and isinstance(v[0], str)
            ):
                template_args.add(v[0].strip())
        for x in item.children:
            extract_link_texts(x)

    extract_link_texts(contents)

    # get the raw text of non-list contents of this node, and other stuff
    # like tag and category data added to sense_base
    # cast = no-op type-setter for the type-checker
    partial_template_fn = cast(
        TemplateFnCallable,
        partial(sense_template_fn, is_gloss=True),
    )
    rawgloss = clean_node(
        wxr,
        sense_base,
        contents,
        template_fn=partial_template_fn,
        collect_links=True,
    )

    if not rawgloss:
        return False

    # remove manually typed ordered list text at the start("1. ")
    rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip()

    # get stuff like synonyms and categories from "others",
    # maybe examples and quotations
    clean_node(wxr, sense_base, other_lists, template_fn=sense_template_fn)

    # The gloss could contain templates that produce more list items.
    # This happens commonly with, e.g., {{inflection of|...}}.  Split
    # to parts.  However, e.g. Interlingua generates multiple glosses
    # in HTML directly without Wikitext markup, so we must also split
    # by just newlines.
    subglosses = rawgloss.splitlines()

    if len(subglosses) == 0:
        return False

    if any(s.startswith("#") for s in subglosses):
        # A template expanded into list markup: re-parse the expansion
        # and sort its top-level children the same way the caller did.
        subtree = wxr.wtp.parse(rawgloss)
        # from wikitextprocessor.parser import print_tree
        # print("SUBTREE GENERATED BY TEMPLATE:")
        # print_tree(subtree)
        new_subentries = [
            x
            for x in subtree.children
            if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
        ]

        new_others = [
            x
            for x in subtree.children
            if isinstance(x, WikiNode)
            and x.kind == NodeKind.LIST
            and not x.sarg.endswith("#")
        ]

        new_contents = [
            clean_node(wxr, [], x)
            for x in subtree.children
            if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
        ]

        # Lists found in the original markup take precedence
        sub_lists = sub_lists or new_subentries
        other_lists = other_lists or new_others
        subglosses = new_contents
        rawgloss = "".join(subglosses)

    # Generate no gloss for translation hub pages, but add the
    # "translation-hub" tag for them
    if rawgloss == "(This entry is a translation hub.)":
        data_append(sense_data, "tags", "translation-hub")
        return push_sense(sorting_ordinal)

    # Remove certain substrings specific to outer glosses
    strip_ends = [", particularly:"]
    for x in strip_ends:
        if rawgloss.endswith(x):
            rawgloss = rawgloss[: -len(x)].strip()
            break

    # A single gloss, or possibly an outer gloss.
    # Check if the possible outer gloss starts with
    # parenthesized tags/topics

    if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()):
        data_append(sense_base, "raw_glosses", subglosses[0].strip())
        m = QUALIFIERS_RE.match(rawgloss)
        # (...): ... or (...(...)...): ...
        if m:
            q = m.group(1)
            rawgloss = rawgloss[m.end() :].strip()
            parse_sense_qualifier(wxr, q, sense_base)
        if rawgloss == "A pejorative:":
            data_append(sense_base, "tags", "pejorative")
            rawgloss = ""
        elif rawgloss == "Short forms.":
            data_append(sense_base, "tags", "abbreviation")
            rawgloss = ""
        elif rawgloss == "Technical or specialized senses.":
            rawgloss = ""
        elif rawgloss.startswith("inflection of "):
            parsed = parse_alt_or_inflection_of(wxr, rawgloss, set())
            if parsed is not None:
                tags, origins = parsed
                if origins is not None:
                    data_extend(sense_base, "form_of", origins)
                if tags is not None:
                    data_extend(sense_base, "tags", tags)
                else:
                    data_append(sense_base, "tags", "form-of")
            else:
                data_append(sense_base, "tags", "form-of")
        if rawgloss:
            # Code duplicating a lot of clean-up operations from later in
            # this block.  We want to clean up the "supergloss" as much as
            # possible, in almost the same way as a normal gloss.
            supergloss = rawgloss

            if supergloss.startswith("; "):
                supergloss = supergloss[1:].strip()

            # Strip exactly the marker that matched; a bare "†" is only
            # one character long.
            if supergloss.startswith("^†"):
                data_append(sense_base, "tags", "obsolete")
                supergloss = supergloss[2:].strip()
            elif supergloss.startswith("†"):
                data_append(sense_base, "tags", "obsolete")
                supergloss = supergloss[1:].strip()
            elif supergloss.startswith("^‡"):
                data_extend(sense_base, "tags", ["obsolete", "historical"])
                supergloss = supergloss[2:].strip()

            # remove [14th century...] style brackets at the end
            supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss)

            if supergloss.startswith((",", ":")):
                supergloss = supergloss[1:]
            supergloss = supergloss.strip()
            if supergloss.startswith("N. of "):
                # Same expansion as for ordinary glosses further down
                supergloss = "Name of " + supergloss[6:]
            data_append(sense_base, "glosses", supergloss)
            if supergloss in ("A person:",):
                data_append(sense_base, "tags", "g-person")

    # The main recursive call (except for the exceptions at the
    # start of this function).
    for sublist in sub_lists:
        if not (
            isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST
        ):
            wxr.wtp.debug(
                f"'{repr(rawgloss[:20])}.' gloss has `subentries`"
                f"with items that are not LISTs",
                sortid="page/1511/20230119",
            )
            continue
        for item in sublist.children:
            if not (
                isinstance(item, WikiNode)
                and item.kind == NodeKind.LIST_ITEM
            ):
                continue
            # copy sense_base to prevent cross-contamination between
            # subglosses and other subglosses and superglosses
            sense_base2 = copy.deepcopy(sense_base)
            if parse_sense_node(item, sense_base2, pos):
                added = True

    # Capture examples.
    # This is called after the recursive calls above so that
    # sense_base is not contaminated with meta-data from
    # example entries for *this* gloss.
    examples = []
    if wxr.config.capture_examples:
        examples = extract_examples(other_lists, sense_base)

    # push_sense() succeeded somewhere down-river, so skip this level
    if added:
        if examples:
            # this higher-up gloss has examples that we do not want to skip
            wxr.wtp.debug(
                "'{}[...]' gloss has examples we want to keep, "
                "but there are subglosses.".format(repr(rawgloss[:30])),
                sortid="page/1498/20230118",
            )
        else:
            return True

    # Some entries, e.g., "iacebam", have weird sentences in quotes
    # after the gloss, but these sentences don't seem to be intended
    # as glosses.  Skip them.
    indexed_subglosses = [
        (i, gl)
        for i, gl in enumerate(subglosses)
        if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl)
    ]

    if len(indexed_subglosses) > 1 and "form_of" not in sense_base:
        gl = indexed_subglosses[0][1].strip()
        if gl.endswith(":"):
            gl = gl[:-1].strip()
        parsed = parse_alt_or_inflection_of(wxr, gl, template_args)
        if parsed is not None:
            infl_tags, infl_dts = parsed
            if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1:
                # Interpret others as a particular form under
                # "inflection of"
                data_extend(sense_base, "tags", infl_tags)
                data_extend(sense_base, "form_of", infl_dts)
                indexed_subglosses = indexed_subglosses[1:]
            elif not infl_dts:
                data_extend(sense_base, "tags", infl_tags)
                indexed_subglosses = indexed_subglosses[1:]

    # Create senses for remaining subglosses
    for i, (gloss_i, gloss) in enumerate(indexed_subglosses):
        gloss = gloss.strip()
        if not gloss and len(indexed_subglosses) > 1:
            continue
        # Push a new sense (if the last one is not empty)
        if push_sense(sorting_ordinal):
            added = True
        # if gloss not in sense_data.get("raw_glosses", ()):
        #     data_append(sense_data, "raw_glosses", gloss)
        if i == 0 and examples:
            # In a multi-line gloss, associate examples
            # with only one of them.
            # XXX or you could use gloss_i == len(indexed_subglosses)
            # to associate examples with the *last* one.
            data_extend(sense_data, "examples", examples)
        if gloss.startswith("; ") and gloss_i > 0:
            gloss = gloss[1:].strip()
        # If the gloss starts with †, mark as obsolete
        if gloss.startswith("^†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[2:].strip()
        elif gloss.startswith("^‡"):
            data_extend(sense_data, "tags", ["obsolete", "historical"])
            gloss = gloss[2:].strip()
        # Copy data for all senses to this sense
        for k, v in sense_base.items():
            if isinstance(v, (list, tuple)):
                if k != "tags":
                    # Tags handled below (countable/uncountable special)
                    data_extend(sense_data, k, v)
            else:
                assert k not in ("tags", "categories", "topics")
                sense_data[k] = v  # type:ignore[literal-required]
        # Parse the gloss for this particular sense
        m = QUALIFIERS_RE.match(gloss)
        # (...): ... or (...(...)...): ...
        if m:
            parse_sense_qualifier(wxr, m.group(1), sense_data)
            gloss = gloss[m.end() :].strip()

        # Remove common suffix "[from 14th c.]" and similar
        gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss)

        # Check to make sure we don't have unhandled list items in gloss
        ofs = max(gloss.find("#"), gloss.find("* "))
        if ofs > 10 and "(#)" not in gloss:
            wxr.wtp.debug(
                "gloss may contain unhandled list items: {}".format(gloss),
                sortid="page/1412",
            )
        elif "\n" in gloss:
            wxr.wtp.debug(
                "gloss contains newline: {}".format(gloss),
                sortid="page/1416",
            )

        # Kludge, some glosses have a comma after initial qualifiers in
        # parentheses
        if gloss.startswith((",", ":")):
            gloss = gloss[1:]
        gloss = gloss.strip()
        if gloss.endswith(":"):
            gloss = gloss[:-1].strip()
        if gloss.startswith("N. of "):
            gloss = "Name of " + gloss[6:]
        if gloss.startswith("†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[1:]
        elif gloss.startswith("^†"):
            data_append(sense_data, "tags", "obsolete")
            gloss = gloss[2:]

        # Copy tags from sense_base if any.  This will not copy
        # countable/uncountable if either was specified in the sense,
        # as sometimes both are specified in word head but only one
        # in individual senses.
        countability_tags = []
        base_tags = sense_base.get("tags", ())
        sense_tags = sense_data.get("tags", ())
        for tag in base_tags:
            if tag in ("countable", "uncountable"):
                if tag not in countability_tags:
                    countability_tags.append(tag)
                continue
            if tag not in sense_tags:
                data_append(sense_data, "tags", tag)
        if countability_tags:
            if (
                "countable" not in sense_tags
                and "uncountable" not in sense_tags
            ):
                data_extend(sense_data, "tags", countability_tags)

        # If outer gloss specifies a form-of ("inflection of", see
        # aquamarine/German), try to parse the inner glosses as
        # tags for an inflected form.
        if "form-of" in sense_base.get("tags", ()):
            parsed = parse_alt_or_inflection_of(wxr, gloss, template_args)
            if parsed is not None:
                infl_tags, infl_dts = parsed
                if not infl_dts and infl_tags:
                    # Interpret as a particular form under "inflection of"
                    data_extend(sense_data, "tags", infl_tags)

        if not gloss:
            data_append(sense_data, "tags", "empty-gloss")
        elif gloss != "-" and gloss not in sense_data.get("glosses", []):
            if (
                gloss_i == 0
                and len(sense_data.get("glosses", tuple())) >= 1
            ):
                # If we added a "high-level gloss" from rawgloss, but this
                # is that same gloss_i, add this instead of the raw_gloss
                # from before if they're different: the rawgloss was not
                # cleaned exactly the same as this later gloss
                sense_data["glosses"][-1] = gloss
            else:
                # Add the gloss for the sense.
                data_append(sense_data, "glosses", gloss)

        # Kludge: there are cases (e.g., etc./Swedish) where there are
        # two abbreviations in the same sense, both generated by the
        # {{abbreviation of|...}} template.  Handle these with some magic.
        position = 0
        split_glosses = []
        for m in re.finditer(r"Abbreviation of ", gloss):
            if m.start() != position:
                split_glosses.append(gloss[position : m.start()])
                position = m.start()
        split_glosses.append(gloss[position:])
        for part in split_glosses:
            # Check if this gloss describes an alt-of or inflection-of
            if (
                lang_code != "en"
                and " " not in part
                and distw([word], part) < 0.3
            ):
                # Don't try to parse gloss if it is one word
                # that is close to the word itself for non-English words
                # (probable translations of a tag/form name)
                continue
            parsed = parse_alt_or_inflection_of(wxr, part, template_args)
            if parsed is None:
                continue
            tags, dts = parsed
            if not dts:
                # Nothing to link to (dts may be None or empty): keep
                # whatever tags were recognised and move on.
                if tags:
                    data_extend(sense_data, "tags", tags)
                continue
            ftags = [tag for tag in tags if tag != "form-of"]
            for dt in dts:  # type:ignore[union-attr]
                if "alt-of" in tags:
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "alt_of", dt)
                elif "compound-of" in tags:
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "compound_of", dt)
                elif "synonym-of" in tags:
                    data_extend(dt, "tags", ftags)
                    data_append(sense_data, "synonyms", dt)
                elif tags and dt.get("word", "").startswith("of "):
                    dt["word"] = dt["word"][3:]
                    data_append(sense_data, "tags", "form-of")
                    data_extend(sense_data, "tags", ftags)
                    data_append(sense_data, "form_of", dt)
                elif "form-of" in tags:
                    data_extend(sense_data, "tags", tags)
                    data_append(sense_data, "form_of", dt)

    if len(sense_data) == 0:
        # Nothing was collected for a sense of its own: fall back to the
        # base data, dropping an empty tag list first.
        if "tags" in sense_base and len(sense_base["tags"]) == 0:
            del sense_base["tags"]
        sense_data.update(sense_base)
    if push_sense(sorting_ordinal):
        # push_sense succeded in adding a sense to pos_data
        added = True
        # print("PARSE_SENSE DONE:", pos_datas[-1])
    return added

2493 

def parse_inflection(
    node: WikiNode, section: str, pos: Optional[str]
) -> None:
    """Parses inflection data (declension, conjugation) from the given
    page.  This retrieves the actual inflection template
    parameters, which are very useful for applications that need
    to learn the inflection classes and generate inflected
    forms.

    `node` is the section node, `section` its title and `pos` the
    current part-of-speech.  When `pos` is None a debug message is
    logged and nothing is parsed.  Results are stored in `pos_data`
    from the enclosing scope: captured template calls under
    "inflection_templates", and whatever parse_inflection_section()
    extracts from the expanded tables."""
    assert isinstance(node, WikiNode)
    assert isinstance(section, str)
    assert pos is None or isinstance(pos, str)
    # print("parse_inflection:", node)

    if pos is None:
        wxr.wtp.debug(
            "inflection table outside part-of-speech", sortid="page/1812"
        )
        return

    def inflection_template_fn(
        name: str, ht: TemplateArgs
    ) -> Optional[str]:
        """Record inflection-like templates while the text is expanded.

        Panel templates expand to nothing; every other template expands
        normally (None is returned)."""
        # print("decl_conj_template_fn", name, ht)
        if is_panel_template(wxr, name):
            return ""
        if name in ("is-u-mutation",):
            # These are not to be captured as an exception to the
            # generic code below
            return None
        name_match = re.search(
            r"-(conj|decl|ndecl|adecl|infl|conjugation|"
            r"declension|inflection|mut|mutation)($|-)",
            name,
        )
        if name_match:
            args_ht = clean_template_args(wxr, ht)
            dt = {"name": name, "args": args_ht}
            data_append(pos_data, "inflection_templates", dt)

        return None

    # Convert the subtree back to Wikitext, then expand all and parse,
    # capturing templates in the process
    text = wxr.wtp.node_to_wikitext(node.children)

    # Split text into separate sections for each to-level template
    brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text)
    # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"]
    # The (?:...) creates a non-capturing regex group; if it was capturing,
    # like the group around it, it would create elements in brace_matches,
    # including None if it doesn't match.
    # 20250114: Added {| and |} into the regex because tables were being
    # cut into pieces by this code.  Issue #973, introduction of two-part
    # book-end templates similar to trans-top and tran-bottom.
    template_sections: list[list[str]] = []
    template_nesting = 0  # depth of SINGLE BRACES { { nesting } }
    # Because there is the possibility of triple curly braces
    # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not
    # count nesting depth using pairs of two brackets, but
    # instead use singular braces ("{ }"): the depth changes by the
    # length of each brace run.  A run such as "}}}}" that closes two
    # templates at once is then accounted for correctly.
    # Because template delimiters should be balanced, regardless
    # of whether {{ or {{{ is used, and because we only care
    # about the outer-most delimiters (the highest level template)
    # we can just count the single braces when those single
    # braces are part of a group.
    table_nesting = 0
    # However, if we have a stray table ({| ... |}) that should always
    # be its own section, and should prevent templates from cutting it
    # into sections.

    # print(f"Parse inflection: {text=}")
    # print(f"Brace matches: {repr('///'.join(brace_matches))}")
    if len(brace_matches) > 1:
        tsection: list[str] = []
        # kludge to keep any text before first template with the first
        # template; otherwise, text goes with preceding template
        after_templates = False
        for piece in brace_matches:
            if piece.startswith("\n; ") and after_templates:
                after_templates = False
                template_sections.append(tsection)
                tsection = [piece]
            elif piece.startswith("{{") or piece.endswith("{|"):
                if (
                    template_nesting == 0
                    and after_templates
                    and table_nesting == 0
                ):
                    # start new section
                    template_sections.append(tsection)
                    tsection = []
                after_templates = True
                if piece.startswith("{{"):
                    template_nesting += len(piece)
                else:
                    # piece.endswith("{|")
                    table_nesting += 1
                tsection.append(piece)
            elif piece.startswith("}}") or piece.endswith("|}"):
                if piece.startswith("}}"):
                    template_nesting -= len(piece)
                    if template_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested braces, "
                            "couldn't split inflection templates, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/1871",
                        )
                        template_sections = []  # use whole text
                        break
                else:
                    table_nesting -= 1
                    if table_nesting < 0:
                        wxr.wtp.error(
                            "Negatively nested table braces, "
                            "couldn't split inflection section, "
                            "{}/{} section {}".format(
                                word, language, section
                            ),
                            sortid="page/20250114",
                        )
                        template_sections = []  # use whole text
                        break
                tsection.append(piece)
            else:
                tsection.append(piece)
        else:
            # Only reached when the loop was not aborted: after a
            # nesting error template_sections must stay empty so that
            # the whole text is used below.
            if tsection:  # dangling tsection
                template_sections.append(tsection)
            # Why do it this way around?  The parser has a preference
            # to associate bits outside of tables with the preceding
            # table (`after`-variable), so a new tsection begins
            # at {{ and everything before it belongs to the previous
            # template.

    if template_sections:
        texts = ["".join(parts) for parts in template_sections]
    else:
        texts = [text]
    if template_nesting != 0:
        wxr.wtp.error(
            "Template nesting error: "
            "template_nesting = {} "
            "couldn't split inflection templates, "
            "{}/{} section {}".format(
                template_nesting, word, language, section
            ),
            sortid="page/1896",
        )
        texts = [text]

    for section_text in texts:
        # Nothing to expand or parse in an empty chunk
        if not section_text.strip():
            continue

        # Expanding also triggers inflection_template_fn, so this must
        # run even when inflection tables themselves are not captured.
        tree = wxr.wtp.parse(
            section_text,
            expand_all=True,
            template_fn=inflection_template_fn,
        )

        # Parse inflection tables from the section.  The data is stored
        # under "forms".
        if wxr.config.capture_inflections:
            tablecontext = None
            first_template = re.search(r"{{([^}{|]+)\|?", section_text)
            if first_template:
                template_name = first_template.group(1).strip()
                tablecontext = TableContext(template_name)

            parse_inflection_section(
                wxr,
                pos_data,
                word,
                language,
                pos,
                section,
                tree,
                tablecontext=tablecontext,
            )

2677 

def get_subpage_section(
    title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]]
) -> Optional[Union[WikiNode, str]]:
    """Load a subpage of the current word and locate a section in it.

    The subpage ``word + "/" + subtitle`` is fetched and parsed, and each
    entry of ``seqs`` is tried in order.  An entry is a path of section
    titles (compared case-insensitively) that must be matched by nested
    level nodes, outermost first.  This is used for finding translations
    and other sections on subpages.

    ``title`` is only used in the debug message that is logged for each
    path that cannot be found.

    Returns the node for the first path in ``seqs`` that matches, or None
    if the subpage does not exist or none of the paths match (this
    includes an empty ``seqs``).
    """
    assert isinstance(language, str)
    assert isinstance(title, str)
    assert isinstance(subtitle, str)
    assert isinstance(seqs, (list, tuple))
    for seq in seqs:
        for x in seq:
            assert isinstance(x, str)
    subpage_title = word + "/" + subtitle
    subpage_content = wxr.wtp.get_page_body(subpage_title, 0)
    if subpage_content is None:
        wxr.wtp.error(
            "/translations not found despite "
            "{{see translation subpage|...}}",
            sortid="page/1934",
        )
        return None

    def recurse(
        node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]]
    ) -> Optional[Union[str, WikiNode]]:
        """Depth-first search below ``node`` for the title path ``seq``."""
        if not seq:
            # Nothing left to match: this node is the answer.
            return node
        if not isinstance(node, WikiNode):
            return None
        if node.kind in LEVEL_KINDS:
            t = clean_node(wxr, None, node.largs[0])
            if t.lower() == seq[0].lower():
                # This heading consumes the first title of the path; the
                # remainder must be found among its descendants.
                seq = seq[1:]
                if not seq:
                    return node
        for n in node.children:
            ret = recurse(n, seq)
            if ret is not None:
                return ret
        return None

    tree = wxr.wtp.parse(
        subpage_content,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )
    assert tree.kind == NodeKind.ROOT
    # Treat ``seqs`` as an ordered list of fallbacks: the first path that
    # resolves wins, and every path that fails is reported.
    for seq in seqs:
        ret = recurse(tree, seq)
        if ret is not None:
            return ret
        wxr.wtp.debug(
            "Failed to find subpage section {}/{} seq {}".format(
                title, subtitle, seq
            ),
            sortid="page/1963",
        )
    return None

2740 

def parse_translations(data: WordData, xlatnode: WikiNode) -> None:
    """Parse the translations found below ``xlatnode`` into ``data``.

    This may also pull in translations from a separate translations
    subpage, in which case the function calls itself on the section node
    found on that subpage.  Nothing is done when
    ``wxr.config.capture_translations`` is false.
    """
    assert isinstance(data, dict)
    assert isinstance(xlatnode, WikiNode)
    if not wxr.config.capture_translations:
        return
    # Strings and nodes seen ahead of the translation items.  They are
    # turned into ``sense`` lazily, by the first translation item that
    # finds ``sense`` unset.
    sense_parts: list[Union[WikiNode, str]] = []
    # Sense (gloss) that the following translation items belong to;
    # None means "recompute from ``sense_parts``".
    sense: Optional[str] = None

    def parse_translation_item(
        contents: list[Union[WikiNode, str]], lang: Optional[str] = None
    ) -> None:
        """Parse one translation list item, then recurse into its sublists.

        ``lang`` is the language carried over from the enclosing list
        item, if any; it is passed on to parse_translation_item_text().
        """
        nonlocal sense
        assert isinstance(contents, list)
        assert lang is None or isinstance(lang, str)

        langcode: Optional[str] = None
        if sense is None:
            sense = clean_node(wxr, data, sense_parts).strip()
            # NOTE(review): ``idx > 0`` leaves a sense that *starts* with
            # the phrase untouched; confirm whether that is intended.
            idx = sense.find("See also translations at")
            if idx > 0:
                wxr.wtp.debug(
                    "Skipping translation see also: {}".format(sense),
                    sortid="page/2361",
                )
                sense = sense[:idx].strip()
            if sense.endswith(":"):
                sense = sense[:-1].strip()
            if sense.endswith("—"):
                sense = sense[:-1].strip()
        translations_from_template: list[str] = []

        def translation_item_template_fn(
            name: str, ht: TemplateArgs
        ) -> Optional[str]:
            """Record language code and translation from item templates."""
            nonlocal langcode
            if is_panel_template(wxr, name):
                return ""
            if name in ("t+check", "t-check", "t-needed"):
                # We ignore these templates.  They seem to have outright
                # garbage in some entries, and very varying formatting in
                # others.  These should be transitory and unreliable
                # anyway.
                return "__IGNORE__"
            if name in ("t", "t+", "t-simple", "tt", "tt+"):
                code = ht.get(1)
                if code:
                    if langcode and code != langcode:
                        wxr.wtp.debug(
                            "inconsistent language codes {} vs "
                            "{} in translation item: {!r} {}".format(
                                langcode, code, name, ht
                            ),
                            sortid="page/2386",
                        )
                    langcode = code
                tr = ht.get(2)
                if tr:
                    tr = clean_node(wxr, None, [tr])
                    translations_from_template.append(tr)
                return None
            if name == "t-egy":
                langcode = "egy"
                return None
            if name == "ttbc":
                code = ht.get(1)
                if code:
                    langcode = code
                return None
            if name == "trans-see":
                wxr.wtp.error(
                    "UNIMPLEMENTED trans-see template", sortid="page/2405"
                )
                return ""
            if name.endswith("-top"):
                return ""
            if name.endswith("-bottom"):
                return ""
            if name.endswith("-mid"):
                return ""
            return None

        # Nested lists are handled separately after the item's own text.
        sublists = [
            x
            for x in contents
            if isinstance(x, WikiNode) and x.kind == NodeKind.LIST
        ]
        contents = [
            x
            for x in contents
            if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST
        ]

        item = clean_node(
            wxr, data, contents, template_fn=translation_item_template_fn
        )

        # Parse the translation item.
        if item:
            lang = parse_translation_item_text(
                wxr,
                word,
                data,
                item,
                sense,
                lang,
                langcode,
                translations_from_template,
                is_reconstruction,
            )

            # Handle sublists.  They are frequently used for different
            # scripts for the language and different variants of the
            # language.  We will include the lower-level header as a
            # tag in those cases.
            for listnode in sublists:
                assert listnode.kind == NodeKind.LIST
                for node in listnode.children:
                    if not isinstance(node, WikiNode):
                        continue
                    if node.kind == NodeKind.LIST_ITEM:
                        parse_translation_item(node.children, lang=lang)

    def parse_translation_template(node: WikiNode) -> None:
        """Expand a template in the translations section for its effects.

        The expansion result is discarded; ``template_fn`` resets or sets
        the current sense and follows ``see translation subpage``.
        """
        assert isinstance(node, WikiNode)

        def template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
            nonlocal sense_parts
            nonlocal sense
            if is_panel_template(wxr, name):
                return ""
            if name == "see also":
                # XXX capture
                # XXX for example, "/" has top-level list containing
                # see also items.  So also should parse those.
                return ""
            if name == "trans-see":
                # XXX capture
                return ""
            if name == "see translation subpage":
                sense_parts = []
                sense = None
                sub = ht.get(1, "")
                if sub:
                    m = re.match(
                        r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub
                    )
                else:
                    m = None
                etym = ""
                etym_numbered = ""
                pos = ""
                if m:
                    etym_numbered = m.group(1)
                    etym = m.group(2)
                    pos = m.group(3)
                if not sub:
                    wxr.wtp.debug(
                        "no part-of-speech in "
                        "{{see translation subpage|...}}, "
                        "defaulting to just wxr.wtp.section "
                        "(= language)",
                        sortid="page/2468",
                    )
                    # seq sent to get_subpage_section without sub and pos
                    seq = [
                        language,
                        TRANSLATIONS_TITLE,
                    ]
                elif (
                    m
                    and etym.lower().strip() in ETYMOLOGY_TITLES
                    and pos.lower() in POS_TITLES
                ):
                    seq = [
                        language,
                        etym_numbered,
                        pos,
                        TRANSLATIONS_TITLE,
                    ]
                elif sub.lower() in POS_TITLES:
                    # seq with sub but not pos
                    seq = [
                        language,
                        sub,
                        TRANSLATIONS_TITLE,
                    ]
                else:
                    # seq with sub and pos
                    pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if pos.lower() not in POS_TITLES:
                        wxr.wtp.debug(
                            "unhandled see translation subpage: "
                            "language={} sub={} "
                            "wxr.wtp.subsection={}".format(
                                language, sub, wxr.wtp.subsection
                            ),
                            sortid="page/2478",
                        )
                    seq = [language, sub, pos, TRANSLATIONS_TITLE]
                subnode = get_subpage_section(
                    wxr.wtp.title or "MISSING_TITLE",
                    TRANSLATIONS_TITLE,
                    [seq],
                )
                if subnode is None or not isinstance(subnode, WikiNode):
                    # Failed to find the normal subpage section
                    # seq with sub and pos
                    pos = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    seqs: list[list[str] | tuple[str, ...]] = [
                        [TRANSLATIONS_TITLE],
                        [language, pos],
                    ]
                    subnode = get_subpage_section(
                        wxr.wtp.title or "MISSING_TITLE",
                        TRANSLATIONS_TITLE,
                        seqs,
                    )
                if subnode is not None and isinstance(subnode, WikiNode):
                    parse_translations(data, subnode)
                return ""
            if name in (
                "c",
                "C",
                "categorize",
                "cat",
                "catlangname",
                "topics",
                "top",
                "qualifier",
                "cln",
            ):
                # These are expanded in the default way
                return None
            if name in (
                "trans-top",
                "trans-top-see",
            ):
                # XXX capture id from trans-top?  Capture sense here
                # instead of trying to parse it from expanded content?
                sense_parts = []
                # A missing or empty first argument leaves the sense
                # unset so that it is recomputed later.
                sense = ht.get(1) or None
                return None
            if name in (
                "trans-bottom",
                "trans-mid",
                "checktrans-mid",
                "checktrans-bottom",
            ):
                return None
            if name in ("checktrans-top", "trans-top-also"):
                # XXX capture trans-top-also?
                sense_parts = []
                sense = None
                return ""
            wxr.wtp.error(
                "UNIMPLEMENTED parse_translation_template: {} {}".format(
                    name, ht
                ),
                sortid="page/2517",
            )
            return ""

        wxr.wtp.expand(
            wxr.wtp.node_to_wikitext(node), template_fn=template_fn
        )

    def parse_translation_recurse(xlatnode: WikiNode) -> None:
        """Walk the children of ``xlatnode``, dispatching on node kind."""
        nonlocal sense
        nonlocal sense_parts
        for node in xlatnode.children:
            if isinstance(node, str):
                if sense:
                    if not node.isspace():
                        wxr.wtp.debug(
                            "skipping string in the middle of "
                            "translations: {}".format(node),
                            sortid="page/2530",
                        )
                    continue
                # Add a part to the sense
                sense_parts.append(node)
                sense = None
                continue
            assert isinstance(node, WikiNode)
            kind = node.kind
            if kind == NodeKind.LIST:
                for item in node.children:
                    if not isinstance(item, WikiNode):
                        continue
                    if item.kind != NodeKind.LIST_ITEM:
                        continue
                    if item.sarg == ":":
                        continue
                    parse_translation_item(item.children)
            elif kind == NodeKind.LIST_ITEM and node.sarg == ":":
                # Silently skip list items that are just indented; these
                # are used for text between translations, such as
                # indicating translations that need to be checked.
                pass
            elif kind == NodeKind.TEMPLATE:
                parse_translation_template(node)
            elif kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
                NodeKind.TABLE_CELL,
            ):
                parse_translation_recurse(node)
            elif kind == NodeKind.HTML:
                if node.attrs.get("class") == "NavFrame":
                    # Reset ``sense_parts`` (and force recomputing
                    # by clearing ``sense``) as each NavFrame specifies
                    # its own sense.  This helps eliminate garbage coming
                    # from text at the beginning at the translations
                    # section.
                    sense_parts = []
                    sense = None
                parse_translation_recurse(node)
            elif kind in LEVEL_KINDS:
                # Sub-levels will be recursed elsewhere
                pass
            elif kind in (NodeKind.ITALIC, NodeKind.BOLD):
                parse_translation_recurse(node)
            elif kind == NodeKind.PREFORMATTED:
                # Report through the page logger instead of printing to
                # stdout.
                wxr.wtp.debug(
                    "parse_translation_recurse: PREFORMATTED: {}".format(
                        node
                    )
                )
            elif kind == NodeKind.LINK:
                arg0 = node.largs[0]
                # Kludge: I've seen occasional normal links to translation
                # subpages from main pages (e.g., language/English/Noun
                # in July 2021) instead of the normal
                # {{see translation subpage|...}} template.  This should
                # handle them.  Note: must be careful not to read other
                # links, particularly things like in "human being":
                # "a human being -- see [[man/translations]]" (group title)
                if (
                    isinstance(arg0, (list, tuple))
                    and arg0
                    and isinstance(arg0[0], str)
                    and arg0[0].endswith("/" + TRANSLATIONS_TITLE)
                    and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))]
                    == wxr.wtp.title
                ):
                    wxr.wtp.debug(
                        "translations subpage link found on main "
                        "page instead "
                        "of normal {{see translation subpage|...}}",
                        sortid="page/2595",
                    )
                    sub = wxr.wtp.subsection or "MISSING_SUBSECTION"
                    if sub.lower() in POS_TITLES:
                        seq = [
                            language,
                            sub,
                            TRANSLATIONS_TITLE,
                        ]
                        subnode = get_subpage_section(
                            wxr.wtp.title,
                            TRANSLATIONS_TITLE,
                            [seq],
                        )
                        if subnode is not None and isinstance(
                            subnode, WikiNode
                        ):
                            parse_translations(data, subnode)
                    else:
                        wxr.wtp.error(
                            "/translations link outside part-of-speech"
                        )

                # The visible text of any non-category link contributes
                # to the sense description.
                if (
                    len(arg0) >= 1
                    and isinstance(arg0[0], str)
                    and not arg0[0].lower().startswith("category:")
                ):
                    for x in node.largs[-1]:
                        if isinstance(x, str):
                            sense_parts.append(x)
                        else:
                            parse_translation_recurse(x)
            elif not sense:
                sense_parts.append(node)
            else:
                wxr.wtp.debug(
                    "skipping text between translation items/senses: "
                    "{}".format(node),
                    sortid="page/2621",
                )

    # Main code of parse_translation().  We want ``sense`` to be assigned
    # regardless of recursion levels, and thus the code is structured
    # to define at this level and recurse in parse_translation_recurse().
    parse_translation_recurse(xlatnode)

3158 

def parse_etymology(data: WordData, node: LevelNode) -> None:
    """Extract etymology text, templates and examples from a section.

    Stores the cleaned section text under ``"etymology_text"`` and the
    captured templates under ``"etymology_templates"`` (each only when
    non-empty), and appends the results of ``zh-x``/``zh-q`` templates
    found before the first sub-level to ``"etymology_examples"``.
    """
    assert isinstance(data, dict)
    assert isinstance(node, WikiNode)

    captured: list[TemplateData] = []
    example_template_names = ["zh-x", "zh-q"]

    # Nesting depth inside templates that must not be captured; templates
    # are only recorded while this is zero.
    ignore_count = 0

    def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        """Blank out panel/example templates; track ignored templates."""
        nonlocal ignore_count
        if is_panel_template(wxr, name) or name in example_template_names:
            return ""
        if re.match(ignored_etymology_templates_re, name):
            ignore_count += 1
        return None

    # CONTINUE_HERE

    def etym_post_template_fn(
        name: str, ht: TemplateArgs, expansion: str
    ) -> None:
        """Record an expanded template unless it is being ignored."""
        nonlocal ignore_count
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
        elif re.match(ignored_etymology_templates_re, name):
            ignore_count -= 1
        elif ignore_count == 0:
            captured.append(
                {
                    "name": name,
                    "args": clean_template_args(wxr, ht),
                    "expansion": clean_node(wxr, None, expansion),
                }
            )
        return None

    # Subsections are not part of this section's own text.
    body = [
        child
        for child in node.children
        if not (isinstance(child, WikiNode) and child.kind in LEVEL_KINDS)
    ]
    # Cleaning the nodes produces the text and, through the post-template
    # hook, fills ``captured``.  The strip removes the ":" indent wikitext
    # that precedes a zh-x template.
    text = clean_node(
        wxr,
        None,
        body,
        template_fn=etym_template_fn,
        post_template_fn=etym_post_template_fn,
    ).strip(": \n")

    if text:
        data["etymology_text"] = text
    if captured:
        # Some etymology templates, like Template:root, do not generate
        # text, so they are stored independently of the text.  Elsewhere,
        # we check for Template:root and add some text to the expansion
        # to please the validation.
        data["etymology_templates"] = captured

    for child_node in node.find_child_recursively(
        LEVEL_KIND_FLAGS | NodeKind.TEMPLATE
    ):
        if child_node.kind in LEVEL_KIND_FLAGS:
            # Stop at the first sub-level.
            break
        if (
            isinstance(child_node, TemplateNode)
            and child_node.template_name in example_template_names
        ):
            data.setdefault("etymology_examples", []).extend(
                extract_template_zh_x(
                    wxr, child_node, None, ExampleData(raw_tags=[], tags=[])
                )
            )

3238 

3239 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3240 """This recurses into a subtree in the parse tree for a page.""" 

3241 nonlocal etym_data 

3242 nonlocal pos_data 

3243 nonlocal inside_level_four 

3244 

3245 redirect_list: list[str] = [] # for `zh-see` template 

3246 

3247 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3248 """This is called for otherwise unprocessed parts of the page. 

3249 We still expand them so that e.g. Category links get captured.""" 

3250 if name in wikipedia_templates: 

3251 data = select_data() 

3252 parse_wikipedia_template(wxr, data, ht) 

3253 return None 

3254 if is_panel_template(wxr, name): 

3255 return "" 

3256 return None 

3257 

3258 for node in treenode.children: 

3259 if not isinstance(node, WikiNode): 

3260 # print(" X{}".format(repr(node)[:40])) 

3261 continue 

3262 if isinstance(node, TemplateNode): 

3263 if process_soft_redirect_template(wxr, node, redirect_list): 

3264 continue 

3265 elif node.template_name == "zh-forms": 

3266 extract_zh_forms_template(wxr, node, select_data()) 

3267 elif ( 

3268 node.template_name.endswith("-kanjitab") 

3269 or node.template_name == "ja-kt" 

3270 ): 

3271 extract_ja_kanjitab_template(wxr, node, select_data()) 

3272 

3273 if not isinstance(node, LevelNode): 

3274 # XXX handle e.g. wikipedia links at the top of a language 

3275 # XXX should at least capture "also" at top of page 

3276 if node.kind in ( 

3277 NodeKind.HLINE, 

3278 NodeKind.LIST, 

3279 NodeKind.LIST_ITEM, 

3280 ): 

3281 continue 

3282 # print(" UNEXPECTED: {}".format(node)) 

3283 # Clean the node to collect category links 

3284 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3285 continue 

3286 t = clean_node( 

3287 wxr, etym_data, node.sarg if node.sarg else node.largs 

3288 ) 

3289 t = t.lower() 

3290 # XXX these counts were never implemented fully, and even this 

3291 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3292 wxr.config.section_counts[t] += 1 

3293 # print("PROCESS_CHILDREN: T:", repr(t)) 

3294 if t in IGNORED_TITLES: 

3295 pass 

3296 elif t.startswith(PRONUNCIATION_TITLE): 

3297 # Chinese Pronunciation section kludge; we demote these to 

3298 # be level 4 instead of 3 so that they're part of a larger 

3299 # etymology hierarchy; usually the data here is empty and 

3300 # acts as an inbetween between POS and Etymology data 

3301 if lang_code in ("zh",): 

3302 inside_level_four = True 

3303 if t.startswith(PRONUNCIATION_TITLE + " "): 

3304 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3305 # and each of them may have senses under Definition 

3306 push_level_four_section(True) 

3307 wxr.wtp.start_subsection(None) 

3308 if wxr.config.capture_pronunciation: 3308 ↛ 3416line 3308 didn't jump to line 3416 because the condition on line 3308 was always true

3309 data = select_data() 

3310 parse_pronunciation( 

3311 wxr, 

3312 node, 

3313 data, 

3314 etym_data, 

3315 have_etym, 

3316 base_data, 

3317 lang_code, 

3318 ) 

3319 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3320 push_etym() 

3321 wxr.wtp.start_subsection(None) 

3322 if wxr.config.capture_etymologies: 3322 ↛ 3416line 3322 didn't jump to line 3416 because the condition on line 3322 was always true

3323 m = re.search(r"\s(\d+(\.\d+)?)$", t) 

3324 if m: 

3325 etym_data["etymology_number"] = m.group(1) 

3326 parse_etymology(etym_data, node) 

3327 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3328 data = select_data() 

3329 extract_descendant_section(wxr, data, node, False) 

3330 elif ( 

3331 t in PROTO_ROOT_DERIVED_TITLES 

3332 and pos == "root" 

3333 and is_reconstruction 

3334 and wxr.config.capture_descendants 

3335 ): 

3336 data = select_data() 

3337 extract_descendant_section(wxr, data, node, True) 

3338 elif t == TRANSLATIONS_TITLE: 

3339 data = select_data() 

3340 parse_translations(data, node) 

3341 elif t in INFLECTION_TITLES: 

3342 parse_inflection(node, t, pos) 

3343 elif t == "alternative forms": 

3344 extract_alt_form_section(wxr, select_data(), node) 

3345 else: 

3346 lst = t.split() 

3347 while len(lst) > 1 and lst[-1].isdigit(): 3347 ↛ 3348line 3347 didn't jump to line 3348 because the condition on line 3347 was never true

3348 lst = lst[:-1] 

3349 t_no_number = " ".join(lst).lower() 

3350 if t_no_number in POS_TITLES: 

3351 push_pos() 

3352 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3353 pos = dt["pos"] or "MISSING_POS" 

3354 wxr.wtp.start_subsection(t) 

3355 if "debug" in dt: 

3356 wxr.wtp.debug( 

3357 "{} in section {}".format(dt["debug"], t), 

3358 sortid="page/2755", 

3359 ) 

3360 if "warning" in dt: 3360 ↛ 3361line 3360 didn't jump to line 3361 because the condition on line 3360 was never true

3361 wxr.wtp.wiki_notice( 

3362 "{} in section {}".format(dt["warning"], t), 

3363 sortid="page/2759", 

3364 ) 

3365 if "error" in dt: 3365 ↛ 3366line 3365 didn't jump to line 3366 because the condition on line 3365 was never true

3366 wxr.wtp.error( 

3367 "{} in section {}".format(dt["error"], t), 

3368 sortid="page/2763", 

3369 ) 

3370 if "note" in dt: 3370 ↛ 3371line 3370 didn't jump to line 3371 because the condition on line 3370 was never true

3371 wxr.wtp.note( 

3372 "{} in section {}".format(dt["note"], t), 

3373 sortid="page/20251017a", 

3374 ) 

3375 if "wiki_notice" in dt: 3375 ↛ 3376line 3375 didn't jump to line 3376 because the condition on line 3375 was never true

3376 wxr.wtp.wiki_notice( 

3377 "{} in section {}".format(dt["wiki_notices"], t), 

3378 sortid="page/20251017b", 

3379 ) 

3380 # Parse word senses for the part-of-speech 

3381 parse_part_of_speech(node, pos) 

3382 if "tags" in dt: 

3383 for pdata in sense_datas: 

3384 data_extend(pdata, "tags", dt["tags"]) 

3385 elif t_no_number in LINKAGE_TITLES: 

3386 # print(f"LINKAGE_TITLES NODE {node=}") 

3387 rel = LINKAGE_TITLES[t_no_number] 

3388 data = select_data() 

3389 parse_linkage( 

3390 wxr, 

3391 data, 

3392 rel, 

3393 node, 

3394 word, 

3395 sense_datas, 

3396 is_reconstruction, 

3397 ) 

3398 elif t_no_number == COMPOUNDS_TITLE: 

3399 data = select_data() 

3400 if wxr.config.capture_compounds: 3400 ↛ 3416line 3400 didn't jump to line 3416 because the condition on line 3400 was always true

3401 parse_linkage( 

3402 wxr, 

3403 data, 

3404 "derived", 

3405 node, 

3406 word, 

3407 sense_datas, 

3408 is_reconstruction, 

3409 ) 

3410 

3411 # XXX parse interesting templates also from other sections. E.g., 

3412 # {{Letter|...}} in ===See also=== 

3413 # Also <gallery> 

3414 

3415 # Recurse to children of this node, processing subtitles therein 

3416 stack.append(t) 

3417 process_children(node, pos) 

3418 stack.pop() 

3419 

3420 if len(redirect_list) > 0: 

3421 if len(pos_data) > 0: 

3422 pos_data["redirects"] = redirect_list 

3423 if "pos" not in pos_data: 3423 ↛ 3424line 3423 didn't jump to line 3424 because the condition on line 3423 was never true

3424 pos_data["pos"] = "soft-redirect" 

3425 else: 

3426 new_page_data = copy.deepcopy(base_data) 

3427 new_page_data["redirects"] = redirect_list 

3428 if "pos" not in new_page_data: 3428 ↛ 3430line 3428 didn't jump to line 3430 because the condition on line 3428 was always true

3429 new_page_data["pos"] = "soft-redirect" 

3430 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3431 page_datas.append(new_page_data) 

3432 

3433 def extract_examples( 

3434 others: list[WikiNode], sense_base: SenseData 

3435 ) -> list[ExampleData]: 

3436 """Parses through a list of definitions and quotes to find examples. 

3437 Returns a list of example dicts to be added to sense data. Adds 

3438 meta-data, mostly categories, into sense_base.""" 

3439 assert isinstance(others, list) 

3440 examples: list[ExampleData] = [] 

3441 

3442 for sub in others: 

3443 if not sub.sarg.endswith((":", "*")): 3443 ↛ 3444line 3443 didn't jump to line 3444 because the condition on line 3443 was never true

3444 continue 

3445 for item in sub.children: 

3446 if not isinstance(item, WikiNode): 3446 ↛ 3447line 3446 didn't jump to line 3447 because the condition on line 3446 was never true

3447 continue 

3448 if item.kind != NodeKind.LIST_ITEM: 3448 ↛ 3449line 3448 didn't jump to line 3449 because the condition on line 3448 was never true

3449 continue 

3450 usex_type = None 

3451 example_template_args = [] 

3452 example_template_names = [] 

3453 taxons = set() 

3454 

3455 # Bypass this function when parsing Chinese, Japanese and 

3456 # quotation templates. 

3457 new_example_lists = extract_example_list_item( 

3458 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3459 ) 

3460 if len(new_example_lists) > 0: 

3461 examples.extend(new_example_lists) 

3462 continue 

3463 

3464 def usex_template_fn( 

3465 name: str, ht: TemplateArgs 

3466 ) -> Optional[str]: 

3467 nonlocal usex_type 

3468 if is_panel_template(wxr, name): 

3469 return "" 

3470 if name in usex_templates: 

3471 usex_type = "example" 

3472 example_template_args.append(ht) 

3473 example_template_names.append(name) 

3474 elif name in quotation_templates: 

3475 usex_type = "quotation" 

3476 elif name in taxonomy_templates: 3476 ↛ 3477line 3476 didn't jump to line 3477 because the condition on line 3476 was never true

3477 taxons.update(ht.get(1, "").split()) 

3478 for prefix in template_linkages_to_ignore_in_examples: 

3479 if re.search( 

3480 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3481 ): 

3482 return "" 

3483 return None 

3484 

3485 # bookmark 

3486 ruby: list[tuple[str, str]] = [] 

3487 contents = item.children 

3488 if lang_code == "ja": 

3489 # Capture ruby contents if this is a Japanese language 

3490 # example. 

3491 # print(contents) 

3492 if ( 3492 ↛ 3497line 3492 didn't jump to line 3497 because the condition on line 3492 was never true

3493 contents 

3494 and isinstance(contents, str) 

3495 and re.match(r"\s*$", contents[0]) 

3496 ): 

3497 contents = contents[1:] 

3498 exp = wxr.wtp.parse( 

3499 wxr.wtp.node_to_wikitext(contents), 

3500 # post_template_fn=head_post_template_fn, 

3501 expand_all=True, 

3502 ) 

3503 rub, rest = extract_ruby(wxr, exp.children) 

3504 if rub: 

3505 for rtup in rub: 

3506 ruby.append(rtup) 

3507 contents = rest 

3508 subtext = clean_node( 

3509 wxr, sense_base, contents, template_fn=usex_template_fn 

3510 ) 

3511 

3512 frozen_taxons = frozenset(taxons) 

3513 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3514 

3515 # print(f"{subtext=}") 

3516 subtext = re.sub( 

3517 r"\s*\(please add an English " 

3518 r"translation of this " 

3519 r"(example|usage example|quote)\)", 

3520 "", 

3521 subtext, 

3522 ).strip() 

3523 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3524 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3525 # print("subtext:", repr(subtext)) 

3526 

3527 lines = subtext.splitlines() 

3528 # print(lines) 

3529 

3530 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3531 lines = list( 

3532 x 

3533 for x in lines 

3534 if not re.match( 

3535 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3536 r"Synonym: |Antonym: |Hyponym: |" 

3537 r"Hypernyms: |Derived terms: |" 

3538 r"Related terms: |" 

3539 r"Hypernym: |Derived term: |" 

3540 r"Coordinate terms:|" 

3541 r"Related term: |" 

3542 r"For more quotations using )", 

3543 x, 

3544 ) 

3545 ) 

3546 tr = "" 

3547 ref = "" 

3548 roman = "" 

3549 # for line in lines: 

3550 # print("LINE:", repr(line)) 

3551 # print(classify_desc(line)) 

3552 if len(lines) == 1 and lang_code != "en": 

3553 parts = example_splitter_re.split(lines[0]) 

3554 if ( 3554 ↛ 3562line 3554 didn't jump to line 3562 because the condition on line 3554 was never true

3555 len(parts) > 2 

3556 and len(example_template_args) == 1 

3557 and any( 

3558 ("―" in s) or ("—" in s) 

3559 for s in example_template_args[0].values() 

3560 ) 

3561 ): 

3562 if nparts := synch_splits_with_args( 

3563 lines[0], example_template_args[0] 

3564 ): 

3565 parts = nparts 

3566 if ( 3566 ↛ 3571line 3566 didn't jump to line 3571 because the condition on line 3566 was never true

3567 len(example_template_args) == 1 

3568 and "lit" in example_template_args[0] 

3569 ): 

3570 # ugly brute-force kludge in case there's a lit= arg 

3571 literally = example_template_args[0].get("lit", "") 

3572 if literally: 

3573 literally = ( 

3574 " (literally, “" 

3575 + clean_value(wxr, literally) 

3576 + "”)" 

3577 ) 

3578 else: 

3579 literally = "" 

3580 if ( 3580 ↛ 3619line 3580 didn't jump to line 3619 because the condition on line 3580 was never true

3581 len(example_template_args) == 1 

3582 and len(parts) == 2 

3583 and len(example_template_args[0]) 

3584 - ( 

3585 # horrible kludge to ignore these arguments 

3586 # when calculating how many there are 

3587 sum( 

3588 s in example_template_args[0] 

3589 for s in ( 

3590 "lit", # generates text, but we handle it 

3591 "inline", 

3592 "noenum", 

3593 "nocat", 

3594 "sort", 

3595 ) 

3596 ) 

3597 ) 

3598 == 3 

3599 and clean_value( 

3600 wxr, example_template_args[0].get(2, "") 

3601 ) 

3602 == parts[0].strip() 

3603 and clean_value( 

3604 wxr, 

3605 ( 

3606 example_template_args[0].get(3) 

3607 or example_template_args[0].get("translation") 

3608 or example_template_args[0].get("t", "") 

3609 ) 

3610 + literally, # in case there's a lit= argument 

3611 ) 

3612 == parts[1].strip() 

3613 ): 

3614 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3615 # is a pretty reliable 'heuristic', so we use it here 

3616 # before the others. To be extra sure the template 

3617 # doesn't do anything weird, we compare the arguments 

3618 # and the output to each other. 

3619 lines = [parts[0].strip()] 

3620 tr = parts[1].strip() 

3621 elif ( 

3622 len(parts) == 2 

3623 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3624 ): 

3625 # These other branches just do some simple heuristics w/ 

3626 # the expanded output of the template (if applicable). 

3627 lines = [parts[0].strip()] 

3628 tr = parts[1].strip() 

3629 elif ( 3629 ↛ 3635line 3629 didn't jump to line 3635 because the condition on line 3629 was never true

3630 len(parts) == 3 

3631 and classify_desc2(parts[1]) 

3632 in ("romanization", "english") 

3633 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3634 ): 

3635 lines = [parts[0].strip()] 

3636 roman = parts[1].strip() 

3637 tr = parts[2].strip() 

3638 else: 

3639 parts = re.split(r"\s+-\s+", lines[0]) 

3640 if ( 3640 ↛ 3644line 3640 didn't jump to line 3644 because the condition on line 3640 was never true

3641 len(parts) == 2 

3642 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3643 ): 

3644 lines = [parts[0].strip()] 

3645 tr = parts[1].strip() 

3646 elif len(lines) > 1: 

3647 if any( 

3648 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3649 ) and not (len(example_template_names) == 1): 

3650 refs: list[str] = [] 

3651 for i in range(len(lines)): 3651 ↛ 3657line 3651 didn't jump to line 3657 because the loop on line 3651 didn't complete

3652 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3652 ↛ 3653line 3652 didn't jump to line 3653 because the condition on line 3652 was never true

3653 break 

3654 refs.append(lines[i].strip()) 

3655 if re.search(r"[]\d:)]\s*$", lines[i]): 

3656 break 

3657 ref = " ".join(refs) 

3658 lines = lines[i + 1 :] 

3659 if ( 

3660 lang_code != "en" 

3661 and len(lines) >= 2 

3662 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3663 ): 

3664 i = len(lines) - 1 

3665 while ( 3665 ↛ 3670line 3665 didn't jump to line 3670 because the condition on line 3665 was never true

3666 i > 1 

3667 and classify_desc2(lines[i - 1]) 

3668 in ENGLISH_TEXTS 

3669 ): 

3670 i -= 1 

3671 tr = "\n".join(lines[i:]) 

3672 lines = lines[:i] 

3673 if len(lines) >= 2: 

3674 if classify_desc2(lines[-1]) == "romanization": 

3675 roman = lines[-1].strip() 

3676 lines = lines[:-1] 

3677 

3678 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

3679 ref = lines[0] 

3680 lines = lines[1:] 

3681 elif lang_code != "en" and len(lines) == 2: 

3682 cls1 = classify_desc2(lines[0]) 

3683 cls2 = classify_desc2(lines[1]) 

3684 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3685 tr = lines[1] 

3686 lines = [lines[0]] 

3687 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3687 ↛ 3688line 3687 didn't jump to line 3688 because the condition on line 3687 was never true

3688 tr = lines[0] 

3689 lines = [lines[1]] 

3690 elif ( 3690 ↛ 3697line 3690 didn't jump to line 3697 because the condition on line 3690 was never true

3691 re.match(r"^[#*]*:+", lines[1]) 

3692 and classify_desc2( 

3693 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3694 ) 

3695 in ENGLISH_TEXTS 

3696 ): 

3697 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3698 lines = [lines[0]] 

3699 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3700 # Both were classified as English, but 

3701 # presumably one is not. Assume first is 

3702 # non-English, as that seems more common. 

3703 tr = lines[1] 

3704 lines = [lines[0]] 

3705 elif ( 

3706 usex_type != "quotation" 

3707 and lang_code != "en" 

3708 and len(lines) == 3 

3709 ): 

3710 cls1 = classify_desc2(lines[0]) 

3711 cls2 = classify_desc2(lines[1]) 

3712 cls3 = classify_desc2(lines[2]) 

3713 if ( 

3714 cls3 == "english" 

3715 and cls2 in ("english", "romanization") 

3716 and cls1 != "english" 

3717 ): 

3718 tr = lines[2].strip() 

3719 roman = lines[1].strip() 

3720 lines = [lines[0].strip()] 

3721 elif ( 3721 ↛ 3729line 3721 didn't jump to line 3729 because the condition on line 3721 was never true

3722 usex_type == "quotation" 

3723 and lang_code != "en" 

3724 and len(lines) > 2 

3725 ): 

3726 # for x in lines: 

3727 # print(" LINE: {}: {}" 

3728 # .format(classify_desc2(x), x)) 

3729 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3730 ref = lines[0] 

3731 lines = lines[2:] 

3732 cls1 = classify_desc2(lines[-1]) 

3733 if cls1 == "english": 

3734 i = len(lines) - 1 

3735 while ( 

3736 i > 1 

3737 and classify_desc2(lines[i - 1]) 

3738 == ENGLISH_TEXTS 

3739 ): 

3740 i -= 1 

3741 tr = "\n".join(lines[i:]) 

3742 lines = lines[:i] 

3743 

3744 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3745 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3746 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3747 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3748 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3749 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3750 ref = re.sub( 

3751 r", (volume |number |page )?“?" 

3752 r"\(please specify ([^)]|\(s\))*\)”?|" 

3753 ", text here$", 

3754 "", 

3755 ref, 

3756 ) 

3757 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3758 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3759 subtext = "\n".join(x for x in lines if x) 

3760 if not tr and lang_code != "en": 

3761 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3762 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3762 ↛ 3763line 3762 didn't jump to line 3763 because the condition on line 3762 was never true

3763 tr = m.group(2) 

3764 subtext = subtext[: m.start()] + m.group(1) 

3765 elif lines: 

3766 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

3767 if ( 3767 ↛ 3771line 3767 didn't jump to line 3771 because the condition on line 3767 was never true

3768 len(parts) == 2 

3769 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3770 ): 

3771 subtext = parts[0].strip() 

3772 tr = parts[1].strip() 

3773 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

3774 subtext = re.sub( 

3775 r"(please add an English translation of " 

3776 r"this (quote|usage example))", 

3777 "", 

3778 subtext, 

3779 ) 

3780 subtext = re.sub( 

3781 r"\s*→New International Version " "translation$", 

3782 "", 

3783 subtext, 

3784 ) # e.g. pis/Tok Pisin (Bible) 

3785 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

3786 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

3787 note = None 

3788 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

3789 if ( 3789 ↛ 3797line 3789 didn't jump to line 3797 because the condition on line 3789 was never true

3790 m is not None 

3791 and lang_code != "en" 

3792 and ( 

3793 m.group(1).startswith("with ") 

3794 or classify_desc2(m.group(1)) == "english" 

3795 ) 

3796 ): 

3797 note = m.group(1) 

3798 subtext = subtext[m.end() :] 

3799 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

3800 ref = re.sub(r",\s*→ISBN", "", ref) 

3801 ref = ref.strip() 

3802 if ref.endswith(":") or ref.endswith(","): 

3803 ref = ref[:-1].strip() 

3804 ref = re.sub(r"\s+,\s+", ", ", ref) 

3805 ref = re.sub(r"\s+", " ", ref) 

3806 if ref and not subtext: 3806 ↛ 3807line 3806 didn't jump to line 3807 because the condition on line 3806 was never true

3807 subtext = ref 

3808 ref = "" 

3809 if subtext: 

3810 dt: ExampleData = {"text": subtext} 

3811 if ref: 

3812 dt["ref"] = ref 

3813 if tr: 

3814 dt["english"] = tr # DEPRECATED for "translation" 

3815 dt["translation"] = tr 

3816 if usex_type: 

3817 dt["type"] = usex_type 

3818 if note: 3818 ↛ 3819line 3818 didn't jump to line 3819 because the condition on line 3818 was never true

3819 dt["note"] = note 

3820 if roman: 

3821 dt["roman"] = roman 

3822 if ruby: 

3823 dt["ruby"] = ruby 

3824 examples.append(dt) 

3825 

3826 return examples 

3827 

3828 # Main code of parse_language() 

3829 # Process the section 

3830 stack.append(language) 

3831 process_children(langnode, None) 

3832 stack.pop() 

3833 

3834 # Finalize word entries 

3835 push_etym() 

3836 ret = [] 

3837 for data in page_datas: 

3838 merge_base(data, base_data) 

3839 ret.append(data) 

3840 

3841 # Copy all tags to word senses 

3842 for data in ret: 

3843 if "senses" not in data: 3843 ↛ 3844line 3843 didn't jump to line 3844 because the condition on line 3843 was never true

3844 continue 

3845 # WordData should not have a 'tags' field, but if it does, it's 

3846 # deleted and its contents removed and placed in each sense; 

3847 # that's why the type ignores. 

3848 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

3849 if "tags" in data: 

3850 del data["tags"] # type: ignore[typeddict-item] 

3851 for sense in data["senses"]: 

3852 data_extend(sense, "tags", tags) 

3853 

3854 return ret 

3855 

3856 

def parse_wikipedia_template(
    wxr: WiktextractContext, data: WordData, ht: TemplateArgs
) -> None:
    """Record the target of a {{wikipedia|...}}-style template in ``data``.

    The value appended to ``data["wikipedia"]`` is the cleaned first
    positional argument; when that is empty the current page title is
    used, and "MISSING_PAGE_TITLE" if there is no title either.  If the
    template has a non-empty ``lang`` argument, it is prepended as
    ``"<lang>:<page>"``.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(data, dict)
    assert isinstance(ht, dict)

    # The "lang" argument is cleaned before the page argument, as both
    # calls receive ``data`` and may record information into it.
    lang_prefix = clean_node(wxr, data, ht.get("lang", ()))
    target = clean_node(wxr, data, ht.get(1, ()))
    if not target:
        target = wxr.wtp.title or "MISSING_PAGE_TITLE"

    entry = lang_prefix + ":" + target if lang_prefix else target
    data_append(data, "wikipedia", entry)

3874 

3875 

def parse_top_template(
    wxr: WiktextractContext, node: WikiNode, data: WordData
) -> None:
    """Handle a template found at the top of a page, above the first
    language heading.

    Wikipedia-style templates and ``{{wikidata}}`` are captured into
    ``data``; a fixed set of known templates is silently dropped; any
    other template is reported through ``wxr.wtp.debug`` and dropped.
    """
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(node, WikiNode)
    assert isinstance(data, dict)

    # Known templates whose content is currently discarded.
    discarded_names = (
        "reconstruction",
        "see also",  # XXX capture
        "cardinalbox",  # XXX capture
        "character info",  # XXX capture
        "commonscat",  # XXX capture link to Wikimedia commons
        # XXX this should be captured to replace page title with the
        # correct title.  E.g. ⿰亻革家
        "wrongtitle",
    )

    def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]:
        if name in wikipedia_templates:
            parse_wikipedia_template(wxr, data, ht)
            return None
        if is_panel_template(wxr, name):
            return ""
        lowered = name.lower()
        # XXX "also" shows related words that might really have been the
        # intended word, capture them
        if (
            name in discarded_names
            or lowered == "also"
            or lowered.startswith("also/")
        ):
            return ""
        if name == "wikidata":
            arg = clean_node(wxr, data, ht.get(1, ()))
            if arg.startswith(("Q", "Lexeme:L")):
                data_append(data, "wikidata", arg)
            return ""
        wxr.wtp.debug(
            "UNIMPLEMENTED top-level template: {} {}".format(name, ht),
            sortid="page/2870",
        )
        return ""

    clean_node(wxr, None, [node], template_fn=top_template_fn)

3925 

3926 

def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str:
    """Fix subtitle hierarchy to be strict Language -> Etymology ->
    Pronunciation -> Part-of-Speech -> Translation/Linkage.  Also merge
    Etymology sections that are next to each other.

    ``text`` is the wikitext of a page.  Every heading line is rewritten
    to a normalised level: language names become level 2, etymology-type
    titles level 3, pronunciation level 4, part-of-speech titles level 5
    and every other title level 6.  The text between headings is passed
    through unchanged.  Returns the rewritten wikitext.
    """

    # Wiktextract issue #620, Chinese Glyph Origin before an etymology
    # section get overwritten. In this case, let's just combine the two.

    # In Chinese entries, Pronunciation can be preceded on the
    # same level 3 by its Etymology *and* Glyph Origin sections:
    # ===Glyph Origin===
    # ===Etymology===
    # ===Pronunciation===
    # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation
    # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default')
    # are now level 6

    # Known lowercase PoS names are in part_of_speech_map
    # Known lowercase linkage section names are in linkage_map

    # re.split() keeps the capturing groups, so ``old`` is laid out as
    # [text, left, title, inner, right, text, left, title, inner, right, ...]
    old = re.split(
        r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text
    )

    npar = 4  # Number of parentheses in above expression
    parts = [old[0]]
    etymology_prefixes = tuple(ETYMOLOGY_TITLES)
    level = None
    # Index in ``parts`` of the most recently emitted heading.  Needed
    # because after a merge the previous heading is no longer parts[-2].
    last_title_index = None
    for i in range(1, len(old), npar + 1):
        left = old[i]
        right = old[i + npar - 1]
        # remove Wikilinks in title
        title = re.sub(r"^\[\[", "", old[i + 1])
        title = re.sub(r"\]\]$", "", title)
        prev_level = level
        level = len(left)
        part = old[i + npar]
        merge_into_previous = False  # When combining etymology sections
        if level != len(right):
            wxr.wtp.debug(
                "subtitle has unbalanced levels: "
                "{!r} has {} on the left and {} on the right".format(
                    title, left, right
                ),
                sortid="page/2904",
            )
        lc = title.lower()
        if name_to_code(title, "en") != "":
            if level > 2:
                wxr.wtp.debug(
                    "subtitle has language name {} at level {}".format(
                        title, level
                    ),
                    sortid="page/2911",
                )
            level = 2
        elif lc.startswith(etymology_prefixes):
            if level > 3:
                wxr.wtp.debug(
                    "etymology section {} at level {}".format(title, level),
                    sortid="page/2917",
                )
            level = 3
            # Only the etymology branch assigns level 3, so this means the
            # previous heading was an etymology-type title as well.
            if prev_level == 3 and last_title_index is not None:
                # Two etymology (Glyph Origin + Etymology) sections
                # cheek-to-cheek: drop this heading and keep its body.
                merge_into_previous = True
                # Modify the title of previous ("Glyph Origin") section, in
                # case we have a meaningful title like "Etymology 1".  The
                # heading is written at the normalised level, and at the
                # tracked index so that body text is never overwritten
                # when more than two such sections follow each other.
                parts[last_title_index] = "{}{}{}".format(
                    "=" * level, title, "=" * level
                )
        elif lc.startswith(PRONUNCIATION_TITLE):
            # Pronunciation is now a level between POS and Etymology, so
            # we need to shift everything down by one
            level = 4
        elif lc in POS_TITLES:
            level = 5
        else:
            # Translations, linkages, compounds, inflections, descendants,
            # proto-root derived terms, ignored titles and unknown titles
            # all live below the part-of-speech level.
            level = 6
        if not merge_into_previous:
            last_title_index = len(parts)
            parts.append("{}{}{}".format("=" * level, title, "=" * level))
        parts.append(part)

    return "".join(parts)

4031 

4032 

def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]:
    """Parse the wikitext of one page and return its word entries.

    ``word`` is the page title and ``text`` its wikitext.  Pages whose
    title ends in "/" + TRANSLATIONS_TITLE are skipped and yield an empty
    list.  Data captured from templates above the first language heading
    is added to every returned entry; entries whose "word" differs from
    the page title get an "original_title" field.
    """
    # Skip translation pages
    if word.endswith("/" + TRANSLATIONS_TITLE):
        return []

    if wxr.config.verbose:
        logger.info(f"Parsing page: {word}")

    wxr.config.word = word
    wxr.wtp.start_page(word)

    # Remove <noinclude> and similar tags from main pages.  They
    # should not appear there, but at least net/Elfdala has one and it
    # is probably not the only one.
    for tag_pattern in (
        r"(?si)<(/)?noinclude\s*>",
        r"(?si)<(/)?onlyinclude\s*>",
        r"(?si)<(/)?includeonly\s*>",
    ):
        text = re.sub(tag_pattern, "", text)

    # Fix up the subtitle hierarchy.  There are hundreds if not thousands of
    # pages that have, for example, Translations section under Linkage, or
    # Translations section on the same level as Noun.  Enforce a proper
    # hierarchy by manipulating the subtitle levels in certain cases.
    text = fix_subtitle_hierarchy(wxr, text)

    # Parse the page, pre-expanding those templates that are likely to
    # influence parsing
    tree = wxr.wtp.parse(
        text,
        pre_expand=True,
        additional_expand=ADDITIONAL_EXPAND_TEMPLATES,
        do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES,
    )

    top_data: WordData = {}

    # Iterate over top-level titles, which should be languages for normal
    # pages
    by_lang = defaultdict(list)
    for langnode in tree.children:
        if not isinstance(langnode, WikiNode):
            continue
        node_kind = langnode.kind
        if node_kind == NodeKind.TEMPLATE:
            parse_top_template(wxr, langnode, top_data)
            continue
        if node_kind == NodeKind.LINK:
            # Some pages have links at top level, e.g., "trees" in Wiktionary
            continue
        if node_kind != NodeKind.LEVEL2:
            wxr.wtp.debug(
                f"unexpected top-level node: {langnode}", sortid="page/3014"
            )
            continue

        lang = clean_node(wxr, None, langnode.sarg or langnode.largs)
        lang_code = name_to_code(lang, "en")
        if lang_code == "":
            wxr.wtp.debug(
                f"unrecognized language name: {lang}", sortid="page/3019"
            )
        wanted_codes = wxr.config.capture_language_codes
        if wanted_codes and lang_code not in wanted_codes:
            continue
        wxr.wtp.start_section(lang)

        # Collect all words from the page, and propagate fields resulting
        # from top-level templates to each of them.
        for data in parse_language(wxr, langnode, lang, lang_code):
            if "lang" not in data:
                wxr.wtp.debug(
                    "internal error -- no lang in data: {}".format(data),
                    sortid="page/3034",
                )
                continue
            for key, values in top_data.items():
                assert isinstance(values, (list, tuple))
                data_extend(data, key, values)
            by_lang[data["lang"]].append(data)

    # XXX this code is clearly out of date.  There is no longer a "conjugation"
    # field.  FIX OR REMOVE.
    # Do some post-processing on the words.  For example, we may distribute
    # conjugation information to all the words.
    ret = [entry for entries in by_lang.values() for entry in entries]

    for entry in ret:
        if entry["word"] != word:
            if word.startswith("Unsupported titles/"):
                wxr.wtp.debug(
                    f"UNSUPPORTED TITLE: '{word}' -> '{entry['word']}'",
                    sortid="20231101/3578page.py",
                )
            else:
                wxr.wtp.debug(
                    f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{entry['word']}'",
                    sortid="20231101/3582page.py",
                )
            entry["original_title"] = word
        # validate tag data
        recursively_separate_raw_tags(wxr, entry)  # type:ignore[arg-type]
    return ret

4144 

4145 

def recursively_separate_raw_tags(
    wxr: WiktextractContext, data: dict[str, Any]
) -> None:
    """Split the "tags" of ``data`` into valid tags and raw tags, in place.

    Entries of ``data["tags"]`` that are in ``valid_tags`` stay under
    "tags"; the others are appended to "raw_tags".  A "tags" field left
    without any valid entry is removed.  The same is done for every list
    value of ``data`` whose first item is a dict.  A non-dict ``data`` is
    reported with ``wxr.wtp.error`` and otherwise ignored.
    """
    if not isinstance(data, dict):
        wxr.wtp.error(
            "'data' is not dict; most probably "
            "data has a list that contains at least one dict and "
            "at least one non-dict item",
            sortid="en/page-4016/20240419",
        )
        return

    kept_tags: list[str] = []
    # An existing "raw_tags" list is extended in place.
    raw_tags: list[str] = data.get("raw_tags", [])
    for field, val in data.items():
        if field == "tags":
            for tag in val:
                target = kept_tags if tag in valid_tags else raw_tags
                target.append(tag)
        # Only the first item decides whether a list is a list of dicts.
        if isinstance(val, list) and len(val) > 0 and isinstance(val[0], dict):
            for child in val:
                recursively_separate_raw_tags(wxr, child)

    if kept_tags:
        data["tags"] = kept_tags
    elif "tags" in data:
        del data["tags"]
    if raw_tags:
        data["raw_tags"] = raw_tags

4176 

4177 

def process_soft_redirect_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    redirect_pages: list[str],
) -> bool:
    """Collect the targets of a soft-redirect template.

    For ``zh-see`` the first positional argument is the target; for
    ``ja-see`` and ``ja-see-kango`` every positional (integer-keyed)
    argument is.  Non-empty cleaned targets are appended to
    ``redirect_pages``.

    Returns True if ``template_node`` is one of these templates, False
    otherwise.
    """
    name = template_node.template_name

    if name == "zh-see":
        # https://en.wiktionary.org/wiki/Template:zh-see
        target = clean_node(
            wxr, None, template_node.template_parameters.get(1, "")
        )
        if target != "":
            redirect_pages.append(target)
        return True

    if name in ["ja-see", "ja-see-kango"]:
        # https://en.wiktionary.org/wiki/Template:ja-see
        for key, value in template_node.template_parameters.items():
            if not isinstance(key, int):
                continue
            target = clean_node(wxr, None, value)
            if target != "":
                redirect_pages.append(target)
        return True

    return False

4201 

4202 

# Row-header labels handled by the {{zh-forms}} extraction code, mapped to
# the tag stored on the extracted form.  Labels missing from this table are
# kept as raw tags by the functions that consult it.
ZH_FORMS_TAGS: dict[str, str] = {
    "trad.": "Traditional-Chinese",
    "simp.": "Simplified-Chinese",
    "alternative forms": "alternative",
    "2nd round simp.": "Second-Round-Simplified-Chinese",
}

4209 

4210 

def extract_zh_forms_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract forms and the literal meaning from a {{zh-forms}} template.

    The ``lit`` argument, when non-empty, is stored as
    ``base_data["literal_meaning"]``.  The template is then expanded and
    each row of each resulting table is handed cell by cell to the
    header-cell and data-cell extractors, which add to ``base_data``.
    """
    # https://en.wiktionary.org/wiki/Template:zh-forms
    lit_meaning = clean_node(
        wxr, None, t_node.template_parameters.get("lit", "")
    )
    if lit_meaning != "":
        base_data["literal_meaning"] = lit_meaning

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in expanded_node.find_child(NodeKind.TABLE):
        for row in table.find_child(NodeKind.TABLE_ROW):
            # Header state is per row and comes from the latest header cell.
            row_header = ""
            row_header_tags: list[str] = []
            header_has_span = False
            for cell in row.find_child(cell_kinds):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    row_header, row_header_tags, header_has_span = (
                        extract_zh_forms_header_cell(wxr, base_data, cell)
                    )
                    continue
                # Data cells are skipped when the row's header cell
                # contained a <span>.
                if header_has_span:
                    continue
                extract_zh_forms_data_cell(
                    wxr, base_data, cell, row_header, row_header_tags
                )

    # Do not leave an empty "forms" list behind.
    if "forms" in base_data and len(base_data["forms"]) == 0:
        del base_data["forms"]

4242 

4243 

def extract_zh_forms_header_cell(
    wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode
) -> tuple[str, list[str], bool]:
    """Process one header cell of an expanded {{zh-forms}} table.

    The header text is everything before the first direct <span> child;
    it is split on " and " into raw tags.  Forms found in <span> elements
    (searched recursively) whose ``lang`` is "zh-Hant" or "zh-Hans" are
    appended to ``base_data["forms"]`` with those tags.

    Returns ``(row_header, row_header_tags, header_has_span)``, where the
    last item tells whether the cell has a direct <span> child.
    """
    span_indexes = [
        index for index, _ in header_cell.find_html("span", with_index=True)
    ]
    header_has_span = len(span_indexes) > 0
    first_span_index = min([len(header_cell.children), *span_indexes])

    row_header = clean_node(wxr, None, header_cell.children[:first_span_index])
    row_header_tags = [
        stripped
        for stripped in (piece.strip() for piece in row_header.split(" and "))
        if stripped != ""
    ]

    for span_tag in header_cell.find_html_recursively("span"):
        span_lang = span_tag.attrs.get("lang", "")
        form_nodes = []
        sup_title = ""
        for node in span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "sup":
                # A <sup> is not part of the form; the title of a span
                # inside it (the last one, if several) becomes a raw tag.
                for sup_span in node.find_html("span"):
                    sup_title = sup_span.attrs.get("title", "")
            else:
                form_nodes.append(node)
        if span_lang not in ["zh-Hant", "zh-Hans"]:
            continue
        for word in clean_node(wxr, None, form_nodes).split("/"):
            if word in [wxr.wtp.title, ""]:
                continue
            form = {"form": word}
            for raw_tag in row_header_tags:
                if raw_tag in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
                else:
                    data_append(form, "raw_tags", raw_tag)
            if sup_title != "":
                data_append(form, "raw_tags", sup_title)
            data_append(base_data, "forms", form)

    return row_header, row_header_tags, header_has_span

4283 

4284 

# The two dictionary keys under which tag lists are stored.
TagLiteral = Literal["tags", "raw_tags"]
# The same keys as a tuple, for iterating over them.
TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = (
    "tags",
    "raw_tags",
)

4287 

4288 

def extract_zh_forms_data_cell(
    wxr: WiktextractContext,
    base_data: WordData,
    cell: WikiNode,
    row_header: str,
    row_header_tags: list[str],
) -> None:
    """Process one data cell of an expanded {{zh-forms}} table.

    Direct <span> children are handled by their attributes:
    - style "white-space:nowrap;": the span is processed recursively as
      if it were a cell of its own;
    - style containing "font-size:80%": its text is a tag that is added
      to every form collected so far in this cell;
    - lang "zh-Hant", "zh-Hans" or "zh": its text is a form.

    Collected forms go to ``base_data["anagrams"]`` when ``row_header``
    is "anagram", otherwise to ``base_data["forms"]``.
    """
    from .zh_pron_tags import ZH_PRON_TAGS

    forms: list[FormData] = []
    for top_span_tag in cell.find_html("span"):
        span_style = top_span_tag.attrs.get("style", "")
        span_lang = top_span_tag.attrs.get("lang", "")

        if span_style == "white-space:nowrap;":
            extract_zh_forms_data_cell(
                wxr, base_data, top_span_tag, row_header, row_header_tags
            )
            continue

        if "font-size:80%" in span_style:
            raw_tag = clean_node(wxr, None, top_span_tag)
            if raw_tag == "":
                continue
            if raw_tag in ZH_PRON_TAGS:
                tr_tag = ZH_PRON_TAGS[raw_tag]
                for form in forms:
                    if isinstance(tr_tag, list):
                        data_extend(form, "tags", tr_tag)
                    elif isinstance(tr_tag, str):
                        data_append(form, "tags", tr_tag)
            else:
                tag_field = "tags" if raw_tag in valid_tags else "raw_tags"
                for form in forms:
                    data_append(form, tag_field, raw_tag)
            continue

        if span_lang not in ["zh-Hant", "zh-Hans", "zh"]:
            continue
        word = clean_node(wxr, None, top_span_tag)
        if word in ["", "/", wxr.wtp.title]:
            continue
        form = {"form": word}
        # Anagram rows do not get the row header turned into tags.
        if row_header != "anagram":
            for raw_tag in row_header_tags:
                if raw_tag in ZH_FORMS_TAGS:
                    data_append(form, "tags", ZH_FORMS_TAGS[raw_tag])
                else:
                    data_append(form, "raw_tags", raw_tag)
        if span_lang == "zh-Hant":
            data_append(form, "tags", "Traditional-Chinese")
        elif span_lang == "zh-Hans":
            data_append(form, "tags", "Simplified-Chinese")
        forms.append(form)

    if row_header != "anagram":
        data_extend(base_data, "forms", forms)
        return

    # Anagrams are stored as linkage entries rather than as forms.
    for form in forms:
        l_data: LinkageData = {"word": form["form"]}
        for key in TAG_LITERALS_TUPLE:
            if key in form:
                l_data[key] = form[key]
        data_append(base_data, "anagrams", l_data)

4345 

4346 

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData
):
    """Extract alternative spellings from a {{ja-kanjitab}} template.

    The template is expanded; only tables with a header cell whose text
    starts with "Alternative spelling" are used.  In those tables each
    <span> directly inside a data cell gives a form tagged "alternative"
    and "kanji", and each <small> gives a tag for the form before it.
    The forms are added to ``base_data["forms"]``.  Top-level links of
    the expansion are finally cleaned with ``base_data`` as the target.
    """
    # https://en.wiktionary.org/wiki/Template:ja-kanjitab
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        # Every header cell is cleaned, as in a plain loop, before the
        # texts are inspected.
        header_texts = [
            clean_node(wxr, None, header_node)
            for row in table.find_child(NodeKind.TABLE_ROW)
            for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL)
        ]
        if not any(
            header_text.startswith("Alternative spelling")
            for header_text in header_texts
        ):
            continue

        forms = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                for child_node in cell_node.children:
                    if not isinstance(child_node, HTMLNode):
                        continue
                    if child_node.tag == "span":
                        word = clean_node(wxr, None, child_node)
                        if word != "":
                            forms.append(
                                {
                                    "form": word,
                                    "tags": ["alternative", "kanji"],
                                }
                            )
                    elif child_node.tag == "small":
                        # The note is wrapped in parentheses; it belongs to
                        # the most recently collected form.
                        raw_tag = clean_node(wxr, None, child_node).strip(
                            "()"
                        )
                        if raw_tag != "" and len(forms) > 0:
                            tag_field = (
                                "tags" if raw_tag in valid_tags else "raw_tags"
                            )
                            data_append(forms[-1], tag_field, raw_tag)
        data_extend(base_data, "forms", forms)

    for link_node in expanded_node.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link_node)