Coverage for src/wiktextract/extractor/en/page.py: 73%

1974 statements  

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8import sys 

9from collections import defaultdict 

10from functools import partial 

11from typing import ( 

12 TYPE_CHECKING, 

13 Any, 

14 Iterable, 

15 Iterator, 

16 Optional, 

17 Set, 

18 Union, 

19 cast, 

20) 

21 

22from mediawiki_langcodes import get_all_names, name_to_code 

23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

24from wikitextprocessor.parser import ( 

25 LEVEL_KIND_FLAGS, 

26 GeneralNode, 

27 NodeKind, 

28 TemplateNode, 

29 WikiNode, 

30) 

31 

32from ...clean import clean_template_args, clean_value 

33from ...datautils import ( 

34 data_append, 

35 data_extend, 

36 ns_title_prefix_tuple, 

37) 

38from ...page import ( 

39 LEVEL_KINDS, 

40 clean_node, 

41 is_panel_template, 

42 recursively_extract, 

43) 

44from ...tags import valid_tags 

45from ...wxr_context import WiktextractContext 

46from ...wxr_logging import logger 

47from ..ruby import extract_ruby, parse_ruby 

48from ..share import strip_nodes 

49from .example import extract_example_list_item, extract_template_zh_x 

50from .form_descriptions import ( 

51 classify_desc, 

52 decode_tags, 

53 distw, 

54 parse_alt_or_inflection_of, 

55 parse_sense_qualifier, 

56 parse_word_head, 

57) 

58from .inflection import TableContext, parse_inflection_section 

59from .info_templates import ( 

60 INFO_TEMPLATE_FUNCS, 

61 parse_info_template_arguments, 

62 parse_info_template_node, 

63) 

64from .linkages import extract_alt_form_section, parse_linkage_item_text 

65from .parts_of_speech import PARTS_OF_SPEECH 

66from .section_titles import ( 

67 COMPOUNDS_TITLE, 

68 DESCENDANTS_TITLE, 

69 ETYMOLOGY_TITLES, 

70 IGNORED_TITLES, 

71 INFLECTION_TITLES, 

72 LINKAGE_TITLES, 

73 POS_TITLES, 

74 PRONUNCIATION_TITLE, 

75 PROTO_ROOT_DERIVED_TITLES, 

76 TRANSLATIONS_TITLE, 

77) 

78from .translations import parse_translation_item_text 

79from .type_utils import ( 

80 DescendantData, 

81 ExampleData, 

82 FormData, 

83 LinkageData, 

84 SenseData, 

85 SoundData, 

86 TemplateData, 

87 WordData, 

88) 

89from .unsupported_titles import unsupported_title_map 

90 

91# When determining whether a string is 'english', classify_desc 

92# might return 'taxonomic' which is English text 99% of the time. 

93ENGLISH_TEXTS = ("english", "taxonomic") 

94 

95# Matches head tag 

96HEAD_TAG_RE = re.compile( 

97 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

98 r"hangul-symbol|syllable-hangul)$|" 

99 + r"^(latin|" 

100 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

101 + r")-(" 

102 + "|".join( 

103 [ 

104 "abbr", 

105 "adj", 

106 "adjective", 

107 "adjective form", 

108 "adjective-form", 

109 "adv", 

110 "adverb", 

111 "affix", 

112 "animal command", 

113 "art", 

114 "article", 

115 "aux", 

116 "bound pronoun", 

117 "bound-pronoun", 

118 "Buyla", 

119 "card num", 

120 "card-num", 

121 "cardinal", 

122 "chunom", 

123 "classifier", 

124 "clitic", 

125 "cls", 

126 "cmene", 

127 "cmavo", 

128 "colloq-verb", 

129 "colverbform", 

130 "combining form", 

131 "combining-form", 

132 "comparative", 

133 "con", 

134 "concord", 

135 "conj", 

136 "conjunction", 

137 "conjug", 

138 "cont", 

139 "contr", 

140 "converb", 

141 "daybox", 

142 "decl", 

143 "decl noun", 

144 "def", 

145 "dem", 

146 "det", 

147 "determ", 

148 "Deva", 

149 "ending", 

150 "entry", 

151 "form", 

152 "fuhivla", 

153 "gerund", 

154 "gismu", 

155 "hanja", 

156 "hantu", 

157 "hanzi", 

158 "head", 

159 "ideophone", 

160 "idiom", 

161 "inf", 

162 "indef", 

163 "infixed pronoun", 

164 "infixed-pronoun", 

165 "infl", 

166 "inflection", 

167 "initialism", 

168 "int", 

169 "interfix", 

170 "interj", 

171 "interjection", 

172 "jyut", 

173 "latin", 

174 "letter", 

175 "locative", 

176 "lujvo", 

177 "monthbox", 

178 "mutverb", 

179 "name", 

180 "nisba", 

181 "nom", 

182 "noun", 

183 "noun form", 

184 "noun-form", 

185 "noun plural", 

186 "noun-plural", 

187 "nounprefix", 

188 "num", 

189 "number", 

190 "numeral", 

191 "ord", 

192 "ordinal", 

193 "par", 

194 "part", 

195 "part form", 

196 "part-form", 

197 "participle", 

198 "particle", 

199 "past", 

200 "past neg", 

201 "past-neg", 

202 "past participle", 

203 "past-participle", 

204 "perfect participle", 

205 "perfect-participle", 

206 "personal pronoun", 

207 "personal-pronoun", 

208 "pref", 

209 "prefix", 

210 "phrase", 

211 "pinyin", 

212 "plural noun", 

213 "plural-noun", 

214 "pos", 

215 "poss-noun", 

216 "post", 

217 "postp", 

218 "postposition", 

219 "PP", 

220 "pp", 

221 "ppron", 

222 "pred", 

223 "predicative", 

224 "prep", 

225 "prep phrase", 

226 "prep-phrase", 

227 "preposition", 

228 "present participle", 

229 "present-participle", 

230 "pron", 

231 "prondem", 

232 "pronindef", 

233 "pronoun", 

234 "prop", 

235 "proper noun", 

236 "proper-noun", 

237 "proper noun form", 

238 "proper-noun form", 

239 "proper noun-form", 

240 "proper-noun-form", 

241 "prov", 

242 "proverb", 

243 "prpn", 

244 "prpr", 

245 "punctuation mark", 

246 "punctuation-mark", 

247 "regnoun", 

248 "rel", 

249 "rom", 

250 "romanji", 

251 "root", 

252 "sign", 

253 "suff", 

254 "suffix", 

255 "syllable", 

256 "symbol", 

257 "verb", 

258 "verb form", 

259 "verb-form", 

260 "verbal noun", 

261 "verbal-noun", 

262 "verbnec", 

263 "vform", 

264 ] 

265 ) 

266 + r")(-|/|\+|$)" 

267) 
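# For example, this matches bare head-template names such as "head" or
# "Han char", as well as language-prefixed names of the form
# "<lang code>-<part of speech>" like "fi-noun" or "en-verb form",
# optionally followed by "-", "/" or "+".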

268 

269# Head-templates causing problems (like newlines) that can be squashed into 

270# an empty string in the template handler while saving their template 

271# data for later. 

272WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

273 

274FLOATING_TABLE_TEMPLATES: set[str] = { 

275 # az-suffix-form creates a style=floatright div that is otherwise 

276 # deleted; if it is not pre-expanded, we can intercept the template 

277 # so we add this set into do_not_pre_expand, and intercept the 

278 # templates in parse_part_of_speech 

279 "az-suffix-forms", 

280 "az-inf-p", 

281 "kk-suffix-forms", 

282 "ky-suffix-forms", 

283 "tr-inf-p", 

284 "tr-suffix-forms", 

285 "tt-suffix-forms", 

286 "uz-suffix-forms", 

287} 

288# These two should contain template names that should always be 

289# pre-expanded when *first* processing the tree, or not pre-expanded 

290# so that the template are left in place with their identifying 

291# name intact for later filtering. 

292 

293DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

294DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

295 

296# Additional templates to be expanded in the pre-expand phase 

297ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

298 "multitrans", 

299 "multitrans-nowiki", 

300 "trans-top", 

301 "trans-top-also", 

302 "trans-bottom", 

303 "checktrans-top", 

304 "checktrans-bottom", 

305 "col1", 

306 "col2", 

307 "col3", 

308 "col4", 

309 "col5", 

310 "col1-u", 

311 "col2-u", 

312 "col3-u", 

313 "col4-u", 

314 "col5-u", 

315 "check deprecated lang param usage", 

316 "deprecated code", 

317 "ru-verb-alt-ё", 

318 "ru-noun-alt-ё", 

319 "ru-adj-alt-ё", 

320 "ru-proper noun-alt-ё", 

321 "ru-pos-alt-ё", 

322 "ru-alt-ё", 

323 "inflection of", 

324 "no deprecated lang param usage", 

325 "transclude", # these produce sense entries (or other lists) 

326 "tcl", 

327} 

328 

329# Inverse linkage for those that have them 

330linkage_inverses: dict[str, str] = { 

331 # XXX this is not currently used, move to post-processing 

332 "synonyms": "synonyms", 

333 "hypernyms": "hyponyms", 

334 "hyponyms": "hypernyms", 

335 "holonyms": "meronyms", 

336 "meronyms": "holonyms", 

337 "derived": "derived_from", 

338 "coordinate_terms": "coordinate_terms", 

339 "troponyms": "hypernyms", 

340 "antonyms": "antonyms", 

341 "instances": "instance_of", 

342 "related": "related", 

343} 

344 

345# Templates that are used to form panels on pages and that 

346# should be ignored in various positions 

347PANEL_TEMPLATES: set[str] = { 

348 "Character info", 

349 "CJKV", 

350 "French personal pronouns", 

351 "French possessive adjectives", 

352 "French possessive pronouns", 

353 "Han etym", 

354 "Japanese demonstratives", 

355 "Latn-script", 

356 "LDL", 

357 "MW1913Abbr", 

358 "Number-encoding", 

359 "Nuttall", 

360 "Spanish possessive adjectives", 

361 "Spanish possessive pronouns", 

362 "USRegionDisputed", 

363 "Webster 1913", 

364 "ase-rfr", 

365 "attention", 

366 "attn", 

367 "beer", 

368 "broken ref", 

369 "ca-compass", 

370 "character info", 

371 "character info/var", 

372 "checksense", 

373 "compass-fi", 

374 "copyvio suspected", 

375 "delete", 

376 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

377 "etystub", 

378 "examples", 

379 "hu-corr", 

380 "hu-suff-pron", 

381 "interwiktionary", 

382 "ja-kanjitab", 

383 "ko-hanja-search", 

384 "look", 

385 "maintenance box", 

386 "maintenance line", 

387 "mediagenic terms", 

388 "merge", 

389 "missing template", 

390 "morse links", 

391 "move", 

392 "multiple images", 

393 "no inline", 

394 "picdic", 

395 "picdicimg", 

396 "picdiclabel", 

397 "polyominoes", 

398 "predidential nomics", 

399 "punctuation", # This actually gets pre-expanded 

400 "reconstructed", 

401 "request box", 

402 "rf-sound example", 

403 "rfaccents", 

404 "rfap", 

405 "rfaspect", 

406 "rfc", 

407 "rfc-auto", 

408 "rfc-header", 

409 "rfc-level", 

410 "rfc-pron-n", 

411 "rfc-sense", 

412 "rfclarify", 

413 "rfd", 

414 "rfd-redundant", 

415 "rfd-sense", 

416 "rfdate", 

417 "rfdatek", 

418 "rfdef", 

419 "rfe", 

420 "rfe/dowork", 

421 "rfex", 

422 "rfexp", 

423 "rfform", 

424 "rfgender", 

425 "rfi", 

426 "rfinfl", 

427 "rfm", 

428 "rfm-sense", 

429 "rfp", 

430 "rfp-old", 

431 "rfquote", 

432 "rfquote-sense", 

433 "rfquotek", 

434 "rfref", 

435 "rfscript", 

436 "rft2", 

437 "rftaxon", 

438 "rftone", 

439 "rftranslit", 

440 "rfv", 

441 "rfv-etym", 

442 "rfv-pron", 

443 "rfv-quote", 

444 "rfv-sense", 

445 "selfref", 

446 "split", 

447 "stroke order", # XXX consider capturing this? 

448 "stub entry", 

449 "t-needed", 

450 "tbot entry", 

451 "tea room", 

452 "tea room sense", 

453 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

454 "unblock", 

455 "unsupportedpage", 

456 "video frames", 

457 "was wotd", 

458 "wrongtitle", 

459 "zh-forms", 

460 "zh-hanzi-box", 

461 "no entry", 

462} 

463 

464# lookup table for the tags of Chinese dialectal synonyms 

465zh_tag_lookup: dict[str, list[str]] = { 

466 "Formal": ["formal"], 

467 "Written-Standard-Chinese": ["Standard-Chinese"], 

468 "historical or Internet slang": ["historical", "internet-slang"], 

469 "now usually derogatory or offensive": ["offensive", "derogatory"], 

470 "lofty": [], 

471} 

472 

473# Template name prefixes used for language-specific panel templates (i.e., 

474# templates that create side boxes or notice boxes or that should generally 

475# be ignored). 

476PANEL_PREFIXES: set[str] = { 

477 "list:compass points/", 

478 "list:Gregorian calendar months/", 

479 "RQ:", 

480} 

481 

482# Templates used for wikipedia links. 

483wikipedia_templates: set[str] = { 

484 "wikipedia", 

485 "slim-wikipedia", 

486 "w", 

487 "W", 

488 "swp", 

489 "wiki", 

490 "Wikipedia", 

491 "wtorw", 

492} 

493for x in PANEL_PREFIXES & wikipedia_templates:

494 print( 

495 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

496 x 

497 ) 

498 ) 

499 

500# Mapping from a template name (without language prefix) for the main word 

501# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

502# it could validly occur. This is used as just a sanity check to give 

503# warnings about probably incorrect coding in Wiktionary. 

504template_allowed_pos_map: dict[str, list[str]] = { 

505 "abbr": ["abbrev"], 

506 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

507 "plural noun": ["noun", "name"], 

508 "plural-noun": ["noun", "name"], 

509 "proper noun": ["noun", "name"], 

510 "proper-noun": ["name", "noun"], 

511 "prop": ["name", "noun"], 

512 "verb": ["verb", "phrase"], 

513 "gerund": ["verb"], 

514 "particle": ["adv", "particle"], 

515 "adj": ["adj", "adj_noun"], 

516 "pron": ["pron", "noun"], 

517 "name": ["name", "noun"], 

518 "adv": ["adv", "intj", "conj", "particle"], 

519 "phrase": ["phrase", "prep_phrase"], 

520 "noun phrase": ["phrase"], 

521 "ordinal": ["num"], 

522 "number": ["num"], 

523 "pos": ["affix", "name", "num"], 

524 "suffix": ["suffix", "affix"], 

525 "character": ["character"], 

526 "letter": ["character"], 

527 "kanji": ["character"], 

528 "cont": ["abbrev"], 

529 "interj": ["intj"], 

530 "con": ["conj"], 

531 "part": ["particle"], 

532 "prep": ["prep", "postp"], 

533 "postp": ["postp"], 

534 "misspelling": ["noun", "adj", "verb", "adv"], 

535 "part-form": ["verb"], 

536} 
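# For example, a {{fi-noun}} or {{en-noun}} head template (key "noun") is
# expected only under the listed parts-of-speech such as "noun", "abbrev"
# or "name"; other combinations are presumably flagged by the sanity-check
# warning mentioned above.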

537for k, v in template_allowed_pos_map.items(): 

538 for x in v: 

539 if x not in PARTS_OF_SPEECH:

540 print( 

541 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

542 "".format(x, k, v) 

543 ) 

544 assert False 

545 

546 

547# Templates ignored during etymology extraction, i.e., these will not be listed 

548# in the extracted etymology templates. 

549ignored_etymology_templates: list[str] = [ 

550 "...", 

551 "IPAchar", 

552 "ipachar", 

553 "ISBN", 

554 "isValidPageName", 

555 "redlink category", 

556 "deprecated code", 

557 "check deprecated lang param usage", 

558 "para", 

559 "p", 

560 "cite", 

561 "Cite news", 

562 "Cite newsgroup", 

563 "cite paper", 

564 "cite MLLM 1976", 

565 "cite journal", 

566 "cite news/documentation", 

567 "cite paper/documentation", 

568 "cite video game", 

569 "cite video game/documentation", 

570 "cite newsgroup", 

571 "cite newsgroup/documentation", 

572 "cite web/documentation", 

573 "cite news", 

574 "Cite book", 

575 "Cite-book", 

576 "cite book", 

577 "cite web", 

578 "cite-usenet", 

579 "cite-video/documentation", 

580 "Cite-journal", 

581 "rfe", 

582 "catlangname", 

583 "cln", 

584 "langname-lite", 

585 "no deprecated lang param usage", 

586 "mention", 

587 "m", 

588 "m-self", 

589 "link", 

590 "l", 

591 "ll", 

592 "l-self", 

593] 

594# Regexp for matching ignored etymology template names. This adds certain 

595# prefixes to the names listed above. 

596ignored_etymology_templates_re = re.compile( 

597 r"^((cite-|R:|RQ:).*|" 

598 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

599 + r")$" 

600) 
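# For example, this matches any name listed above exactly ("cite web",
# "m", "l", ...) plus any template whose name starts with "cite-", "R:"
# or "RQ:".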

601 

602# Regexp for matching ignored descendants template names. Right now we just 

603# copy the ignored etymology templates 

604ignored_descendants_templates_re = ignored_etymology_templates_re 

605 

606# Set of template names that are used to define usage examples. If the usage 

607# example contains one of these templates, then its type is set to 

608# "example" 

609usex_templates: set[str] = { 

610 "afex", 

611 "affixusex", 

612 "co", # {{collocation}} acts like a example template, specifically for 

613 # pairs of combinations of words that are more common than you'd 

614 # except would be randomly; hlavní#Czech 

615 "coi", 

616 "collocation", 

617 "el-example", 

618 "el-x", 

619 "example", 

620 "examples", 

621 "he-usex", 

622 "he-x", 

623 "hi-usex", 

624 "hi-x", 

625 "ja-usex-inline", 

626 "ja-usex", 

627 "ja-x", 

628 "jbo-example", 

629 "jbo-x", 

630 "km-usex", 

631 "km-x", 

632 "ko-usex", 

633 "ko-x", 

634 "lo-usex", 

635 "lo-x", 

636 "ne-x", 

637 "ne-usex", 

638 "prefixusex", 

639 "ryu-usex", 

640 "ryu-x", 

641 "shn-usex", 

642 "shn-x", 

643 "suffixusex", 

644 "th-usex", 

645 "th-x", 

646 "ur-usex", 

647 "ur-x", 

648 "usex", 

649 "usex-suffix", 

650 "ux", 

651 "uxi", 

652} 

653 

654stop_head_at_these_templates: set[str] = { 

655 "category", 

656 "cat", 

657 "topics", 

658 "catlangname", 

659 "c", 

660 "C", 

661 "top", 

662 "cln", 

663} 

664 

665# Set of template names that are used to define quotation examples. If the 

666# usage example contains one of these templates, then its type is set to 

667# "quotation". 

668quotation_templates: set[str] = { 

669 "collapse-quote", 

670 "quote-av", 

671 "quote-book", 

672 "quote-GYLD", 

673 "quote-hansard", 

674 "quotei", 

675 "quote-journal", 

676 "quotelite", 

677 "quote-mailing list", 

678 "quote-meta", 

679 "quote-newsgroup", 

680 "quote-song", 

681 "quote-text", 

682 "quote", 

683 "quote-us-patent", 

684 "quote-video game", 

685 "quote-web", 

686 "quote-wikipedia", 

687 "wikiquote", 

688 "Wikiquote", 

689} 

690 

691taxonomy_templates = { 

692 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

693 "taxfmt", 

694 "taxlink", 

695 "taxlink2", 

696 "taxlinknew", 

697 "taxlook", 

698} 

699 

700# Template name component to linkage section listing. Integer section means 

701# default section, starting at that argument. 

702# XXX not used anymore, except for the first elements: moved to 

703# template_linkages 

704# template_linkage_mappings: list[list[Union[str, int]]] = [ 

705# ["syn", "synonyms"], 

706# ["synonyms", "synonyms"], 

707# ["ant", "antonyms"], 

708# ["antonyms", "antonyms"], 

709# ["hyp", "hyponyms"], 

710# ["hyponyms", "hyponyms"], 

711# ["der", "derived"], 

712# ["derived terms", "derived"], 

713# ["coordinate terms", "coordinate_terms"], 

714# ["rel", "related"], 

715# ["col", 2], 

716# ] 

717 

718# Template names; these were extracted from template_linkage_mappings, 

719# because the code using template_linkage_mappings was actually not used 

720# (but not removed). 

721template_linkages_to_ignore_in_examples: set[str] = { 

722 "syn", 

723 "synonyms", 

724 "ant", 

725 "antonyms", 

726 "hyp", 

727 "hyponyms", 

728 "der", 

729 "derived terms", 

730 "coordinate terms", 

731 "cot", 

732 "rel", 

733 "col", 

734 "inline alt forms", 

735 "alti", 

736 "comeronyms", 

737 "holonyms", 

738 "holo", 

739 "hypernyms", 

740 "hyper", 

741 "meronyms," 

742 "mero", 

743 "troponyms", 

744 "perfectives", 

745 "pf", 

746 "imperfectives", 

747 "impf", 

748 "syndiff", 

749 "synsee", 

750} 

751 

752# Maps template name used in a word sense to a linkage field that it adds. 

753sense_linkage_templates: dict[str, str] = { 

754 "syn": "synonyms", 

755 "synonyms": "synonyms", 

756 "synsee": "synonyms", 

757 "syndiff": "synonyms", 

758 "hyp": "hyponyms", 

759 "hyponyms": "hyponyms", 

760 "ant": "antonyms", 

761 "antonyms": "antonyms", 

762 "alti": "related", 

763 "inline alt forms": "related", 

764 "coordinate terms": "coordinate_terms", 

765 "cot": "coordinate_terms", 

766 "comeronyms": "related", 

767 "holonyms": "holonyms", 

768 "holo": "holonyms", 

769 "hypernyms": "hypernyms", 

770 "hyper": "hypernyms", 

771 "meronyms": "meronyms", 

772 "mero": "meronyms", 

773 "troponyms": "troponyms", 

774 "perfectives": "related", 

775 "pf": "related", 

776 "imperfectives": "related", 

777 "impf": "related", 

778} 

779 

780sense_linkage_templates_tags: dict[str, list[str]] = { 

781 "alti": ["alternative"], 

782 "inline alt forms": ["alternative"], 

783 "comeronyms": ["comeronym"], 

784 "perfectives": ["perfective"], 

785 "pf": ["perfective"], 

786 "imperfectives": ["imperfective"], 

787 "impf": ["imperfective"], 

788} 
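# Taken together, these two tables route sense-level linkage templates
# into fields of the sense data: e.g. {{syn|...}} items go into
# "synonyms", while {{pf|...}} items go into "related" and additionally
# receive the "perfective" tag.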

789 

790 

791def decode_html_entities(v: Union[str, int]) -> str: 

792 """Decodes HTML entities from a value, converting them to the respective 

793 Unicode characters/strings.""" 

794 if isinstance(v, int): 

795 # I changed this to return str(v) instead of v = str(v), 

796 # but there might have been the intention to have more logic 

797 # here. html.unescape would not do anything special with an integer, 

798 # it needs html escape symbols (&xx;). 

799 return str(v) 

800 return html.unescape(v) 
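# For example: decode_html_entities("R&amp;D") == "R&D", and an integer
# argument is simply stringified, so decode_html_entities(42) == "42".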

801 

802 

803def parse_sense_linkage( 

804 wxr: WiktextractContext, 

805 data: SenseData, 

806 name: str, 

807 ht: TemplateArgs, 

808 pos: str, 

809) -> None: 

810 """Parses a linkage (synonym, etc) specified in a word sense.""" 

811 assert isinstance(wxr, WiktextractContext) 

812 assert isinstance(data, dict) 

813 assert isinstance(name, str) 

814 assert isinstance(ht, dict) 

815 field = sense_linkage_templates[name] 

816 field_tags = sense_linkage_templates_tags.get(name, []) 

817 for i in range(2, 20): 

818 w = ht.get(i) or "" 

819 w = clean_node(wxr, data, w) 

820 is_thesaurus = False 

821 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

822 if w.startswith(alias):

823 is_thesaurus = True 

824 w = w[len(alias) :] 

825 if w != wxr.wtp.title: 

826 from ...thesaurus import search_thesaurus 

827 

828 lang_code = clean_node(wxr, None, ht.get(1, "")) 

829 for t_data in search_thesaurus( 

830 wxr.thesaurus_db_conn, w, lang_code, pos, field 

831 ): 

832 l_data = { 

833 "word": t_data.term, 

834 "source": "Thesaurus:" + w, 

835 } 

836 if len(t_data.tags) > 0: 

837 l_data["tags"] = t_data.tags 

838 if len(t_data.raw_tags) > 0: 

839 l_data["raw_tags"] = t_data.raw_tags 

840 data_append(data, field, l_data) 

841 break 

842 if not w: 

843 break 

844 if is_thesaurus:

845 continue 

846 tags: list[str] = [] 

847 topics: list[str] = [] 

848 english: Optional[str] = None 

849 # Try to find qualifiers for this synonym 

850 q = ht.get("q{}".format(i - 1)) 

851 if q: 

852 cls = classify_desc(q) 

853 if cls == "tags": 

854 tagsets1, topics1 = decode_tags(q) 

855 for ts in tagsets1: 

856 tags.extend(ts) 

857 topics.extend(topics1) 

858 elif cls == "english":

859 if english:

860 english += "; " + q 

861 else: 

862 english = q 

863 # Try to find English translation for this synonym 

864 t = ht.get("t{}".format(i - 1)) 

865 if t:

866 if english: 

867 english += "; " + t 

868 else: 

869 english = t 

870 

871 # See if the linkage contains a parenthesized alt 

872 alt = None 

873 m = re.search(r"\(([^)]+)\)$", w) 

874 if m:

875 w = w[: m.start()].strip() 

876 alt = m.group(1) 

877 

878 dt = {"word": w} 

879 if field_tags:

880 data_extend(dt, "tags", field_tags) 

881 if tags: 

882 data_extend(dt, "tags", tags) 

883 if topics:

884 data_extend(dt, "topics", topics) 

885 if english: 

886 dt["english"] = english 

887 if alt:

888 dt["alt"] = alt 

889 data_append(data, field, dt) 

890 

891 

892EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

893example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

894captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 
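# Usage example lines typically look like "<original text> ― <translation>",
# separated by a horizontal bar or em dash; the captured variant keeps the
# separators in the split result so the pieces can be re-joined in
# synch_splits_with_args() below when a split turns out to be spurious.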

895 

896 

897def synch_splits_with_args( 

898 line: str, targs: TemplateArgs 

899) -> Optional[list[str]]: 

900 """If it looks like there's something weird with how a line of example 

901 text has been split, this function will do the splitting after counting 

902 occurrences of the splitting regex inside the two main template arguments 

903 containing the string data for the original language example and the 

904 English translations. 

905 """ 

906 # Previously, we split without capturing groups, but here we want to 

907 # keep the original splitting hyphen regex intact. 

908 fparts = captured_splitters_re.split(line) 

909 new_parts = [] 

910 # ["First", " – ", "second", " – ", "third..."] from OL argument 

911 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 

912 new_parts.append("".join(fparts[:first])) 

913 # Translation argument 

914 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

915 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

916 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

917 new_parts.append("".join(fparts[first + 1 : second])) 

918 

919 if all(new_parts): # no empty strings from the above spaghetti 

920 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

921 return new_parts 

922 else: 

923 return None 

924 
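# Worked example: for line = "a ― b ― c" with template args {2: "a ― b",
# 3: "c"}, the original-language argument itself contains one separator,
# so the first two pieces are re-joined and the result is ["a ― b", "c"]
# rather than a naive three-way split.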

925 

926QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

927QUALIFIERS_RE = re.compile(QUALIFIERS) 

928# (...): ... or (...(...)...): ... 
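# For example, QUALIFIERS_RE matches a leading parenthesized qualifier
# such as "(colloquial, slang): " or "(figurative (rare)): ", capturing
# the text inside the outer parentheses; one level of nested parentheses
# is allowed.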

929 

930 

931def parse_language( 

932 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

933) -> list[WordData]: 

934 """Iterates over the text of the page, returning words (parts-of-speech) 

935 defined on the page one at a time. (Individual word senses for the 

936 same part-of-speech are typically encoded in the same entry.)""" 

937 # imported here to avoid circular import 

938 from .pronunciation import parse_pronunciation 

939 

940 assert isinstance(wxr, WiktextractContext) 

941 assert isinstance(langnode, WikiNode) 

942 assert isinstance(language, str) 

943 assert isinstance(lang_code, str) 

944 # print("parse_language", language) 

945 

946 is_reconstruction = False 

947 word: str = wxr.wtp.title # type: ignore[assignment] 

948 unsupported_prefix = "Unsupported titles/" 

949 if word.startswith(unsupported_prefix): 

950 w = word[len(unsupported_prefix) :] 

951 if w in unsupported_title_map:

952 word = unsupported_title_map[w] 

953 else: 

954 wxr.wtp.error( 

955 "Unimplemented unsupported title: {}".format(word), 

956 sortid="page/870", 

957 ) 

958 word = w 

959 elif word.startswith("Reconstruction:"):

960 word = word[word.find("/") + 1 :] 

961 is_reconstruction = True 

962 

963 base_data: WordData = { 

964 "word": word, 

965 "lang": language, 

966 "lang_code": lang_code, 

967 } 

968 if is_reconstruction:

969 data_append(base_data, "tags", "reconstruction") 

970 sense_data: SenseData = {} 

971 pos_data: WordData = {} # For a current part-of-speech 

972 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

973 etym_data: WordData = {} # For one etymology 

974 pos_datas: list[SenseData] = [] 

975 level_four_datas: list[WordData] = [] 

976 etym_datas: list[WordData] = [] 

977 page_datas: list[WordData] = [] 

978 have_etym = False 

979 inside_level_four = False # This is for checking if the etymology section 

980 # or article has a Pronunciation section, for Chinese mostly; because 

981 # Chinese articles can have three level three sections (two etymology 

982 # sections and pronunciation sections) one after another, we need a kludge 

983 # to better keep track of whether we're in a normal "etym" or inside a 

984 # "level four" (which is what we've turned the level three Pron sections 

985 # into in fix_subtitle_hierarchy()); all other sections are demoted by 

986 # a step. 

987 stack: list[str] = [] # names of items on the "stack" 

988 

989 def merge_base(data: WordData, base: WordData) -> None: 

990 for k, v in base.items(): 

991 # Copy the value to ensure that we don't share lists or 

992 # dicts between structures (even nested ones). 

993 v = copy.deepcopy(v) 

994 if k not in data: 

995 # The list was copied above, so this will not create shared ref 

996 data[k] = v # type: ignore[literal-required] 

997 continue 

998 if data[k] == v: # type: ignore[literal-required] 

999 continue 

1000 if (

1001 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

1002 or isinstance( 

1003 v, 

1004 (list, tuple), # Should this be "and"? 

1005 ) 

1006 ): 

1007 data[k] = list(data[k]) + list(v) # type: ignore 

1008 elif data[k] != v: # type: ignore[literal-required] 

1009 wxr.wtp.warning( 

1010 "conflicting values for {} in merge_base: " 

1011 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1012 sortid="page/904", 

1013 ) 

1014 

1015 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1016 """Remove unnecessary keys from dict values 

1017 in a list comprehension...""" 

1018 if key in pron: 

1019 pron.pop(key) # type: ignore 

1020 return pron 

1021 

1022 # If the result has sounds, eliminate sounds that have a prefix that 

1023 # does not match "word" or one of "forms" 

1024 if "sounds" in data and "word" in data: 

1025 accepted = [data["word"]] 

1026 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1027 data["sounds"] = list( 

1028 s 

1029 for s in data["sounds"] 

1030 if "form" not in s or s["form"] in accepted 

1031 ) 

1032 # If the result has sounds, eliminate sounds that have a pos that 

1033 # does not match "pos" 

1034 if "sounds" in data and "pos" in data: 

1035 data["sounds"] = list( 

1036 complementary_pop(s, "pos") 

1037 for s in data["sounds"] 

1038 # "pos" is not a field of SoundData, correctly, so we're 

1039 # removing it here. It's a kludge on a kludge on a kludge. 

1040 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

1041 ) 

1042 

1043 def push_sense() -> bool: 

1044 """Starts collecting data for a new word sense. This returns True 

1045 if a sense was added.""" 

1046 nonlocal sense_data 

1047 tags = sense_data.get("tags", ()) 

1048 if ( 

1049 not sense_data.get("glosses") 

1050 and "translation-hub" not in tags 

1051 and "no-gloss" not in tags 

1052 ): 

1053 return False 

1054 

1055 if (

1056 ( 

1057 "participle" in sense_data.get("tags", ()) 

1058 or "infinitive" in sense_data.get("tags", ()) 

1059 ) 

1060 and "alt_of" not in sense_data 

1061 and "form_of" not in sense_data 

1062 and "etymology_text" in etym_data 

1063 and etym_data["etymology_text"] != "" 

1064 ): 

1065 etym = etym_data["etymology_text"] 

1066 etym = etym.split(". ")[0] 

1067 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1068 if ret is not None: 

1069 tags, lst = ret 

1070 assert isinstance(lst, (list, tuple)) 

1071 if "form-of" in tags: 

1072 data_extend(sense_data, "form_of", lst) 

1073 data_extend(sense_data, "tags", tags) 

1074 elif "alt-of" in tags: 

1075 data_extend(sense_data, "alt_of", lst) 

1076 data_extend(sense_data, "tags", tags) 

1077 

1078 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(

1079 "tags", () 

1080 ): 

1081 data_append(sense_data, "tags", "no-gloss") 

1082 

1083 pos_datas.append(sense_data) 

1084 sense_data = {} 

1085 return True 

1086 

1087 def push_pos() -> None: 

1088 """Starts collecting data for a new part-of-speech.""" 

1089 nonlocal pos_data 

1090 nonlocal pos_datas 

1091 push_sense() 

1092 if wxr.wtp.subsection: 

1093 data: WordData = {"senses": pos_datas} 

1094 merge_base(data, pos_data) 

1095 level_four_datas.append(data) 

1096 pos_data = {} 

1097 pos_datas = [] 

1098 wxr.wtp.start_subsection(None) 

1099 

1100 def push_level_four_section() -> None: 

1101 """Starts collecting data for a new level four sections, which 

1102 is usually virtual and empty, unless the article has Chinese 

1103 'Pronunciation' sections that are etymology-section-like but 

1104 under etymology, and at the same level in the source. We modify 

1105 the source to demote Pronunciation sections like that to level 

1106 4, and other sections one step lower.""" 

1107 nonlocal level_four_data 

1108 nonlocal level_four_datas 

1109 nonlocal etym_datas 

1110 push_pos() 

1111 # print(f"======\n{etym_data=}") 

1112 # print(f"======\n{etym_datas=}") 

1113 # print(f"======\n{level_four_data=}") 

1114 # print(f"======\n{level_four_datas=}") 

1115 for data in level_four_datas: 

1116 merge_base(data, level_four_data) 

1117 etym_datas.append(data) 

1118 for data in etym_datas: 

1119 merge_base(data, etym_data) 

1120 page_datas.append(data) 

1121 level_four_data = {} 

1122 level_four_datas = [] 

1123 etym_datas = [] 

1124 

1125 def push_etym() -> None: 

1126 """Starts collecting data for a new etymology.""" 

1127 nonlocal etym_data 

1128 nonlocal etym_datas 

1129 nonlocal have_etym 

1130 nonlocal inside_level_four 

1131 have_etym = True 

1132 push_level_four_section() 

1133 inside_level_four = False 

1134 etym_data = {} 

1135 

1136 def select_data() -> WordData: 

1137 """Selects where to store data (pos or etym) based on whether we 

1138 are inside a pos (part-of-speech).""" 

1139 # print(f"{wxr.wtp.subsection=}") 

1140 # print(f"{stack=}") 

1141 if wxr.wtp.subsection is not None: 

1142 return pos_data 

1143 if stack[-1] == language: 

1144 return base_data 

1145 if inside_level_four is False: 

1146 return etym_data 

1147 return level_four_data 

1148 

1149 term_label_templates: list[TemplateData] = [] 

1150 

1151 def head_post_template_fn( 

1152 name: str, ht: TemplateArgs, expansion: str 

1153 ) -> Optional[str]: 

1154 """Handles special templates in the head section of a word. Head 

1155 section is the text after part-of-speech subtitle and before word 

1156 sense list. Typically it generates the bold line for the word, but 

1157 may also contain other useful information that often ends up in 

1158 side boxes. We want to capture some of that additional information.""" 

1159 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1160 if is_panel_template(wxr, name):

1161 # Completely ignore these templates (not even recorded in 

1162 # head_templates) 

1163 return "" 

1164 if name == "head": 

1165 # XXX are these also captured in forms? Should this special case 

1166 # be removed? 

1167 t = ht.get(2, "") 

1168 if t == "pinyin":

1169 data_append(pos_data, "tags", "Pinyin") 

1170 elif t == "romanization":

1171 data_append(pos_data, "tags", "romanization") 

1172 if ( 

1173 HEAD_TAG_RE.search(name) is not None 

1174 or name in WORD_LEVEL_HEAD_TEMPLATES 

1175 ): 

1176 args_ht = clean_template_args(wxr, ht) 

1177 cleaned_expansion = clean_node(wxr, None, expansion) 

1178 dt: TemplateData = { 

1179 "name": name, 

1180 "args": args_ht, 

1181 "expansion": cleaned_expansion, 

1182 } 

1183 data_append(pos_data, "head_templates", dt) 

1184 if name in WORD_LEVEL_HEAD_TEMPLATES: 

1185 term_label_templates.append(dt) 

1186 # Squash these, their tags are applied to the whole word, 

1187 # and some cause problems like "term-label" 

1188 return "" 

1189 

1190 # The following are both captured in head_templates and parsed 

1191 # separately 

1192 

1193 if name in wikipedia_templates: 

1194 # Note: various places expect to have content from wikipedia 

1195 # templates, so cannot convert this to empty 

1196 parse_wikipedia_template(wxr, pos_data, ht) 

1197 return None 

1198 

1199 if name == "number box":

1200 # XXX extract numeric value? 

1201 return "" 

1202 if name == "enum": 

1203 # XXX extract? 

1204 return "" 

1205 if name == "cardinalbox":

1206 # XXX extract similar to enum? 

1207 # XXX this can also occur in top-level under language 

1208 return "" 

1209 if name == "Han simplified forms":

1210 # XXX extract? 

1211 return "" 

1212 # if name == "ja-kanji forms": 

1213 # # XXX extract? 

1214 # return "" 

1215 # if name == "vi-readings": 

1216 # # XXX extract? 

1217 # return "" 

1218 # if name == "ja-kanji": 

1219 # # XXX extract? 

1220 # return "" 

1221 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":

1222 # XXX extract? 

1223 return "" 

1224 

1225 return None 

1226 

1227 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1228 """Parses the subsection for a part-of-speech under a language on 

1229 a page.""" 

1230 assert isinstance(posnode, WikiNode) 

1231 assert isinstance(pos, str) 

1232 # print("parse_part_of_speech", pos) 

1233 pos_data["pos"] = pos 

1234 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1235 lists: list[list[WikiNode]] = [[]] # list of lists 

1236 first_para = True 

1237 first_head_tmplt = True 

1238 collecting_head = True 

1239 start_of_paragraph = True 

1240 

1241 # XXX extract templates from posnode with recursively_extract 

1242 # that break stuff, like ja-kanji or az-suffix-form. 

1243 # Do the extraction with a list of template names, combined from 

1244 # different lists, then separate them out into different lists 

1245 # that are handled at different points of the POS section. 

1246 # First, extract az-suffix-form, put it in `inflection`, 

1247 # and parse `inflection`'s content when appropriate later. 

1248 # The contents of az-suffix-form (and ja-kanji) that generate 

1249 # divs with "floatright" in their style gets deleted by 

1250 # clean_value, so templates that slip through from here won't 

1251 # break anything. 

1252 # XXX bookmark 

1253 # print("===================") 

1254 # print(posnode.children) 

1255 

1256 floaters, poschildren = recursively_extract( 

1257 posnode.children, 

1258 lambda x: ( 

1259 isinstance(x, WikiNode) 

1260 and ( 

1261 ( 

1262 x.kind == NodeKind.TEMPLATE 

1263 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES 

1264 ) 

1265 or ( 

1266 x.kind == NodeKind.LINK 

1267 # Need to check for stringiness because some links are 

1268 # broken; for example, if a template is missing an 

1269 # argument, a link might look like `[[{{{1}}}...]]` 

1270 and isinstance(x.largs[0][0], str) 

1271 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] 

1272 ) 

1273 ) 

1274 ), 

1275 ) 

1276 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1277 tempnode.largs = [["Inflection"]] 

1278 tempnode.children = floaters 

1279 parse_inflection(tempnode, "Floating Div", pos) 

1280 # print(poschildren) 

1281 # XXX new above 

1282 

1283 if not poschildren:

1284 if not floaters: 

1285 wxr.wtp.debug( 

1286 "PoS section without contents", 

1287 sortid="en/page/1051/20230612", 

1288 ) 

1289 else: 

1290 wxr.wtp.debug( 

1291 "PoS section without contents except for a floating table", 

1292 sortid="en/page/1056/20230612", 

1293 ) 

1294 return 

1295 

1296 for node in poschildren: 

1297 if isinstance(node, str): 

1298 for m in re.finditer(r"\n+|[^\n]+", node): 

1299 p = m.group(0) 

1300 if p.startswith("\n\n") and pre: 

1301 first_para = False 

1302 start_of_paragraph = True 

1303 break 

1304 if p and collecting_head: 

1305 pre[-1].append(p) 

1306 continue 

1307 assert isinstance(node, WikiNode) 

1308 kind = node.kind 

1309 if kind == NodeKind.LIST: 

1310 lists[-1].append(node) 

1311 collecting_head = False 

1312 start_of_paragraph = True 

1313 continue 

1314 elif kind in LEVEL_KINDS: 

1315 # Stop parsing section if encountering any kind of 

1316 # level header (like ===Noun=== or ====Further Reading====). 

1317 # At a quick glance, this should be the default behavior, 

1318 # but if some kinds of source articles have sub-sub-sections 

1319 # that should be parsed, XXX it should be handled by changing 

1320 # this break. 

1321 break 

1322 elif collecting_head and kind == NodeKind.LINK: 

1323 # We might collect relevant links as they are often pictures 

1324 # relating to the word 

1325 if len(node.largs[0]) >= 1 and isinstance(

1326 node.largs[0][0], str 

1327 ): 

1328 if node.largs[0][0].startswith(

1329 ns_title_prefix_tuple(wxr, "Category") 

1330 ): 

1331 # [[Category:...]] 

1332 # We're at the end of the file, probably, so stop 

1333 # here. Otherwise the head will get garbage. 

1334 break 

1335 if node.largs[0][0].startswith(

1336 ns_title_prefix_tuple(wxr, "File") 

1337 ): 

1338 # Skips file links 

1339 continue 

1340 start_of_paragraph = False 

1341 pre[-1].extend(node.largs[-1]) 

1342 elif kind == NodeKind.HTML: 

1343 if node.sarg == "br": 

1344 if pre[-1]:

1345 pre.append([]) # Switch to next head 

1346 lists.append([]) # Lists parallels pre 

1347 collecting_head = True 

1348 start_of_paragraph = True 

1349 elif collecting_head and node.sarg not in (

1350 "gallery", 

1351 "ref", 

1352 "cite", 

1353 "caption", 

1354 ): 

1355 start_of_paragraph = False 

1356 pre[-1].append(node) 

1357 else: 

1358 start_of_paragraph = False 

1359 elif isinstance(node, TemplateNode): 

1360 # XXX Insert code here that disambiguates between 

1361 # templates that generate word heads and templates 

1362 # that don't. 

1363 # There's head_tag_re that seems like a regex meant 

1364 # to identify head templates. Too bad it's None. 

1365 

1366 # ignore {{category}}, {{cat}}... etc. 

1367 if node.template_name in stop_head_at_these_templates: 

1368 # we've reached a template that should be at the end, 

1369 continue 

1370 

1371 # skip these templates; panel_templates is already used 

1372 # to skip certain templates else, but it also applies to 

1373 # head parsing quite well. 

1374 # node.largs[0][0] should always be str, but can't type-check 

1375 # that. 

1376 if is_panel_template(wxr, node.template_name): 

1377 continue 

1378 # skip these templates 

1379 # if node.largs[0][0] in skip_these_templates_in_head: 

1380 # first_head_tmplt = False # no first_head_tmplt at all 

1381 # start_of_paragraph = False 

1382 # continue 

1383 

1384 if first_head_tmplt and pre[-1]: 

1385 first_head_tmplt = False 

1386 start_of_paragraph = False 

1387 pre[-1].append(node) 

1388 elif pre[-1] and start_of_paragraph: 

1389 pre.append([]) # Switch to the next head 

1390 lists.append([]) # lists parallel pre 

1391 collecting_head = True 

1392 start_of_paragraph = False 

1393 pre[-1].append(node) 

1394 else: 

1395 pre[-1].append(node) 

1396 elif first_para: 

1397 start_of_paragraph = False 

1398 if collecting_head:

1399 pre[-1].append(node) 

1400 # XXX use template_fn in clean_node to check that the head macro 

1401 # is compatible with the current part-of-speech and generate warning 

1402 # if not. Use template_allowed_pos_map. 

1403 

1404 # Clean up empty pairs, and fix messes with extra newlines that 

1405 # separate templates that are followed by lists (wiktextract issue #314) 

1406 

1407 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1408 cleaned_lists: list[list[WikiNode]] = [] 

1409 pairless_pre_index = None 

1410 

1411 for pre1, ls in zip(pre, lists): 

1412 if pre1 and not ls: 

1413 pairless_pre_index = len(cleaned_pre) 

1414 if not pre1 and not ls:

1415 # skip [] + [] 

1416 continue 

1417 if not ls and all( 

1418 (isinstance(x, str) and not x.strip()) for x in pre1 

1419 ): 

1420 # skip ["\n", " "] + [] 

1421 continue 

1422 if ls and not pre1: 

1423 if pairless_pre_index is not None:

1424 cleaned_lists[pairless_pre_index] = ls 

1425 pairless_pre_index = None 

1426 continue 

1427 cleaned_pre.append(pre1) 

1428 cleaned_lists.append(ls) 

1429 

1430 pre = cleaned_pre 

1431 lists = cleaned_lists 

1432 

1433 there_are_many_heads = len(pre) > 1 

1434 header_tags: list[str] = [] 

1435 header_topics: list[str] = [] 

1436 previous_head_had_list = False 

1437 

1438 if not any(g for g in lists): 

1439 process_gloss_without_list( 

1440 poschildren, pos, pos_data, header_tags, header_topics 

1441 ) 

1442 else: 

1443 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1444 # if len(ls) == 0: 

1445 # # don't have gloss list 

1446 # # XXX add code here to filter out 'garbage', like text 

1447 # # that isn't a head template or head. 

1448 # continue 

1449 

1450 if all(not sl for sl in lists[i:]): 

1451 if i == 0:

1452 if isinstance(node, str): 

1453 wxr.wtp.debug( 

1454 "first head without list of senses," 

1455 "string: '{}[...]', {}/{}".format( 

1456 node[:20], word, language 

1457 ), 

1458 sortid="page/1689/20221215", 

1459 ) 

1460 if isinstance(node, WikiNode): 

1461 if node.largs and node.largs[0][0] in [ 

1462 "Han char", 

1463 ]: 

1464 # just ignore these templates 

1465 pass 

1466 else: 

1467 wxr.wtp.debug( 

1468 "first head without " 

1469 "list of senses, " 

1470 "template node " 

1471 "{}, {}/{}".format( 

1472 node.largs, word, language 

1473 ), 

1474 sortid="page/1694/20221215", 

1475 ) 

1476 else: 

1477 wxr.wtp.debug( 

1478 "first head without list of senses, " 

1479 "{}/{}".format(word, language), 

1480 sortid="page/1700/20221215", 

1481 ) 

1482 # no break here so that the first head always 

1483 # gets processed. 

1484 else: 

1485 if isinstance(node, str):

1486 wxr.wtp.debug( 

1487 "later head without list of senses," 

1488 "string: '{}[...]', {}/{}".format( 

1489 node[:20], word, language 

1490 ), 

1491 sortid="page/1708/20221215", 

1492 ) 

1493 if isinstance(node, WikiNode):

1494 wxr.wtp.debug( 

1495 "later head without list of senses," 

1496 "template node " 

1497 "{}, {}/{}".format( 

1498 node.sarg if node.sarg else node.largs, 

1499 word, 

1500 language, 

1501 ), 

1502 sortid="page/1713/20221215", 

1503 ) 

1504 else: 

1505 wxr.wtp.debug( 

1506 "later head without list of senses, " 

1507 "{}/{}".format(word, language), 

1508 sortid="page/1719/20221215", 

1509 ) 

1510 break 

1511 head_group = i + 1 if there_are_many_heads else None 

1512 # print("parse_part_of_speech: {}: {}: pre={}" 

1513 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1514 

1515 if previous_head_had_list: 

1516 # We use a boolean flag here because we want to be able to 

1517 # let the header_tags data pass through after the loop 

1518 # is over without accidentally emptying it, if there are 

1519 # no pos_datas and we need a dummy data. 

1520 header_tags.clear() 

1521 header_topics.clear() 

1522 

1523 process_gloss_header( 

1524 pre1, pos, head_group, pos_data, header_tags, header_topics 

1525 ) 

1526 for ln in ls: 

1527 # Parse each list associated with this head. 

1528 for node in ln.children: 

1529 # Parse nodes in l.children recursively. 

1530 # The recursion function uses push_sense() to 

1531 # add stuff into pos_data, and returns True or 

1532 # False if something is added, which bubbles upward. 

1533 # If the bubble is "True", then higher levels of 

1534 # the recursion will not push_sense(), because 

1535 # the data is already pushed into a sub-gloss 

1536 # downstream, unless the higher level has examples 

1537 # that need to be put somewhere. 

1538 common_data: SenseData = { 

1539 "tags": list(header_tags), 

1540 "topics": list(header_topics), 

1541 } 

1542 if head_group: 

1543 common_data["head_nr"] = head_group 

1544 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1545 

1546 if len(ls) > 0: 

1547 previous_head_had_list = True 

1548 else: 

1549 previous_head_had_list = False 

1550 

1551 # If there are no senses extracted, add a dummy sense. We want to 

1552 # keep tags extracted from the head for the dummy sense. 

1553 push_sense() # Make sure unfinished data pushed, and start clean sense 

1554 if len(pos_datas) == 0: 

1555 data_extend(sense_data, "tags", header_tags) 

1556 data_extend(sense_data, "topics", header_topics) 

1557 data_append(sense_data, "tags", "no-gloss") 

1558 push_sense() 

1559 

1560 def process_gloss_header( 

1561 header_nodes: list[Union[WikiNode, str]], 

1562 pos_type: str, 

1563 header_group: Optional[int], 

1564 pos_data: WordData, 

1565 header_tags: list[str], 

1566 header_topics: list[str], 

1567 ) -> None: 

1568 ruby = [] 

1569 links: list[str] = [] 

1570 

1571 # process template parse nodes here 

1572 new_nodes = [] 

1573 info_template_data = [] 

1574 for node in header_nodes: 

1575 # print(f"{node=}") 

1576 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1577 if info_data or info_out: 

1578 if info_data:

1579 info_template_data.append(info_data) 

1580 if info_out: # including just the original node

1581 new_nodes.append(info_out) 

1582 else: 

1583 new_nodes.append(node) 

1584 header_nodes = new_nodes 

1585 

1586 if info_template_data: 

1587 if "info_templates" not in pos_data: 1587 ↛ 1590line 1587 didn't jump to line 1590 because the condition on line 1587 was always true

1588 pos_data["info_templates"] = info_template_data 

1589 else: 

1590 pos_data["info_templates"].extend(info_template_data) 

1591 

1592 if not word.isalnum(): 

1593 # `-` is kosher, add more of these if needed. 

1594 if word.replace("-", "").isalnum(): 

1595 pass 

1596 else: 

1597 # if the word contains non-letter or -number characters, it 

1598 # might have something that messes with split-at-semi-comma; we 

1599 # collect links so that we can skip splitting them. 

1600 exp = wxr.wtp.parse( 

1601 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1602 ) 

1603 link_nodes, _ = recursively_extract( 

1604 exp.children, 

1605 lambda x: isinstance(x, WikiNode) 

1606 and x.kind == NodeKind.LINK, 

1607 ) 

1608 for ln in link_nodes: 

1609 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1610 if not ltext.isalnum(): 

1611 links.append(ltext) 

1612 if word not in links:

1613 links.append(word) 

1614 if lang_code == "ja": 

1615 exp = wxr.wtp.parse( 

1616 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1617 ) 

1618 rub, _ = recursively_extract( 

1619 exp.children, 

1620 lambda x: isinstance(x, WikiNode) 

1621 and x.kind == NodeKind.HTML 

1622 and x.sarg == "ruby", 

1623 ) 

1624 if rub is not None:

1625 for r in rub:

1626 if TYPE_CHECKING: 

1627 # we know the lambda above in recursively_extract 

1628 # returns only WikiNodes in rub 

1629 assert isinstance(r, WikiNode) 

1630 rt = parse_ruby(wxr, r) 

1631 if rt is not None: 

1632 ruby.append(rt) 

1633 header_text = clean_node( 

1634 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1635 ) 

1636 

1637 term_label_tags: list[str] = [] 

1638 term_label_topics: list[str] = [] 

1639 if len(term_label_templates) > 0: 

1640 # parse term label templates; if there are other similar kinds 

1641 # of templates in headers that you want to squash and apply as 

1642 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES 

1643 for templ_data in term_label_templates: 

1644 # print(templ_data) 

1645 expan = templ_data.get("expansion", "").strip("().,; ") 

1646 if not expan:

1647 continue 

1648 tlb_tagsets, tlb_topics = decode_tags(expan) 

1649 for tlb_tags in tlb_tagsets: 

1650 if len(tlb_tags) > 0 and not any( 

1651 t.startswith("error-") for t in tlb_tags 

1652 ): 

1653 term_label_tags.extend(tlb_tags) 

1654 term_label_topics.extend(tlb_topics) 

1655 # print(f"{tlb_tagsets=}, {tlb_topicsets=}") 

1656 

1657 header_text = re.sub(r"\s+", " ", header_text) 

1658 # print(f"{header_text=}") 

1659 parse_word_head( 

1660 wxr, 

1661 pos_type, 

1662 header_text, 

1663 pos_data, 

1664 is_reconstruction, 

1665 header_group, 

1666 ruby=ruby, 

1667 links=links, 

1668 ) 

1669 if "tags" in pos_data: 

1670 # pos_data can get "tags" data from some source; type-checkers 

1671 # don't like it, so let's ignore it. 

1672 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1673 del pos_data["tags"] # type: ignore[typeddict-item] 

1674 if len(term_label_tags) > 0: 

1675 header_tags.extend(term_label_tags) 

1676 if len(term_label_topics) > 0: 

1677 header_topics.extend(term_label_topics) 

1678 

1679 def process_gloss_without_list( 

1680 nodes: list[Union[WikiNode, str]], 

1681 pos_type: str, 

1682 pos_data: WordData, 

1683 header_tags: list[str], 

1684 header_topics: list[str], 

1685 ) -> None: 

1686 # gloss text might not be inside a list 

1687 header_nodes: list[Union[str, WikiNode]] = [] 

1688 gloss_nodes: list[Union[str, WikiNode]] = [] 

1689 for node in strip_nodes(nodes): 

1690 if isinstance(node, WikiNode): 

1691 if isinstance(node, TemplateNode): 

1692 if node.template_name in ( 

1693 "zh-see", 

1694 "ja-see", 

1695 "ja-see-kango", 

1696 ): 

1697 continue # soft redirect 

1698 elif ( 

1699 node.template_name == "head" 

1700 or node.template_name.startswith(f"{lang_code}-") 

1701 ): 

1702 header_nodes.append(node) 

1703 continue 

1704 elif node.kind in LEVEL_KINDS: # following nodes are not gloss

1705 break 

1706 gloss_nodes.append(node) 

1707 

1708 if len(header_nodes) > 0: 

1709 process_gloss_header( 

1710 header_nodes, 

1711 pos_type, 

1712 None, 

1713 pos_data, 

1714 header_tags, 

1715 header_topics, 

1716 ) 

1717 if len(gloss_nodes) > 0: 

1718 process_gloss_contents( 

1719 gloss_nodes, 

1720 pos_type, 

1721 {"tags": list(header_tags), "topics": list(header_topics)}, 

1722 ) 

1723 

1724 def parse_sense_node( 

1725 node: Union[str, WikiNode], # never receives str 

1726 sense_base: SenseData, 

1727 pos: str, 

1728 ) -> bool: 

1729 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1730 Uses push_sense() to attempt adding data to pos_data in the scope 

1731 of parse_language() when it reaches deep in the recursion. push_sense() 

1732 returns True if it succeeds, and that is bubbled up the stack; if 

1733 a sense was added downstream, the higher levels (whose shared data 

1734 was already added by a subsense) do not push_sense(), unless they 

1735 have examples that need to be put somewhere. 

1736 """ 

1737 assert isinstance(sense_base, dict)  # Data added to every sense deeper in the recursion 

1738 if not isinstance(node, WikiNode): 1738 ↛ 1740line 1738 didn't jump to line 1740 because the condition on line 1738 was never true

1739 # This doesn't seem to ever happen in practice. 

1740 wxr.wtp.debug( 

1741 "{}: parse_sense_node called with" 

1742 "something that isn't a WikiNode".format(pos), 

1743 sortid="page/1287/20230119", 

1744 ) 

1745 return False 

1746 

1747 if node.kind != NodeKind.LIST_ITEM: 1747 ↛ 1748line 1747 didn't jump to line 1748 because the condition on line 1747 was never true

1748 wxr.wtp.debug( 

1749 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1750 ) 

1751 return False 

1752 

1753 if node.sarg == ":": 

1754 # Skip example entries at the highest level, ones without 

1755 # a sense ("...#") above them. 

1756 # If node.sarg is exactly and only ":", then it's at 

1757 # the highest level; lower levels would have more 

1758 # "indentation", like "#:" or "##:" 

1759 return False 

1760 

1761 # If a recursion call succeeds in push_sense(), bubble it up with 

1762 # `added`. 

1763 # i.e. use `added |= push_sense()` or `added |= parse_sense_node(...)`. 

1764 added = False 

1765 

1766 gloss_template_args: set[str] = set() 

1767 

1768 # For LISTs and LIST_ITEMS, their argument is something like 

1769 # "##" or "##:", and using that we can rudimentally determine 

1770 # list 'depth' if need be, and also what kind of list or 

1771 # entry it is; # is for normal glosses, : for examples (indent) 

1772 # and * is used for quotations on wiktionary. 
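# (Editor's illustrative note, not part of the original source: for a
# LIST_ITEM whose sarg is "#", a child LIST with sarg "##" is collected
# into `subentries` below, while child LISTs with sarg "#:" or "#*"
# fall into `others`.)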

1773 current_depth = node.sarg 

1774 

1775 children = node.children 

1776 

1777 # subentries, (presumably) a list 

1778 # of subglosses below this. The list's 

1779 # argument ends with #, and its depth should 

1780 # be bigger than parent node. 

1781 subentries = [ 

1782 x 

1783 for x in children 

1784 if isinstance(x, WikiNode) 

1785 and x.kind == NodeKind.LIST 

1786 and x.sarg == current_depth + "#" 

1787 ] 

1788 

1789 # sublists of examples and quotations. .sarg 

1790 # does not end with "#". 

1791 others = [ 

1792 x 

1793 for x in children 

1794 if isinstance(x, WikiNode) 

1795 and x.kind == NodeKind.LIST 

1796 and x.sarg != current_depth + "#" 

1797 ] 

1798 

1799 # the actual contents of this particular node. 

1800 # can be a gloss (or a template that expands into 

1801 # many glosses which we can't easily pre-expand) 

1802 # or could be an "outer gloss" with more specific 

1803 # subglosses, or could be a qualifier for the subglosses. 

1804 contents = [ 

1805 x 

1806 for x in children 

1807 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1808 ] 

1809 # If this entry has sublists of entries, we should combine 

1810 # gloss information from both the "outer" and sublist content. 

1811 # Sometimes the outer gloss 

1812 # is more non-gloss or tags, sometimes it is a coarse sense 

1813 # and the inner glosses are more specific. The outer one 

1814 # does not seem to have qualifiers. 

1815 

1816 # If we have one sublist with one element, treat it 

1817 # specially as it may be a Wiktionary error; raise 

1818 # that nested element to the same level. 

1819 # XXX If need be, this block can be easily removed in 

1820 # the current recursive logic, and the result is one sense entry 

1821 # with both glosses in the glosses list, as you would 

1822 # expect. If the higher entry has examples, there will 

1823 # be a higher entry with some duplicated data. 

1824 if len(subentries) == 1: 

1825 slc = subentries[0].children 

1826 if len(slc) == 1: 

1827 # copy current node and modify it so it doesn't 

1828 # loop infinitely. 

1829 cropped_node = copy.copy(node) 

1830 cropped_node.children = [ 

1831 x 

1832 for x in children 

1833 if not ( 

1834 isinstance(x, WikiNode) 

1835 and x.kind == NodeKind.LIST 

1836 and x.sarg == current_depth + "#" 

1837 ) 

1838 ] 

1839 added |= parse_sense_node(cropped_node, sense_base, pos) 

1840 nonlocal sense_data # this kludge causes duplicated raw_ 

1841 # glosses data if this is not done; 

1842 # if the top-level (cropped_node) 

1843 # does not push_sense() properly or 

1844 # parse_sense_node() returns early, 

1845 # sense_data is not reset. This happens 

1846 # for example when you have a no-gloss 

1847 # string like "(intransitive)": 

1848 # no gloss, push_sense() returns early 

1849 # and sense_data has duplicate data with 

1850 # sense_base 

1851 sense_data = {} 

1852 added |= parse_sense_node(slc[0], sense_base, pos) 

1853 return added 

1854 

1855 return process_gloss_contents( 

1856 contents, 

1857 pos, 

1858 sense_base, 

1859 subentries, 

1860 others, 

1861 gloss_template_args, 

1862 added, 

1863 ) 

1864 

1865 def process_gloss_contents( 

1866 contents: list[Union[str, WikiNode]], 

1867 pos: str, 

1868 sense_base: SenseData, 

1869 subentries: list[WikiNode] = [], 

1870 others: list[WikiNode] = [], 

1871 gloss_template_args: Set[str] = set(), 

1872 added: bool = False, 

1873 ) -> bool: 

1874 def sense_template_fn( 

1875 name: str, ht: TemplateArgs, is_gloss: bool = False 

1876 ) -> Optional[str]: 

1877 # print(f"sense_template_fn: {name}, {ht}") 

1878 if name in wikipedia_templates: 

1879 # parse_wikipedia_template(wxr, pos_data, ht) 

1880 return None 

1881 if is_panel_template(wxr, name): 

1882 return "" 

1883 if name in INFO_TEMPLATE_FUNCS: 

1884 info_data, info_exp = parse_info_template_arguments( 

1885 wxr, name, ht, "sense" 

1886 ) 

1887 if info_data or info_exp: 1887 ↛ 1893line 1887 didn't jump to line 1893 because the condition on line 1887 was always true

1888 if info_data: 1888 ↛ 1890line 1888 didn't jump to line 1890 because the condition on line 1888 was always true

1889 data_append(sense_base, "info_templates", info_data) 

1890 if info_exp and isinstance(info_exp, str): 1890 ↛ 1892line 1890 didn't jump to line 1892 because the condition on line 1890 was always true

1891 return info_exp 

1892 return "" 

1893 if name in ("defdate",): 

1894 return "" 

1895 if name == "senseid": 

1896 langid = clean_node(wxr, None, ht.get(1, ())) 

1897 arg = clean_node(wxr, sense_base, ht.get(2, ())) 

1898 if re.match(r"Q\d+$", arg): 

1899 data_append(sense_base, "wikidata", arg) 

1900 data_append(sense_base, "senseid", langid + ":" + arg) 
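# (Editor's illustrative note: {{senseid|en|Q5}} would give ht = {1: "en",
# 2: "Q5"}, so "Q5" is appended to "wikidata" and "en:Q5" to "senseid"
# in sense_base.)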

1901 if name in sense_linkage_templates: 

1902 # print(f"SENSE_TEMPLATE_FN: {name}") 

1903 parse_sense_linkage(wxr, sense_base, name, ht, pos) 

1904 return "" 

1905 if name == "†" or name == "zh-obsolete": 

1906 data_append(sense_base, "tags", "obsolete") 

1907 return "" 

1908 if name in { 

1909 "ux", 

1910 "uxi", 

1911 "usex", 

1912 "afex", 

1913 "prefixusex", 

1914 "ko-usex", 

1915 "ko-x", 

1916 "hi-x", 

1917 "ja-usex-inline", 

1918 "ja-x", 

1919 "quotei", 

1920 "he-x", 

1921 "hi-x", 

1922 "km-x", 

1923 "ne-x", 

1924 "shn-x", 

1925 "th-x", 

1926 "ur-x", 

1927 }: 

1928 # Usage examples are captured separately below. We don't 

1929 # want to expand them into glosses even when unusual coding 

1930 # is used in the entry. 

1931 # These templates may slip through inside another item, but 

1932 # currently we're separating out example entries (..#:) 

1933 # well enough that there seems to be very little contamination. 

1934 if is_gloss: 

1935 wxr.wtp.warning( 

1936 "Example template is used for gloss text", 

1937 sortid="extractor.en.page.sense_template_fn/1415", 

1938 ) 

1939 else: 

1940 return "" 

1941 if name == "w": 1941 ↛ 1942line 1941 didn't jump to line 1942 because the condition on line 1941 was never true

1942 if ht.get(2) == "Wp": 

1943 return "" 

1944 for k, v in ht.items(): 

1945 v = v.strip() 

1946 if v and "<" not in v: 

1947 gloss_template_args.add(v) 

1948 return None 

1949 

1950 def extract_link_texts(item: GeneralNode) -> None: 

1951 """Recursively extracts link texts from the gloss source. This 

1952 information is used to select whether to remove final "." from 

1953 form_of/alt_of (e.g., ihm/Hunsrik).""" 

1954 if isinstance(item, (list, tuple)): 

1955 for x in item: 

1956 extract_link_texts(x) 

1957 return 

1958 if isinstance(item, str): 

1959 # There seem to be HTML sections that may further contain 

1960 # unparsed links. 

1961 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1961 ↛ 1962line 1961 didn't jump to line 1962 because the loop on line 1961 never started

1962 print("ITER:", m.group(0)) 

1963 v = m.group(1).split("|")[-1].strip() 

1964 if v: 

1965 gloss_template_args.add(v) 

1966 return 
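# (Editor's illustrative note: for the string "see [[bar|foo]].", the
# regex above finds "[[bar|foo]]", and the text after the last "|",
# "foo", is added to gloss_template_args.)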

1967 if not isinstance(item, WikiNode): 1967 ↛ 1968line 1967 didn't jump to line 1968 because the condition on line 1967 was never true

1968 return 

1969 if item.kind == NodeKind.LINK: 

1970 v = item.largs[-1] 

1971 if ( 1971 ↛ 1977line 1971 didn't jump to line 1977 because the condition on line 1971 was always true

1972 isinstance(v, list) 

1973 and len(v) == 1 

1974 and isinstance(v[0], str) 

1975 ): 

1976 gloss_template_args.add(v[0].strip()) 

1977 for x in item.children: 

1978 extract_link_texts(x) 

1979 

1980 extract_link_texts(contents) 

1981 

1982 # get the raw text of non-list contents of this node, and other stuff 

1983 # like tag and category data added to sense_base 

1984 # cast = no-op type-setter for the type-checker 

1985 partial_template_fn = cast( 

1986 TemplateFnCallable, 

1987 partial(sense_template_fn, is_gloss=True), 

1988 ) 

1989 rawgloss = clean_node( 

1990 wxr, 

1991 sense_base, 

1992 contents, 

1993 template_fn=partial_template_fn, 

1994 collect_links=True, 

1995 ) 

1996 

1997 if not rawgloss: 1997 ↛ 1998line 1997 didn't jump to line 1998 because the condition on line 1997 was never true

1998 return False 

1999 

2000 # remove manually typed ordered list text at the start ("1. ") 

2001 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 
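# (Editor's illustrative note: the substitution above turns a manually
# numbered gloss such as "1. A dog." into "A dog.".)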

2002 

2003 # get stuff like synonyms and categories from "others", 

2004 # maybe examples and quotations 

2005 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

2006 

2007 # The gloss could contain templates that produce more list items. 

2008 # This happens commonly with, e.g., {{inflection of|...}}. Split 

2009 # to parts. However, e.g. Interlingua generates multiple glosses 

2010 # in HTML directly without Wikitext markup, so we must also split 

2011 # by just newlines. 

2012 subglosses = rawgloss.splitlines() 

2013 

2014 if len(subglosses) == 0: 2014 ↛ 2015line 2014 didn't jump to line 2015 because the condition on line 2014 was never true

2015 return False 

2016 

2017 if any(s.startswith("#") for s in subglosses): 

2018 subtree = wxr.wtp.parse(rawgloss) 

2019 # from wikitextprocessor.parser import print_tree 

2020 # print("SUBTREE GENERATED BY TEMPLATE:") 

2021 # print_tree(subtree) 

2022 new_subentries = [ 

2023 x 

2024 for x in subtree.children 

2025 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2026 ] 

2027 

2028 new_others = [ 

2029 x 

2030 for x in subtree.children 

2031 if isinstance(x, WikiNode) 

2032 and x.kind == NodeKind.LIST 

2033 and not x.sarg.endswith("#") 

2034 ] 

2035 

2036 new_contents = [ 

2037 clean_node(wxr, [], x) 

2038 for x in subtree.children 

2039 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2040 ] 

2041 

2042 subentries = subentries or new_subentries 

2043 others = others or new_others 

2044 subglosses = new_contents 

2045 rawgloss = "".join(subglosses) 

2046 # Generate no gloss for translation hub pages, but add the 

2047 # "translation-hub" tag for them 

2048 if rawgloss == "(This entry is a translation hub.)": 2048 ↛ 2049line 2048 didn't jump to line 2049 because the condition on line 2048 was never true

2049 data_append(sense_data, "tags", "translation-hub") 

2050 return push_sense() 

2051 

2052 # Remove certain substrings specific to outer glosses 

2053 strip_ends = [", particularly:"] 

2054 for x in strip_ends: 

2055 if rawgloss.endswith(x): 

2056 rawgloss = rawgloss[: -len(x)].strip() 

2057 break 

2058 

2059 # A single gloss, or possibly an outer gloss. 

2060 # Check if the possible outer gloss starts with 

2061 # parenthesized tags/topics 

2062 

2063 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 

2064 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

2065 m = QUALIFIERS_RE.match(rawgloss) 

2066 # (...): ... or (...(...)...): ... 

2067 if m: 

2068 q = m.group(1) 

2069 rawgloss = rawgloss[m.end() :].strip() 

2070 parse_sense_qualifier(wxr, q, sense_base) 
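# (Editor's illustrative note, assuming QUALIFIERS_RE behaves as the
# comment above describes: for rawgloss "(transitive, archaic) to strike",
# the parenthesized part "transitive, archaic" is handed to
# parse_sense_qualifier() and rawgloss is reduced to "to strike".)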

2071 if rawgloss == "A pejorative:": 2071 ↛ 2072line 2071 didn't jump to line 2072 because the condition on line 2071 was never true

2072 data_append(sense_base, "tags", "pejorative") 

2073 rawgloss = "" 

2074 elif rawgloss == "Short forms.": 2074 ↛ 2075line 2074 didn't jump to line 2075 because the condition on line 2074 was never true

2075 data_append(sense_base, "tags", "abbreviation") 

2076 rawgloss = "" 

2077 elif rawgloss == "Technical or specialized senses.": 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true

2078 rawgloss = "" 

2079 elif rawgloss.startswith("inflection of "): 

2080 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

2081 if parsed is not None: 2081 ↛ 2090line 2081 didn't jump to line 2090 because the condition on line 2081 was always true

2082 tags, origins = parsed 

2083 if origins is not None: 2083 ↛ 2085line 2083 didn't jump to line 2085 because the condition on line 2083 was always true

2084 data_extend(sense_base, "form_of", origins) 

2085 if tags is not None: 2085 ↛ 2088line 2085 didn't jump to line 2088 because the condition on line 2085 was always true

2086 data_extend(sense_base, "tags", tags) 

2087 else: 

2088 data_append(sense_base, "tags", "form-of") 

2089 else: 

2090 data_append(sense_base, "tags", "form-of") 

2091 if rawgloss: 2091 ↛ 2122line 2091 didn't jump to line 2122 because the condition on line 2091 was always true

2092 # Code duplicating a lot of clean-up operations from later in 

2093 # this block. We want to clean up the "supergloss" as much as 

2094 # possible, in almost the same way as a normal gloss. 

2095 supergloss = rawgloss 

2096 

2097 if supergloss.startswith("; "): 2097 ↛ 2098line 2097 didn't jump to line 2098 because the condition on line 2097 was never true

2098 supergloss = supergloss[1:].strip() 

2099 

2100 if supergloss.startswith(("^†", "†")): 

2101 data_append(sense_base, "tags", "obsolete") 

2102 supergloss = supergloss[2:].strip() 

2103 elif supergloss.startswith("^‡"): 2103 ↛ 2104line 2103 didn't jump to line 2104 because the condition on line 2103 was never true

2104 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

2105 supergloss = supergloss[2:].strip() 

2106 

2107 # remove [14th century...] style brackets at the end 

2108 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 

2109 

2110 if supergloss.startswith((",", ":")): 

2111 supergloss = supergloss[1:] 

2112 supergloss = supergloss.strip() 

2113 if supergloss.startswith("N. of "): 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true

2114 supergloss = "Name of " + supergloss[6:] 

2115 supergloss = supergloss[2:] 

2116 data_append(sense_base, "glosses", supergloss) 

2117 if supergloss in ("A person:",): 

2118 data_append(sense_base, "tags", "g-person") 

2119 

2120 # The main recursive call (except for the exceptions at the 

2121 # start of this function). 

2122 for sublist in subentries: 

2123 if not ( 2123 ↛ 2126line 2123 didn't jump to line 2126 because the condition on line 2123 was never true

2124 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

2125 ): 

2126 wxr.wtp.debug( 

2127 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

2128 f"with items that are not LISTs", 

2129 sortid="page/1511/20230119", 

2130 ) 

2131 continue 

2132 for item in sublist.children: 

2133 if not ( 2133 ↛ 2137line 2133 didn't jump to line 2137 because the condition on line 2133 was never true

2134 isinstance(item, WikiNode) 

2135 and item.kind == NodeKind.LIST_ITEM 

2136 ): 

2137 continue 

2138 # copy sense_base to prevent cross-contamination between 

2139 # subglosses and other subglosses and superglosses 

2140 sense_base2 = copy.deepcopy(sense_base) 

2141 if parse_sense_node(item, sense_base2, pos): 2141 ↛ 2132line 2141 didn't jump to line 2132 because the condition on line 2141 was always true

2142 added = True 

2143 

2144 # Capture examples. 

2145 # This is called after the recursive calls above so that 

2146 # sense_base is not contaminated with meta-data from 

2147 # example entries for *this* gloss. 

2148 examples = [] 

2149 if wxr.config.capture_examples: 2149 ↛ 2153line 2149 didn't jump to line 2153 because the condition on line 2149 was always true

2150 examples = extract_examples(others, sense_base) 

2151 

2152 # push_sense() succeeded somewhere down-river, so skip this level 

2153 if added: 

2154 if examples: 

2155 # this higher-up gloss has examples that we do not want to skip 

2156 wxr.wtp.debug( 

2157 "'{}[...]' gloss has examples we want to keep, " 

2158 "but there are subglosses.".format(repr(rawgloss[:30])), 

2159 sortid="page/1498/20230118", 

2160 ) 

2161 else: 

2162 return True 

2163 

2164 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2165 # after the gloss, but these sentences don't seem to be intended 

2166 # as glosses. Skip them. 

2167 indexed_subglosses = list( 

2168 (i, gl) 

2169 for i, gl in enumerate(subglosses) 

2170 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2171 ) 
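# (Editor's illustrative note: a subgloss line that is only a quoted
# sentence, e.g. '(rare) "I was lying in bed."', matches the pattern
# above and is dropped from indexed_subglosses.)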

2172 

2173 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2173 ↛ 2174line 2173 didn't jump to line 2174 because the condition on line 2173 was never true

2174 gl = indexed_subglosses[0][1].strip() 

2175 if gl.endswith(":"): 

2176 gl = gl[:-1].strip() 

2177 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2178 if parsed is not None: 

2179 infl_tags, infl_dts = parsed 

2180 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2181 # Interpret others as a particular form under 

2182 # "inflection of" 

2183 data_extend(sense_base, "tags", infl_tags) 

2184 data_extend(sense_base, "form_of", infl_dts) 

2185 indexed_subglosses = indexed_subglosses[1:] 

2186 elif not infl_dts: 

2187 data_extend(sense_base, "tags", infl_tags) 

2188 indexed_subglosses = indexed_subglosses[1:] 

2189 

2190 # Create senses for remaining subglosses 

2191 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2192 gloss = gloss.strip() 

2193 if not gloss and len(indexed_subglosses) > 1: 2193 ↛ 2194line 2193 didn't jump to line 2194 because the condition on line 2193 was never true

2194 continue 

2195 # Push a new sense (if the last one is not empty) 

2196 if push_sense(): 2196 ↛ 2197line 2196 didn't jump to line 2197 because the condition on line 2196 was never true

2197 added = True 

2198 # if gloss not in sense_data.get("raw_glosses", ()): 

2199 # data_append(sense_data, "raw_glosses", gloss) 

2200 if i == 0 and examples: 

2201 # In a multi-line gloss, associate examples 

2202 # with only one of them. 

2203 # XXX or you could use gloss_i == len(indexed_subglosses) 

2204 # to associate examples with the *last* one. 

2205 data_extend(sense_data, "examples", examples) 

2206 if gloss.startswith("; ") and gloss_i > 0: 2206 ↛ 2207line 2206 didn't jump to line 2207 because the condition on line 2206 was never true

2207 gloss = gloss[1:].strip() 

2208 # If the gloss starts with †, mark as obsolete 

2209 if gloss.startswith("^†"): 2209 ↛ 2210line 2209 didn't jump to line 2210 because the condition on line 2209 was never true

2210 data_append(sense_data, "tags", "obsolete") 

2211 gloss = gloss[2:].strip() 

2212 elif gloss.startswith("^‡"): 2212 ↛ 2213line 2212 didn't jump to line 2213 because the condition on line 2212 was never true

2213 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2214 gloss = gloss[2:].strip() 

2215 # Copy data for all senses to this sense 

2216 for k, v in sense_base.items(): 

2217 if isinstance(v, (list, tuple)): 

2218 if k != "tags": 

2219 # Tags handled below (countable/uncountable special) 

2220 data_extend(sense_data, k, v) 

2221 else: 

2222 assert k not in ("tags", "categories", "topics") 

2223 sense_data[k] = v # type:ignore[literal-required] 

2224 # Parse the gloss for this particular sense 

2225 m = QUALIFIERS_RE.match(gloss) 

2226 # (...): ... or (...(...)...): ... 

2227 if m: 

2228 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2229 gloss = gloss[m.end() :].strip() 

2230 

2231 # Remove common suffix "[from 14th c.]" and similar 

2232 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2233 

2234 # Check to make sure we don't have unhandled list items in gloss 

2235 ofs = max(gloss.find("#"), gloss.find("* ")) 

2236 if ofs > 10 and "(#)" not in gloss: 

2237 wxr.wtp.debug( 

2238 "gloss may contain unhandled list items: {}".format(gloss), 

2239 sortid="page/1412", 

2240 ) 

2241 elif "\n" in gloss: 2241 ↛ 2242line 2241 didn't jump to line 2242 because the condition on line 2241 was never true

2242 wxr.wtp.debug( 

2243 "gloss contains newline: {}".format(gloss), 

2244 sortid="page/1416", 

2245 ) 

2246 

2247 # Kludge, some glosses have a comma after initial qualifiers in 

2248 # parentheses 

2249 if gloss.startswith((",", ":")): 

2250 gloss = gloss[1:] 

2251 gloss = gloss.strip() 

2252 if gloss.endswith(":"): 

2253 gloss = gloss[:-1].strip() 

2254 if gloss.startswith("N. of "): 2254 ↛ 2255line 2254 didn't jump to line 2255 because the condition on line 2254 was never true

2255 gloss = "Name of " + gloss[6:] 

2256 if gloss.startswith("†"): 2256 ↛ 2257line 2256 didn't jump to line 2257 because the condition on line 2256 was never true

2257 data_append(sense_data, "tags", "obsolete") 

2258 gloss = gloss[1:] 

2259 elif gloss.startswith("^†"): 2259 ↛ 2260line 2259 didn't jump to line 2260 because the condition on line 2259 was never true

2260 data_append(sense_data, "tags", "obsolete") 

2261 gloss = gloss[2:] 

2262 

2263 # Copy tags from sense_base if any. This will not copy 

2264 # countable/uncountable if either was specified in the sense, 

2265 # as sometimes both are specified in word head but only one 

2266 # in individual senses. 

2267 countability_tags = [] 

2268 base_tags = sense_base.get("tags", ()) 

2269 sense_tags = sense_data.get("tags", ()) 

2270 for tag in base_tags: 

2271 if tag in ("countable", "uncountable"): 

2272 if tag not in countability_tags: 2272 ↛ 2274line 2272 didn't jump to line 2274 because the condition on line 2272 was always true

2273 countability_tags.append(tag) 

2274 continue 

2275 if tag not in sense_tags: 

2276 data_append(sense_data, "tags", tag) 

2277 if countability_tags: 

2278 if ( 2278 ↛ 2287line 2278 didn't jump to line 2287 because the condition on line 2278 was always true

2279 "countable" not in sense_tags 

2280 and "uncountable" not in sense_tags 

2281 ): 

2282 data_extend(sense_data, "tags", countability_tags) 

2283 

2284 # If outer gloss specifies a form-of ("inflection of", see 

2285 # aquamarine/German), try to parse the inner glosses as 

2286 # tags for an inflected form. 

2287 if "form-of" in sense_base.get("tags", ()): 

2288 parsed = parse_alt_or_inflection_of( 

2289 wxr, gloss, gloss_template_args 

2290 ) 

2291 if parsed is not None: 2291 ↛ 2297line 2291 didn't jump to line 2297 because the condition on line 2291 was always true

2292 infl_tags, infl_dts = parsed 

2293 if not infl_dts and infl_tags: 2293 ↛ 2297line 2293 didn't jump to line 2297 because the condition on line 2293 was always true

2294 # Interpret as a particular form under "inflection of" 

2295 data_extend(sense_data, "tags", infl_tags) 

2296 

2297 if not gloss: 2297 ↛ 2298line 2297 didn't jump to line 2298 because the condition on line 2297 was never true

2298 data_append(sense_data, "tags", "empty-gloss") 

2299 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 

2300 if ( 2300 ↛ 2311line 2300 didn't jump to line 2311 because the condition on line 2300 was always true

2301 gloss_i == 0 

2302 and len(sense_data.get("glosses", tuple())) >= 1 

2303 ): 

2304 # If we added a "high-level gloss" from rawgloss, but this 

2305 # is that same gloss_i, add this instead of the raw_gloss 

2306 # from before if they're different: the rawgloss was not 

2307 # cleaned exactly the same as this later gloss 

2308 sense_data["glosses"][-1] = gloss 

2309 else: 

2310 # Add the gloss for the sense. 

2311 data_append(sense_data, "glosses", gloss) 

2312 

2313 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2314 # two abbreviations in the same sense, both generated by the 

2315 # {{abbreviation of|...}} template. Handle these with some magic. 

2316 position = 0 

2317 split_glosses = [] 

2318 for m in re.finditer(r"Abbreviation of ", gloss): 

2319 if m.start() != position: 2319 ↛ 2318line 2319 didn't jump to line 2318 because the condition on line 2319 was always true

2320 split_glosses.append(gloss[position : m.start()]) 

2321 position = m.start() 

2322 split_glosses.append(gloss[position:]) 
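# (Editor's illustrative note: a gloss like
# "Abbreviation of alfa. Abbreviation of beta." is split here into
# ["Abbreviation of alfa. ", "Abbreviation of beta."], so each
# abbreviation can be parsed separately by the loop below.)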

2323 for gloss in split_glosses: 

2324 # Check if this gloss describes an alt-of or inflection-of 

2325 if ( 

2326 lang_code != "en" 

2327 and " " not in gloss 

2328 and distw([word], gloss) < 0.3 

2329 ): 

2330 # Don't try to parse gloss if it is one word 

2331 # that is close to the word itself for non-English words 

2332 # (probable translations of a tag/form name) 

2333 continue 

2334 parsed = parse_alt_or_inflection_of( 

2335 wxr, gloss, gloss_template_args 

2336 ) 

2337 if parsed is None: 

2338 continue 

2339 tags, dts = parsed 

2340 if not dts and tags: 

2341 data_extend(sense_data, "tags", tags) 

2342 continue 

2343 for dt in dts: # type:ignore[union-attr] 

2344 ftags = list(tag for tag in tags if tag != "form-of") 

2345 if "alt-of" in tags: 

2346 data_extend(sense_data, "tags", ftags) 

2347 data_append(sense_data, "alt_of", dt) 

2348 elif "compound-of" in tags: 2348 ↛ 2349line 2348 didn't jump to line 2349 because the condition on line 2348 was never true

2349 data_extend(sense_data, "tags", ftags) 

2350 data_append(sense_data, "compound_of", dt) 

2351 elif "synonym-of" in tags: 2351 ↛ 2352line 2351 didn't jump to line 2352 because the condition on line 2351 was never true

2352 data_extend(dt, "tags", ftags) 

2353 data_append(sense_data, "synonyms", dt) 

2354 elif tags and dt.get("word", "").startswith("of "): 2354 ↛ 2355line 2354 didn't jump to line 2355 because the condition on line 2354 was never true

2355 dt["word"] = dt["word"][3:] 

2356 data_append(sense_data, "tags", "form-of") 

2357 data_extend(sense_data, "tags", ftags) 

2358 data_append(sense_data, "form_of", dt) 

2359 elif "form-of" in tags: 2359 ↛ 2343line 2359 didn't jump to line 2343 because the condition on line 2359 was always true

2360 data_extend(sense_data, "tags", tags) 

2361 data_append(sense_data, "form_of", dt) 

2362 

2363 if len(sense_data) == 0: 

2364 if len(sense_base.get("tags", [])) == 0: 2364 ↛ 2366line 2364 didn't jump to line 2366 because the condition on line 2364 was always true

2365 del sense_base["tags"] 

2366 sense_data.update(sense_base) 

2367 if push_sense(): 2367 ↛ 2371line 2367 didn't jump to line 2371 because the condition on line 2367 was always true

2368 # push_sense succeeded in adding a sense to pos_data 

2369 added = True 

2370 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2371 return added 

2372 

2373 def parse_inflection( 

2374 node: WikiNode, section: str, pos: Optional[str] 

2375 ) -> None: 

2376 """Parses inflection data (declension, conjugation) from the given 

2377 page. This retrieves the actual inflection template 

2378 parameters, which are very useful for applications that need 

2379 to learn the inflection classes and generate inflected 

2380 forms.""" 

2381 assert isinstance(node, WikiNode) 

2382 assert isinstance(section, str) 

2383 assert pos is None or isinstance(pos, str) 

2384 # print("parse_inflection:", node) 

2385 

2386 if pos is None: 2386 ↛ 2387line 2386 didn't jump to line 2387 because the condition on line 2386 was never true

2387 wxr.wtp.debug( 

2388 "inflection table outside part-of-speech", sortid="page/1812" 

2389 ) 

2390 return 

2391 

2392 def inflection_template_fn( 

2393 name: str, ht: TemplateArgs 

2394 ) -> Optional[str]: 

2395 # print("decl_conj_template_fn", name, ht) 

2396 if is_panel_template(wxr, name): 2396 ↛ 2397line 2396 didn't jump to line 2397 because the condition on line 2396 was never true

2397 return "" 

2398 if name in ("is-u-mutation",): 2398 ↛ 2401line 2398 didn't jump to line 2401 because the condition on line 2398 was never true

2399 # These are not to be captured, as an exception to the 

2400 # generic code below 

2401 return None 

2402 m = re.search( 

2403 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2404 r"declension|inflection|mut|mutation)($|-)", 

2405 name, 

2406 ) 

2407 if m: 

2408 args_ht = clean_template_args(wxr, ht) 

2409 dt = {"name": name, "args": args_ht} 

2410 data_append(pos_data, "inflection_templates", dt) 
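# (Editor's illustrative note: a template name such as "fi-decl-valo"
# matches "-decl-" in the pattern above, so its name and cleaned
# arguments are stored under "inflection_templates" in pos_data.)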

2411 

2412 return None 

2413 

2414 # Convert the subtree back to Wikitext, then expand all and parse, 

2415 # capturing templates in the process 

2416 text = wxr.wtp.node_to_wikitext(node.children) 

2417 

2418 # Split text into separate sections for each top-level template 

2419 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text) 

2420 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"] 

2421 # The (?:...) creates a non-capturing regex group; if it was capturing, 

2422 # like the group around it, it would create elements in brace_matches, 

2423 # including None if it doesn't match. 

2424 # 20250114: Added {| and |} into the regex because tables were being 

2425 # cut into pieces by this code. Issue #973, introduction of two-part 

2426 # book-end templates similar to trans-top and trans-bottom. 
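# (Editor's illustrative note: for text
# "{{fi-decl-valo|valo|||a}}\n{{fi-conj-sanoa|sano|||a}}" the split yields
# ["", "{{", "fi-decl-valo|valo|||a", "}}", "\n", "{{",
# "fi-conj-sanoa|sano|||a", "}}", ""]; the loop below regroups these
# pieces into one section per top-level template or table.)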

2427 template_sections = [] 

2428 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2429 # Because there is the possibility of triple curly braces 

2430 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2431 # count nesting depth using pairs of two brackets, but 

2432 # instead use singular braces ("{ }"). 

2433 # Because template delimiters should be balanced, regardless 

2434 # of whether {{ or {{{ is used, and because we only care 

2435 # about the outer-most delimiters (the highest level template) 

2436 # we can just count the single braces when those single 

2437 # braces are part of a group. 

2438 table_nesting = 0 

2439 # However, a stray table ({| ... |}) should always be its own 

2440 # section, and it should prevent templates from cutting it 

2441 # into sections. 

2442 

2443 # print(f"Parse inflection: {text=}") 

2444 # print(f"Brace matches: {repr('///'.join(brace_matches))}") 

2445 if len(brace_matches) > 1: 

2446 tsection: list[str] = [] 

2447 after_templates = False # kludge to keep any text 

2448 # before first template 

2449 # with the first template; 

2450 # otherwise, text 

2451 # goes with preceding template 

2452 for m in brace_matches: 

2453 if m.startswith("\n; ") and after_templates: 2453 ↛ 2454line 2453 didn't jump to line 2454 because the condition on line 2453 was never true

2454 after_templates = False 

2455 template_sections.append(tsection) 

2456 tsection = [] 

2457 tsection.append(m) 

2458 elif m.startswith("{{") or m.endswith("{|"): 

2459 if ( 

2460 template_nesting == 0 

2461 and after_templates 

2462 and table_nesting == 0 

2463 ): 

2464 template_sections.append(tsection) 

2465 tsection = [] 

2466 # start new section 

2467 after_templates = True 

2468 if m.startswith("{{"): 

2469 template_nesting += 1 

2470 else: 

2471 # m.endswith("{|") 

2472 table_nesting += 1 

2473 tsection.append(m) 

2474 elif m.startswith("}}") or m.endswith("|}"): 

2475 if m.startswith("}}"): 

2476 template_nesting -= 1 

2477 if template_nesting < 0: 2477 ↛ 2478line 2477 didn't jump to line 2478 because the condition on line 2477 was never true

2478 wxr.wtp.error( 

2479 "Negatively nested braces, " 

2480 "couldn't split inflection templates, " 

2481 "{}/{} section {}".format( 

2482 word, language, section 

2483 ), 

2484 sortid="page/1871", 

2485 ) 

2486 template_sections = [] # use whole text 

2487 break 

2488 else: 

2489 table_nesting -= 1 

2490 if table_nesting < 0: 2490 ↛ 2491line 2490 didn't jump to line 2491 because the condition on line 2490 was never true

2491 wxr.wtp.error( 

2492 "Negatively nested table braces, " 

2493 "couldn't split inflection section, " 

2494 "{}/{} section {}".format( 

2495 word, language, section 

2496 ), 

2497 sortid="page/20250114", 

2498 ) 

2499 template_sections = [] # use whole text 

2500 break 

2501 tsection.append(m) 

2502 else: 

2503 tsection.append(m) 

2504 if tsection: # dangling tsection 2504 ↛ 2512line 2504 didn't jump to line 2512 because the condition on line 2504 was always true

2505 template_sections.append(tsection) 

2506 # Why do it this way around? The parser has a preference 

2507 # to associate bits outside of tables with the preceding 

2508 # table (`after`-variable), so a new tsection begins 

2509 # at {{ and everything before it belongs to the previous 

2510 # template. 

2511 

2512 texts = [] 

2513 if not template_sections: 

2514 texts = [text] 

2515 else: 

2516 for tsection in template_sections: 

2517 texts.append("".join(tsection)) 

2518 if template_nesting != 0: 2518 ↛ 2519line 2518 didn't jump to line 2519 because the condition on line 2518 was never true

2519 wxr.wtp.error( 

2520 "Template nesting error: " 

2521 "template_nesting = {} " 

2522 "couldn't split inflection templates, " 

2523 "{}/{} section {}".format( 

2524 template_nesting, word, language, section 

2525 ), 

2526 sortid="page/1896", 

2527 ) 

2528 texts = [text] 

2529 for text in texts: 

2530 tree = wxr.wtp.parse( 

2531 text, expand_all=True, template_fn=inflection_template_fn 

2532 ) 

2533 

2534 if not text.strip(): 

2535 continue 

2536 

2537 # Parse inflection tables from the section. The data is stored 

2538 # under "forms". 

2539 if wxr.config.capture_inflections: 2539 ↛ 2529line 2539 didn't jump to line 2529 because the condition on line 2539 was always true

2540 tablecontext = None 

2541 m = re.search(r"{{([^}{|]+)\|?", text) 

2542 if m: 

2543 template_name = m.group(1) 

2544 tablecontext = TableContext(template_name) 

2545 

2546 parse_inflection_section( 

2547 wxr, 

2548 pos_data, 

2549 word, 

2550 language, 

2551 pos, 

2552 section, 

2553 tree, 

2554 tablecontext=tablecontext, 

2555 ) 

2556 

2557 def get_subpage_section( 

2558 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]] 

2559 ) -> Optional[Union[WikiNode, str]]: 

2560 """Loads a subpage of the given page, and finds the section 

2561 for the given language, part-of-speech, and section title. This 

2562 is used for finding translations and other sections on subpages.""" 

2563 assert isinstance(language, str) 

2564 assert isinstance(title, str) 

2565 assert isinstance(subtitle, str) 

2566 assert isinstance(seqs, (list, tuple)) 

2567 for seq in seqs: 

2568 for x in seq: 

2569 assert isinstance(x, str) 

2570 subpage_title = word + "/" + subtitle 

2571 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2572 if subpage_content is None: 

2573 wxr.wtp.error( 

2574 "/translations not found despite " 

2575 "{{see translation subpage|...}}", 

2576 sortid="page/1934", 

2577 ) 

2578 return None 

2579 

2580 def recurse( 

2581 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2582 ) -> Optional[Union[str, WikiNode]]: 

2583 # print(f"seq: {seq}") 

2584 if not seq: 

2585 return node 

2586 if not isinstance(node, WikiNode): 

2587 return None 

2588 # print(f"node.kind: {node.kind}") 

2589 if node.kind in LEVEL_KINDS: 

2590 t = clean_node(wxr, None, node.largs[0]) 

2591 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2592 if t.lower() == seq[0].lower(): 

2593 seq = seq[1:] 

2594 if not seq: 

2595 return node 

2596 for n in node.children: 

2597 ret = recurse(n, seq) 

2598 if ret is not None: 

2599 return ret 

2600 return None 
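# (Editor's illustrative note: with a seq such as
# ["English", "Noun", "Translations"], recurse() descends the parsed
# subpage tree, consuming one heading title per matching level, and
# returns the node where the sequence is exhausted, or None.)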

2601 

2602 tree = wxr.wtp.parse( 

2603 subpage_content, 

2604 pre_expand=True, 

2605 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2606 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2607 ) 

2608 assert tree.kind == NodeKind.ROOT 

2609 for seq in seqs: 

2610 ret = recurse(tree, seq) 

2611 if ret is None: 

2612 wxr.wtp.debug( 

2613 "Failed to find subpage section {}/{} seq {}".format( 

2614 title, subtitle, seq 

2615 ), 

2616 sortid="page/1963", 

2617 ) 

2618 return ret 

2619 

2620 def parse_linkage( 

2621 data: WordData, field: str, linkagenode: WikiNode 

2622 ) -> None: 

2623 assert isinstance(data, dict) 

2624 assert isinstance(field, str) 

2625 assert isinstance(linkagenode, WikiNode) 

2626 # if field == "synonyms": 

2627 # print("field", field) 

2628 # print("data", data) 

2629 # print("children:") 

2630 # print(linkagenode.children) 

2631 if not wxr.config.capture_linkages: 2631 ↛ 2632line 2631 didn't jump to line 2632 because the condition on line 2631 was never true

2632 return 

2633 have_panel_template = False 

2634 toplevel_text = [] 

2635 next_navframe_sense = None # Used for "(sense):" before NavFrame 

2636 

2637 def parse_linkage_item( 

2638 contents: list[Union[str, WikiNode]], 

2639 field: str, 

2640 sense: Optional[str] = None, 

2641 ): 

2642 assert isinstance(contents, (list, tuple)) 

2643 assert isinstance(field, str) 

2644 assert sense is None or isinstance(sense, str) 

2645 

2646 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

2647 # .format(field, sense, contents)) 

2648 

2649 parts: list[str] = [] 

2650 ruby: list[tuple[str, str]] = [] 

2651 urls: list[str] = [] 

2652 # data about link text; this is used to skip splitting on 

2653 # linkage text items that contain stuff like commas; for 

2654 # example "Hunde, die bellen, beißen nicht" in article 

2655 # beißen is split into "Hunde", "die bellen" etc. 

2656 # We take that link text and use it, eventually, 

2657 # in split_at_comma_semi to skip splitting on those 

2658 # commas. 

2659 links_that_should_not_be_split: list[str] = [] 

2660 

2661 def item_recurse( 

2662 contents: list[Union[str, WikiNode]], italic=False 

2663 ) -> None: 

2664 assert isinstance(contents, (list, tuple)) 

2665 nonlocal sense 

2666 nonlocal ruby 

2667 nonlocal parts 

2668 # print("ITEM_RECURSE:", contents) 

2669 for node in contents: 

2670 if isinstance(node, str): 

2671 parts.append(node) 

2672 continue 

2673 kind = node.kind 

2674 # print("ITEM_RECURSE KIND:", kind, 

2675 # node.sarg if node.sarg else node.largs) 

2676 if kind == NodeKind.LIST: 

2677 if parts: 2677 ↛ 2692line 2677 didn't jump to line 2692 because the condition on line 2677 was always true

2678 sense1: Optional[str] 

2679 sense1 = clean_node(wxr, None, parts) 

2680 if sense1.endswith(":"): 

2681 sense1 = sense1[:-1].strip() 

2682 if sense1.startswith("(") and sense1.endswith(")"): 2682 ↛ 2683line 2682 didn't jump to line 2683 because the condition on line 2682 was never true

2683 sense1 = sense1[1:-1].strip() 

2684 if sense1.lower() == TRANSLATIONS_TITLE: 2684 ↛ 2685line 2684 didn't jump to line 2685 because the condition on line 2684 was never true

2685 sense1 = None 

2686 # print("linkage item_recurse LIST sense1:", sense1) 

2687 parse_linkage_recurse( 

2688 node.children, field, sense=sense1 or sense 

2689 ) 

2690 parts = [] 

2691 else: 

2692 parse_linkage_recurse(node.children, field, sense) 

2693 elif kind in ( 2693 ↛ 2698line 2693 didn't jump to line 2698 because the condition on line 2693 was never true

2694 NodeKind.TABLE, 

2695 NodeKind.TABLE_ROW, 

2696 NodeKind.TABLE_CELL, 

2697 ): 

2698 parse_linkage_recurse(node.children, field, sense) 

2699 elif kind in ( 2699 ↛ 2703line 2699 didn't jump to line 2703 because the condition on line 2699 was never true

2700 NodeKind.TABLE_HEADER_CELL, 

2701 NodeKind.TABLE_CAPTION, 

2702 ): 

2703 continue 

2704 elif kind == NodeKind.HTML: 2704 ↛ 2705line 2704 didn't jump to line 2705 because the condition on line 2704 was never true

2705 classes = (node.attrs.get("class") or "").split() 

2706 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2707 continue 

2708 elif node.sarg == "ruby": 

2709 rb = parse_ruby(wxr, node) 

2710 if rb: 

2711 ruby.append(rb) 

2712 parts.append(rb[0]) 

2713 continue 

2714 elif node.sarg == "math": 

2715 parts.append(clean_node(wxr, None, node)) 

2716 continue 

2717 elif "interProject" in classes: 

2718 continue # These do not seem to be displayed 

2719 if "NavFrame" in classes: 

2720 parse_linkage_recurse(node.children, field, sense) 

2721 else: 

2722 item_recurse(node.children, italic=italic) 

2723 elif kind == NodeKind.ITALIC: 

2724 item_recurse(node.children, italic=True) 

2725 elif kind == NodeKind.LINK: 

2726 ignore = False 

2727 if isinstance(node.largs[0][0], str): 2727 ↛ 2669line 2727 didn't jump to line 2669 because the condition on line 2727 was always true

2728 v1 = node.largs[0][0].strip().lower() 

2729 if v1.startswith( 2729 ↛ 2733line 2729 didn't jump to line 2733 because the condition on line 2729 was never true

2730 ns_title_prefix_tuple(wxr, "Category", True) 

2731 + ns_title_prefix_tuple(wxr, "File", True) 

2732 ): 

2733 ignore = True 

2734 if not ignore: 2734 ↛ 2669line 2734 didn't jump to line 2669 because the condition on line 2734 was always true

2735 v = node.largs[-1] 

2736 if ( 

2737 len(node.largs) == 1 

2738 and len(v) > 0 

2739 and isinstance(v[0], str) 

2740 and v[0][0] == ":" 

2741 ): 

2742 v = [v[0][1:]] + list(v[1:]) # type:ignore 

2743 if isinstance(v[0], str) and not v[0].isalnum(): 

2744 links_that_should_not_be_split.append( 

2745 "".join(v[0]) 

2746 ) # type: ignore 

2747 item_recurse(v, italic=italic) 

2748 elif kind == NodeKind.URL: 

2749 if len(node.largs) < 2 and node.largs: 

2750 # Naked url captured 

2751 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

2752 continue 

2753 if len(node.largs) == 2: 2753 ↛ 2758line 2753 didn't jump to line 2758 because the condition on line 2753 was always true

2754 # Url from link with text 

2755 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

2756 # print(f"{node.largs=!r}") 

2757 # print("linkage recurse URL {}".format(node)) 

2758 item_recurse(node.largs[-1], italic=italic) 

2759 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 2759 ↛ 2762line 2759 didn't jump to line 2762 because the condition on line 2759 was always true

2760 item_recurse(node.children, italic=italic) 

2761 else: 

2762 wxr.wtp.debug( 

2763 "linkage item_recurse unhandled {}: {}".format( 

2764 node.kind, node 

2765 ), 

2766 sortid="page/2073", 

2767 ) 

2768 

2769 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

2770 # .format(contents)) 

2771 

2772 item_recurse(contents) 

2773 item = clean_node(wxr, None, parts) 

2774 # print("LINKAGE ITEM CONTENTS:", parts) 

2775 # print("CLEANED ITEM: {!r}".format(item)) 

2776 # print(f"URLS {urls=!r}") 

2777 

2778 return parse_linkage_item_text( 

2779 wxr, 

2780 word, 

2781 data, 

2782 field, 

2783 item, 

2784 sense, 

2785 ruby, 

2786 pos_datas, 

2787 is_reconstruction, 

2788 urls or None, 

2789 links_that_should_not_be_split or None, 

2790 ) 

2791 

2792 def parse_linkage_recurse( 

2793 contents: list[Union[WikiNode, str]], 

2794 field: str, 

2795 sense: Optional[str], 

2796 ) -> None: 

2797 assert isinstance(contents, (list, tuple)) 

2798 assert sense is None or isinstance(sense, str) 

2799 nonlocal next_navframe_sense 

2800 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

2801 for node in contents: 

2802 if isinstance(node, str): 

2803 # Ignore top-level text, generally comments before the 

2804 # linkages list. However, if no linkages are found, then 

2805 # use this for linkages (not all words use bullet points 

2806 # for linkages). 

2807 toplevel_text.append(node) 

2808 continue 

2809 assert isinstance(node, WikiNode) 

2810 kind = node.kind 

2811 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

2812 if kind == NodeKind.LIST: 

2813 parse_linkage_recurse(node.children, field, sense) 

2814 elif kind == NodeKind.LIST_ITEM: 

2815 v = parse_linkage_item(node.children, field, sense) 

2816 if v: 2816 ↛ 2820line 2816 didn't jump to line 2820 because the condition on line 2816 was never true

2817 # parse_linkage_item() can return a value that should 

2818 # be used as the sense for the follow-on linkages, 

2819 # which are typically provided in a table (see 滿) 

2820 next_navframe_sense = v 

2821 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

2822 parse_linkage_recurse(node.children, field, sense) 

2823 elif kind == NodeKind.TABLE_CELL: 

2824 parse_linkage_item(node.children, field, sense) 

2825 elif kind in ( 

2826 NodeKind.TABLE_CAPTION, 

2827 NodeKind.TABLE_HEADER_CELL, 

2828 NodeKind.PREFORMATTED, 

2829 NodeKind.BOLD, 

2830 ): 

2831 continue 

2832 elif kind == NodeKind.HTML: 2832 ↛ 2834line 2832 didn't jump to line 2834 because the condition on line 2832 was never true

2833 # Recurse to process inside the HTML for most tags 

2834 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2835 continue 

2836 classes = (node.attrs.get("class") or "").split() 

2837 if node.sarg == "li": 

2838 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑ 

2839 v = parse_linkage_item(node.children, field, sense) 

2840 if v: 

2841 next_navframe_sense = v 

2842 elif "qualifier-content" in classes: 

2843 sense1 = clean_node(wxr, None, node.children) 

2844 if sense1.endswith(":"): 

2845 sense1 = sense1[:-1].strip() 

2846 if sense and sense1: 

2847 wxr.wtp.debug( 

2848 "linkage qualifier-content on multiple " 

2849 "levels: {!r} and {!r}".format(sense, sense1), 

2850 sortid="page/2170", 

2851 ) 

2852 parse_linkage_recurse(node.children, field, sense1) 

2853 elif "NavFrame" in classes: 

2854 # NavFrame uses previously assigned next_navframe_sense 

2855 # (from a "(sense):" item) and clears it afterwards 

2856 parse_linkage_recurse( 

2857 node.children, field, sense or next_navframe_sense 

2858 ) 

2859 next_navframe_sense = None 

2860 else: 

2861 parse_linkage_recurse(node.children, field, sense) 

2862 elif kind in LEVEL_KINDS: 2862 ↛ 2864line 2862 didn't jump to line 2864 because the condition on line 2862 was never true

2863 # Just recurse to any possible subsections 

2864 parse_linkage_recurse(node.children, field, sense) 

2865 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

2866 # Skip these on top level; at least sometimes bold is 

2867 # used for indicating a subtitle 

2868 continue 

2869 elif kind == NodeKind.LINK: 2869 ↛ 2875line 2869 didn't jump to line 2875 because the condition on line 2869 was always true

2870 # Recurse into the last argument 

2871 # Apparently ":/" is used as a link to "/", so strip 

2872 # initial value 

2873 parse_linkage_recurse(node.largs[-1], field, sense) 

2874 else: 

2875 wxr.wtp.debug( 

2876 "parse_linkage_recurse unhandled {}: {}".format( 

2877 kind, node 

2878 ), 

2879 sortid="page/2196", 

2880 ) 

2881 

2882 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

2883 nonlocal have_panel_template 

2884 if is_panel_template(wxr, name): 

2885 have_panel_template = True 

2886 return "" 

2887 return None 

2888 

2889 def parse_zh_synonyms( 

2890 parsed: list[Union[WikiNode, str]], 

2891 data: list[LinkageData], 

2892 hdrs: list[str], 

2893 root_word: str, 

2894 ) -> None: 

2895 """Parses Chinese dialectal synonyms tables""" 

2896 for item in parsed: 

2897 if isinstance(item, WikiNode): 

2898 if item.kind == NodeKind.TABLE_ROW: 2898 ↛ 2899line 2898 didn't jump to line 2899 because the condition on line 2898 was never true

2899 cleaned = clean_node(wxr, None, item.children) 

2900 # print("cleaned:", repr(cleaned)) 

2901 if any( 

2902 [ 

2903 "Variety" in cleaned, 

2904 "Location" in cleaned, 

2905 "Words" in cleaned, 

2906 ] 

2907 ): 

2908 pass 

2909 else: 

2910 split = cleaned.split("\n") 

2911 new_hdrs = split[:-1] 

2912 if len(new_hdrs) == 2: 

2913 hdrs = [new_hdrs[0]] 

2914 new_hdrs.pop(0) 

2915 combined_hdrs = [x.strip() for x in hdrs + new_hdrs] 

2916 tags = [] 

2917 words = split[-1].split(",") 

2918 for hdr in combined_hdrs: 

2919 hdr = hdr.replace("(", ",") 

2920 hdr = hdr.replace(")", "") 

2921 hdr = hdr.replace("N.", "Northern,") 

2922 hdr = hdr.replace("S.", "Southern,") 

2923 new = hdr.split(",") 

2924 for tag in sorted(new): 

2925 tag = tag.strip() 

2926 tag = tag.replace(" ", "-") 

2927 if tag in valid_tags: 

2928 tags.append(tag) 

2929 else: 

2930 if tag in zh_tag_lookup: 

2931 tags.extend(zh_tag_lookup[tag]) 

2932 else: 

2933 print( 

2934 f"MISSING ZH SYNONYM TAG for " 

2935 f"root {root_word}, word " 

2936 f"{words}: {tag}" 

2937 ) 

2938 sys.stdout.flush() 
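# (Editor's illustrative note: a header like "Mandarin (Beijing)" becomes
# "Mandarin ,Beijing" after the replacements above and is split into the
# candidate tags "Mandarin" and "Beijing", which are kept only if they
# appear in valid_tags or can be mapped through zh_tag_lookup.)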

2939 

2940 for word in words: 

2941 data.append( 

2942 {"word": word.strip(), "tags": tags} 

2943 ) 

2944 elif item.kind == NodeKind.HTML: 2944 ↛ 2945line 2944 didn't jump to line 2945 because the condition on line 2944 was never true

2945 cleaned = clean_node(wxr, None, item.children) 

2946 if "Synonyms of" in cleaned: 

2947 cleaned = cleaned.replace("Synonyms of ", "") 

2948 root_word = cleaned 

2949 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2950 else: 

2951 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2952 

2953 def parse_zh_synonyms_list( 

2954 parsed: list[Union[WikiNode, str]], 

2955 data: list[LinkageData], 

2956 hdrs: list[str], 

2957 root_word: str, 

2958 ) -> None: 

2959 """Parses Chinese dialectal synonyms tables (list format)""" 

2960 for item in parsed: 

2961 if isinstance(item, WikiNode): 

2962 if item.kind == NodeKind.LIST_ITEM: 

2963 cleaned = clean_node(wxr, None, item.children) 

2964 # print("cleaned:", repr(cleaned)) 

2965 if any( 

2966 [ 

2967 "Variety" in cleaned, 

2968 "Location" in cleaned, 

2969 "Words" in cleaned, 

2970 ] 

2971 ): 

2972 pass 

2973 else: 

2974 cleaned = cleaned.replace("(", ",") 

2975 cleaned = cleaned.replace(")", "") 

2976 split = cleaned.split(",") 

2977 # skip empty words / titles 

2978 if split[0] == "": 

2979 continue 

2980 words = split[0].split("/") 

2981 new_hdrs = [x.strip() for x in split[1:]] 

2982 tags = [] 

2983 roman = None 

2984 for tag in sorted(new_hdrs): 

2985 if tag in valid_tags: 

2986 tags.append(tag) 

2987 elif tag in zh_tag_lookup: 

2988 tags.extend(zh_tag_lookup[tag]) 

2989 elif ( 

2990 classify_desc(tag) == "romanization" 

2991 and roman is None 

2992 ): 

2993 roman = tag 

2994 else: 

2995 print( 

2996 f"MISSING ZH SYNONYM TAG " 

2997 f"(possibly pinyin) - root " 

2998 f"{root_word}, word {words}: {tag}" 

2999 ) 

3000 sys.stdout.flush() 

3001 

3002 for word in words: 

3003 dt: LinkageData = {"word": word.strip()} 

3004 if tags: 

3005 dt["tags"] = tags 

3006 if roman is not None: 

3007 dt["roman"] = roman 

3008 data.append(dt) 

3009 elif item.kind == NodeKind.HTML: 

3010 cleaned = clean_node(wxr, None, item.children) 

3011 if cleaned.find("Synonyms of") >= 0: 

3012 cleaned = cleaned.replace("Synonyms of ", "") 

3013 root_word = cleaned 

3014 parse_zh_synonyms_list( 

3015 item.children, data, hdrs, root_word 

3016 ) 

3017 else: 

3018 parse_zh_synonyms_list( 

3019 item.children, data, hdrs, root_word 

3020 ) 

3021 

3022 def contains_kind( 

3023 children: list[Union[WikiNode, str]], nodekind: NodeKind 

3024 ) -> bool: 

3025 assert isinstance(children, list) 

3026 for item in children: 

3027 if not isinstance(item, WikiNode): 

3028 continue 

3029 if item.kind == nodekind: 3029 ↛ 3030line 3029 didn't jump to line 3030 because the condition on line 3029 was never true

3030 return True 

3031 elif contains_kind(item.children, nodekind): 3031 ↛ 3032line 3031 didn't jump to line 3032 because the condition on line 3031 was never true

3032 return True 

3033 return False 

3034 

3035 # Main body of parse_linkage() 

3036 text = wxr.wtp.node_to_wikitext(linkagenode.children) 

3037 parsed = wxr.wtp.parse( 

3038 text, expand_all=True, template_fn=linkage_template_fn1 

3039 ) 

3040 if field == "synonyms" and lang_code == "zh": 

3041 synonyms: list[LinkageData] = [] 

3042 if contains_kind(parsed.children, NodeKind.LIST): 3042 ↛ 3043line 3042 didn't jump to line 3043 because the condition on line 3042 was never true

3043 parse_zh_synonyms_list(parsed.children, synonyms, [], "") 

3044 else: 

3045 parse_zh_synonyms(parsed.children, synonyms, [], "") 

3046 # print(json.dumps(synonyms, indent=4, ensure_ascii=False)) 

3047 data_extend(data, "synonyms", synonyms) 

3048 parse_linkage_recurse(parsed.children, field, None) 

3049 if not data.get(field) and not have_panel_template: 

3050 text = "".join(toplevel_text).strip() 

3051 if "\n" not in text and "," in text and text.count(",") > 3: 

3052 if not text.startswith("See "): 3052 ↛ exitline 3052 didn't return from function 'parse_linkage' because the condition on line 3052 was always true

3053 parse_linkage_item([text], field, None) 

3054 

3055 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

3056 """Parses translations for a word. This may also pull in translations 

3057 from separate translation subpages.""" 

3058 assert isinstance(data, dict) 

3059 assert isinstance(xlatnode, WikiNode) 

3060 # print("===== PARSE_TRANSLATIONS {} {} {}" 

3061 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

3062 # print("parse_translations xlatnode={}".format(xlatnode)) 

3063 if not wxr.config.capture_translations: 3063 ↛ 3064line 3063 didn't jump to line 3064 because the condition on line 3063 was never true

3064 return 

3065 sense_parts: list[Union[WikiNode, str]] = [] 

3066 sense: Optional[str] = None 

3067 

3068 def parse_translation_item( 

3069 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

3070 ) -> None: 

3071 nonlocal sense 

3072 assert isinstance(contents, list) 

3073 assert lang is None or isinstance(lang, str) 

3074 # print("PARSE_TRANSLATION_ITEM:", contents) 

3075 

3076 langcode: Optional[str] = None 

3077 if sense is None: 

3078 sense = clean_node(wxr, data, sense_parts).strip() 

3079 # print("sense <- clean_node: ", sense) 

3080 idx = sense.find("See also translations at") 

3081 if idx > 0: 3081 ↛ 3082line 3081 didn't jump to line 3082 because the condition on line 3081 was never true

3082 wxr.wtp.debug( 

3083 "Skipping translation see also: {}".format(sense), 

3084 sortid="page/2361", 

3085 ) 

3086 sense = sense[:idx].strip() 

3087 if sense.endswith(":"): 3087 ↛ 3088line 3087 didn't jump to line 3088 because the condition on line 3087 was never true

3088 sense = sense[:-1].strip() 

3089 if sense.endswith("—"): 3089 ↛ 3090line 3089 didn't jump to line 3090 because the condition on line 3089 was never true

3090 sense = sense[:-1].strip() 

3091 translations_from_template: list[str] = [] 

3092 

3093 def translation_item_template_fn( 

3094 name: str, ht: TemplateArgs 

3095 ) -> Optional[str]: 

3096 nonlocal langcode 

3097 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

3098 if is_panel_template(wxr, name): 

3099 return "" 

3100 if name in ("t+check", "t-check", "t-needed"): 

3101 # We ignore these templates. They seem to have outright 

3103 # garbage in some entries, and widely varying formatting in 

3103 # others. These should be transitory and unreliable 

3104 # anyway. 

3105 return "__IGNORE__" 

3106 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

3107 code = ht.get(1) 

3108 if code: 3108 ↛ 3118line 3108 didn't jump to line 3118 because the condition on line 3108 was always true

3109 if langcode and code != langcode: 

3110 wxr.wtp.debug( 

3111 "inconsistent language codes {} vs " 

3112 "{} in translation item: {!r} {}".format( 

3113 langcode, code, name, ht 

3114 ), 

3115 sortid="page/2386", 

3116 ) 

3117 langcode = code 

3118 tr = ht.get(2) 

3119 if tr: 

3120 tr = clean_node(wxr, None, [tr]) 

3121 translations_from_template.append(tr) 

3122 return None 

3123 if name == "t-egy": 

3124 langcode = "egy" 

3125 return None 

3126 if name == "ttbc": 

3127 code = ht.get(1) 

3128 if code: 3128 ↛ 3130line 3128 didn't jump to line 3130 because the condition on line 3128 was always true

3129 langcode = code 

3130 return None 

3131 if name == "trans-see": 3131 ↛ 3132line 3131 didn't jump to line 3132 because the condition on line 3131 was never true

3132 wxr.wtp.error( 

3133 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

3134 ) 

3135 return "" 

3136 if name.endswith("-top"): 3136 ↛ 3137line 3136 didn't jump to line 3137 because the condition on line 3136 was never true

3137 return "" 

3138 if name.endswith("-bottom"): 3138 ↛ 3139line 3138 didn't jump to line 3139 because the condition on line 3138 was never true

3139 return "" 

3140 if name.endswith("-mid"): 3140 ↛ 3141line 3140 didn't jump to line 3141 because the condition on line 3140 was never true

3141 return "" 

3142 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

3143 # .format(name), 

3144 # sortid="page/2414") 

3145 return None 

3146 

3147 sublists = list( 

3148 x 

3149 for x in contents 

3150 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

3151 ) 

3152 contents = list( 

3153 x 

3154 for x in contents 

3155 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

3156 ) 

3157 

3158 item = clean_node( 

3159 wxr, data, contents, template_fn=translation_item_template_fn 

3160 ) 

3161 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

3162 

3163 # Parse the translation item. 

3164 if item: 3164 ↛ exitline 3164 didn't return from function 'parse_translation_item' because the condition on line 3164 was always true

3165 lang = parse_translation_item_text( 

3166 wxr, 

3167 word, 

3168 data, 

3169 item, 

3170 sense, 

3171 lang, 

3172 langcode, 

3173 translations_from_template, 

3174 is_reconstruction, 

3175 ) 

3176 

3177 # Handle sublists. They are frequently used for different 

3178 # scripts for the language and different variants of the 

3179 # language. We will include the lower-level header as a 

3180 # tag in those cases. 

3181 for listnode in sublists: 

3182 assert listnode.kind == NodeKind.LIST 

3183 for node in listnode.children: 

3184 if not isinstance(node, WikiNode): 3184 ↛ 3185line 3184 didn't jump to line 3185 because the condition on line 3184 was never true

3185 continue 

3186 if node.kind == NodeKind.LIST_ITEM: 3186 ↛ 3183line 3186 didn't jump to line 3183 because the condition on line 3186 was always true

3187 parse_translation_item(node.children, lang=lang) 

3188 

3189 def parse_translation_template(node: WikiNode) -> None: 

3190 assert isinstance(node, WikiNode) 

3191 

3192 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3193 nonlocal sense_parts 

3194 nonlocal sense 

3195 if is_panel_template(wxr, name): 

3196 return "" 

3197 if name == "see also": 

3198 # XXX capture 

3199 # XXX for example, "/" has top-level list containing 

3200 # see also items. So we should also parse those. 

3201 return "" 

3202 if name == "trans-see": 

3203 # XXX capture 

3204 return "" 

3205 if name == "see translation subpage": 3205 ↛ 3206line 3205 didn't jump to line 3206 because the condition on line 3205 was never true

3206 sense_parts = [] 

3207 sense = None 

3208 sub = ht.get(1, "") 

3209 if sub: 

3210 m = re.match( 

3211 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

3212 ) 

3213 else: 

3214 m = None 

3215 etym = "" 

3216 etym_numbered = "" 

3217 pos = "" 

3218 if m: 

3219 etym_numbered = m.group(1) 

3220 etym = m.group(2) 

3221 pos = m.group(3) 

3222 if not sub: 

3223 wxr.wtp.debug( 

3224 "no part-of-speech in " 

3225 "{{see translation subpage|...}}, " 

3226 "defaulting to just wxr.wtp.section " 

3227 "(= language)", 

3228 sortid="page/2468", 

3229 ) 

3230 # seq sent to get_subpage_section without sub and pos 

3231 seq = [ 

3232 language, 

3233 TRANSLATIONS_TITLE, 

3234 ] 

3235 elif ( 

3236 m 

3237 and etym.lower().strip() in ETYMOLOGY_TITLES 

3238 and pos.lower() in POS_TITLES 

3239 ): 

3240 seq = [ 

3241 language, 

3242 etym_numbered, 

3243 pos, 

3244 TRANSLATIONS_TITLE, 

3245 ] 

3246 elif sub.lower() in POS_TITLES: 

3247 # seq with sub but not pos 

3248 seq = [ 

3249 language, 

3250 sub, 

3251 TRANSLATIONS_TITLE, 

3252 ] 

3253 else: 

3254 # seq with sub and pos 

3255 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3256 if pos.lower() not in POS_TITLES: 

3257 wxr.wtp.debug( 

3258 "unhandled see translation subpage: " 

3259 "language={} sub={} " 

3260 "wxr.wtp.subsection={}".format( 

3261 language, sub, wxr.wtp.subsection 

3262 ), 

3263 sortid="page/2478", 

3264 ) 

3265 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

3266 subnode = get_subpage_section( 

3267 wxr.wtp.title or "MISSING_TITLE", 

3268 TRANSLATIONS_TITLE, 

3269 [seq], 

3270 ) 

3271 if subnode is None or not isinstance(subnode, WikiNode): 

3272 # Failed to find the normal subpage section 

3273 # seq with sub and pos 

3274 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3275 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}") 

3276 seqs: list[list[str] | tuple[str, ...]] = [ 

3277 [TRANSLATIONS_TITLE], 

3278 [language, pos], 

3279 ] 

3280 subnode = get_subpage_section( 

3281 wxr.wtp.title or "MISSING_TITLE", 

3282 TRANSLATIONS_TITLE, 

3283 seqs, 

3284 ) 

3285 if subnode is not None and isinstance( 

3286 subnode, WikiNode 

3287 ): 

3288 parse_translations(data, subnode) 

3289 return "" 

3290 if name in ( 

3291 "c", 

3292 "C", 

3293 "categorize", 

3294 "cat", 

3295 "catlangname", 

3296 "topics", 

3297 "top", 

3298 "qualifier", 

3299 "cln", 

3300 ): 

3301 # These are expanded in the default way 

3302 return None 

3303 if name in ( 

3304 "trans-top", 

3305 "trans-top-see", 

3306 ): 

3307 # XXX capture id from trans-top? Capture sense here 

3308 # instead of trying to parse it from expanded content? 

3309 if ht.get(1): 

3310 sense_parts = [] 

3311 sense = ht.get(1) 

3312 else: 

3313 sense_parts = [] 

3314 sense = None 

3315 return None 

3316 if name in ( 

3317 "trans-bottom", 

3318 "trans-mid", 

3319 "checktrans-mid", 

3320 "checktrans-bottom", 

3321 ): 

3322 return None 

3323 if name == "checktrans-top": 

3324 sense_parts = [] 

3325 sense = None 

3326 return "" 

3327 if name == "trans-top-also": 

3328 # XXX capture? 

3329 sense_parts = [] 

3330 sense = None 

3331 return "" 

3332 wxr.wtp.error( 

3333 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

3334 name, ht 

3335 ), 

3336 sortid="page/2517", 

3337 ) 

3338 return "" 

3339 

3340 wxr.wtp.expand( 

3341 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

3342 ) 

3343 

3344 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

3345 nonlocal sense 

3346 nonlocal sense_parts 

3347 for node in xlatnode.children: 

3348 # print(node) 

3349 if isinstance(node, str): 

3350 if sense: 

3351 if not node.isspace(): 

3352 wxr.wtp.debug( 

3353 "skipping string in the middle of " 

3354 "translations: {}".format(node), 

3355 sortid="page/2530", 

3356 ) 

3357 continue 

3358 # Add a part to the sense 

3359 sense_parts.append(node) 

3360 sense = None 

3361 continue 

3362 assert isinstance(node, WikiNode) 

3363 kind = node.kind 

3364 if kind == NodeKind.LIST: 

3365 for item in node.children: 

3366 if not isinstance(item, WikiNode): 3366 ↛ 3367line 3366 didn't jump to line 3367 because the condition on line 3366 was never true

3367 continue 

3368 if item.kind != NodeKind.LIST_ITEM: 3368 ↛ 3369line 3368 didn't jump to line 3369 because the condition on line 3368 was never true

3369 continue 

3370 if item.sarg == ":": 3370 ↛ 3371line 3370 didn't jump to line 3371 because the condition on line 3370 was never true

3371 continue 

3372 parse_translation_item(item.children) 

3373 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3373 ↛ 3377line 3373 didn't jump to line 3377 because the condition on line 3373 was never true

3374 # Silently skip list items that are just indented; these 

3375 # are used for text between translations, such as indicating 

3376 # translations that need to be checked. 

3377 pass 

3378 elif kind == NodeKind.TEMPLATE: 

3379 parse_translation_template(node) 

3380 elif kind in ( 3380 ↛ 3385line 3380 didn't jump to line 3385 because the condition on line 3380 was never true

3381 NodeKind.TABLE, 

3382 NodeKind.TABLE_ROW, 

3383 NodeKind.TABLE_CELL, 

3384 ): 

3385 parse_translation_recurse(node) 

3386 elif kind == NodeKind.HTML: 

3387 if node.attrs.get("class") == "NavFrame": 3387 ↛ 3393line 3387 didn't jump to line 3393 because the condition on line 3387 was never true

3388 # Reset ``sense_parts`` (and force recomputing 

3389 # by clearing ``sense``) as each NavFrame specifies 

3390 # its own sense. This helps eliminate garbage coming 

3391 # from text at the beginning at the translations 

3392 # section. 

3393 sense_parts = [] 

3394 sense = None 

3395 # for item in node.children: 

3396 # if not isinstance(item, WikiNode): 

3397 # continue 

3398 # parse_translation_recurse(item) 

3399 parse_translation_recurse(node) 

3400 elif kind in LEVEL_KINDS: 3400 ↛ 3402line 3400 didn't jump to line 3402 because the condition on line 3400 was never true

3401 # Sub-levels will be recursed elsewhere 

3402 pass 

3403 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 

3404 parse_translation_recurse(node) 

3405 elif kind == NodeKind.PREFORMATTED: 3405 ↛ 3406line 3405 didn't jump to line 3406 because the condition on line 3405 was never true

3406 print("parse_translation_recurse: PREFORMATTED:", node) 

3407 elif kind == NodeKind.LINK: 3407 ↛ 3461line 3407 didn't jump to line 3461 because the condition on line 3407 was always true

3408 arg0 = node.largs[0] 

3409 # Kludge: I've seen occasional normal links to translation 

3410 # subpages from main pages (e.g., language/English/Noun 

3411 # in July 2021) instead of the normal 

3412 # {{see translation subpage|...}} template. This should 

3413 # handle them. Note: must be careful not to read other 

3414 # links, particularly things like in "human being": 

3415 # "a human being -- see [[man/translations]]" (group title) 

3416 if ( 3416 ↛ 3424line 3416 didn't jump to line 3424 because the condition on line 3416 was never true

3417 isinstance(arg0, (list, tuple)) 

3418 and arg0 

3419 and isinstance(arg0[0], str) 

3420 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3421 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3422 == wxr.wtp.title 

3423 ): 

3424 wxr.wtp.debug( 

3425 "translations subpage link found on main " 

3426 "page instead " 

3427 "of normal {{see translation subpage|...}}", 

3428 sortid="page/2595", 

3429 ) 

3430 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3431 if sub.lower() in POS_TITLES: 

3432 seq = [ 

3433 language, 

3434 sub, 

3435 TRANSLATIONS_TITLE, 

3436 ] 

3437 subnode = get_subpage_section( 

3438 wxr.wtp.title, 

3439 TRANSLATIONS_TITLE, 

3440 [seq], 

3441 ) 

3442 if subnode is not None and isinstance( 

3443 subnode, WikiNode 

3444 ): 

3445 parse_translations(data, subnode) 

3446 else: 

3447 wxr.wtp.error( 

3448 "/translations link outside part-of-speech" 

3449 ) 

3450 

3451 if ( 

3452 len(arg0) >= 1 

3453 and isinstance(arg0[0], str) 

3454 and not arg0[0].lower().startswith("category:") 

3455 ): 

3456 for x in node.largs[-1]: 

3457 if isinstance(x, str): 3457 ↛ 3460line 3457 didn't jump to line 3460 because the condition on line 3457 was always true

3458 sense_parts.append(x) 

3459 else: 

3460 parse_translation_recurse(x) 

3461 elif not sense: 

3462 sense_parts.append(node) 

3463 else: 

3464 wxr.wtp.debug( 

3465 "skipping text between translation items/senses: " 

3466 "{}".format(node), 

3467 sortid="page/2621", 

3468 ) 

3469 

3470 # Main code of parse_translations(). We want ``sense`` to be assigned 

3471 # regardless of recursion levels, and thus the code is structured 

3472 # to define it at this level and recurse in parse_translation_recurse(). 

3473 parse_translation_recurse(xlatnode) 

3474 
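
A minimal standalone sketch (not part of page.py; names are illustrative) of how the {{t}}/{{t+}} handler in translation_item_template_fn above reads its arguments: positional argument 1 of the template carries the language code and argument 2 the translated term. The real handler additionally cross-checks the code against any earlier langcode and runs the term through clean_node.

from typing import Optional

def pick_translation(ht: dict) -> tuple[Optional[str], Optional[str]]:
    # {{t|fi|sana}} arrives roughly as {1: "fi", 2: "sana"}
    code = ht.get(1)  # positional arg 1: language code
    term = ht.get(2)  # positional arg 2: translated term
    return code, term

assert pick_translation({1: "fi", 2: "sana"}) == ("fi", "sana")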

3475 def parse_etymology(data: WordData, node: WikiNode) -> None: 

3476 """Parses an etymology section.""" 

3477 assert isinstance(data, dict) 

3478 assert isinstance(node, WikiNode) 

3479 

3480 templates: list[TemplateData] = [] 

3481 

3482 # Counter for preventing the capture of etymology templates 

3483 # when we are inside templates that we want to ignore (i.e., 

3484 # not capture). 

3485 ignore_count = 0 

3486 

3487 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3488 nonlocal ignore_count 

3489 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3490 return "" 

3491 if re.match(ignored_etymology_templates_re, name): 

3492 ignore_count += 1 

3493 return None 

3494 

3495 # CONTINUE_HERE 

3496 

3497 def etym_post_template_fn( 

3498 name: str, ht: TemplateArgs, expansion: str 

3499 ) -> None: 

3500 nonlocal ignore_count 

3501 if name in wikipedia_templates: 

3502 parse_wikipedia_template(wxr, data, ht) 

3503 return None 

3504 if re.match(ignored_etymology_templates_re, name): 

3505 ignore_count -= 1 

3506 return None 

3507 if ignore_count == 0: 3507 ↛ 3513line 3507 didn't jump to line 3513 because the condition on line 3507 was always true

3508 ht = clean_template_args(wxr, ht) 

3509 expansion = clean_node(wxr, None, expansion) 

3510 templates.append( 

3511 {"name": name, "args": ht, "expansion": expansion} 

3512 ) 

3513 return None 

3514 

3515 # Remove any subsections 

3516 contents = list( 

3517 x 

3518 for x in node.children 

3519 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3520 ) 

3521 # Convert to text, also capturing templates using post_template_fn 

3522 text = clean_node( 

3523 wxr, 

3524 None, 

3525 contents, 

3526 template_fn=etym_template_fn, 

3527 post_template_fn=etym_post_template_fn, 

3528 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3529 # Save the collected information. 

3530 if len(text) > 0: 

3531 data["etymology_text"] = text 

3532 if len(templates) > 0: 

3533 # Some etymology templates, like Template:root, do not generate 

3534 # text, so they should be added here. Elsewhere, we check 

3535 # for Template:root and add some text to the expansion to please 

3536 # the validation. 

3537 data["etymology_templates"] = templates 

3538 

3539 for child_node in node.find_child_recursively( 3539 ↛ exitline 3539 didn't return from function 'parse_etymology' because the loop on line 3539 didn't complete

3540 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3541 ): 

3542 if child_node.kind in LEVEL_KIND_FLAGS: 

3543 break 

3544 elif isinstance( 3544 ↛ 3547line 3544 didn't jump to line 3547 because the condition on line 3544 was never true

3545 child_node, TemplateNode 

3546 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3547 if "etymology_examples" not in data: 

3548 data["etymology_examples"] = [] 

3549 data["etymology_examples"].extend( 

3550 extract_template_zh_x( 

3551 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3552 ) 

3553 ) 

3554 
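
The ignore_count counter used above is worth highlighting: etym_template_fn increments it when an ignored template is entered, etym_post_template_fn decrements it on the way out, and template data is recorded only while the counter is zero, so templates nested inside ignored ones are skipped as well. A standalone sketch of that pattern (the regex and collected data are stand-ins, not the real ignored_etymology_templates_re):

import re

IGNORED_RE = re.compile(r"^(rfe|etystub)$")  # stand-in ignore list
captured: list[str] = []
ignore_count = 0

def template_fn(name: str) -> None:
    global ignore_count
    if IGNORED_RE.match(name):
        ignore_count += 1  # entering an ignored template

def post_template_fn(name: str) -> None:
    global ignore_count
    if IGNORED_RE.match(name):
        ignore_count -= 1  # leaving an ignored template
        return
    if ignore_count == 0:
        captured.append(name)  # capture only outside ignored templates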

3555 def parse_descendants( 

3556 data: WordData, node: WikiNode, is_proto_root_derived_section=False 

3557 ) -> None: 

3558 """Parses a Descendants section. Also used on Derived terms and 

3559 Extensions sections when we are dealing with a root of a reconstructed 

3560 language (i.e. is_proto_root_derived_section == True), as they use the 

3561 same structure. In the latter case, the Wiktionary convention is not to 

3562 title the section as descendants since the immediate offspring of the 

3563 roots are morphologically derived terms within the same proto-language. 

3564 Still, since the rest of the section lists true descendants, we use the 

3565 same function. Entries in the descendants list that are technically 

3566 derived terms will have a field "tags": ["derived"].""" 

3567 assert isinstance(data, dict) 

3568 assert isinstance(node, WikiNode) 

3569 assert isinstance(is_proto_root_derived_section, bool) 

3570 

3571 descendants = [] 

3572 

3573 # Most templates that are not in a LIST should be ignored as they only 

3574 # add formatting, like "desc-top", "der-top3", etc. Any template in 

3575 # unignored_non_list_templates actually contains relevant descendant 

3576 # info. E.g. "CJKV" is often the only line at all in descendants 

3577 # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would 

3578 # be skipped if we didn't handle it specially as it is not part of a 

3579 # LIST, and additionally is in panel_templates. There are probably more 

3580 # such templates that should be added to this... 

3581 unignored_non_list_templates: list[str] = ["CJKV"] 

3582 

3583 def process_list_item_children( 

3584 sarg: str, children: list[Union[str, WikiNode]] 

3585 ) -> None: 

3586 assert isinstance(sarg, str) 

3587 assert isinstance(children, list) 

3588 # The descendants section is a hierarchical bulleted list. sarg is 

3589 # usually some number of "*" characters indicating the level of 

3590 # indentation of the line, e.g. "***" indicates the line will be 

3591 # thrice-indented. A bare ";" is used to indicate a subtitle-like 

3592 # line with no indentation. ":" at the end of one or more "*"s is 

3593 # used to indicate that the bullet will not be displayed. 

3594 item_data: DescendantData = {"depth": sarg.count("*")} 

3595 templates: list[TemplateData] = [] 

3596 is_derived = False 

3597 

3598 # Counter for preventing the capture of templates when we are inside 

3599 # templates that we want to ignore (i.e., not capture). 

3600 ignore_count = 0 

3601 

3602 def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3603 nonlocal ignore_count 

3604 if ( 3604 ↛ 3608line 3604 didn't jump to line 3608 because the condition on line 3604 was never true

3605 is_panel_template(wxr, name) 

3606 and name not in unignored_non_list_templates 

3607 ): 

3608 return "" 

3609 if re.match(ignored_descendants_templates_re, name): 

3610 ignore_count += 1 

3611 return None 

3612 

3613 def desc_post_template_fn( 

3614 name: str, ht: TemplateArgs, expansion: str 

3615 ) -> None: 

3616 nonlocal ignore_count 

3617 if name in wikipedia_templates: 3617 ↛ 3618line 3617 didn't jump to line 3618 because the condition on line 3617 was never true

3618 parse_wikipedia_template(wxr, data, ht) 

3619 return None 

3620 if re.match(ignored_descendants_templates_re, name): 

3621 ignore_count -= 1 

3622 return None 

3623 if ignore_count == 0: 3623 ↛ 3639line 3623 didn't jump to line 3639 because the condition on line 3623 was always true

3624 ht = clean_template_args(wxr, ht) 

3625 nonlocal is_derived 

3626 # If we're in a proto-root Derived terms or Extensions 

3627 # section, and the current list item has a link template 

3628 # to a term in the same proto-language, then we tag this 

3629 # descendant entry with "derived" 

3630 is_derived = ( 

3631 is_proto_root_derived_section 

3632 and (name == "l" or name == "link") 

3633 and ("1" in ht and ht["1"] == lang_code) 

3634 ) 

3635 expansion = clean_node(wxr, None, expansion) 

3636 templates.append( 

3637 {"name": name, "args": ht, "expansion": expansion} 

3638 ) 

3639 return None 

3640 

3641 text = clean_node( 

3642 wxr, 

3643 None, 

3644 children, 

3645 template_fn=desc_template_fn, 

3646 post_template_fn=desc_post_template_fn, 

3647 ) 

3648 item_data["templates"] = templates 

3649 item_data["text"] = text 

3650 if is_derived: 3650 ↛ 3651line 3650 didn't jump to line 3651 because the condition on line 3650 was never true

3651 item_data["tags"] = ["derived"] 

3652 descendants.append(item_data) 

3653 

3654 def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]: 

3655 for i, child in enumerate(node.children): 

3656 if isinstance(child, WikiNode): 

3657 yield (i, child) 

3658 

3659 def get_sublist_index(list_item: WikiNode) -> Optional[int]: 

3660 for i, child in node_children(list_item): 

3661 if child.kind == NodeKind.LIST: 

3662 return i 

3663 return None 

3664 

3665 def get_descendants(node: WikiNode) -> None: 

3666 """Appends the data for every list item in every list in node 

3667 to descendants.""" 

3668 for _, c in node_children(node): 

3669 if ( 

3670 c.kind == NodeKind.TEMPLATE 

3671 and c.largs 

3672 and len(c.largs[0]) == 1 

3673 and isinstance(c.largs[0][0], str) 

3674 and c.largs[0][0] in unignored_non_list_templates 

3675 ): 

3676 # Some Descendants sections have no wikitext list. Rather, 

3677 # the list is entirely generated by a single template (see 

3678 # e.g. the use of {{CJKV}} in Chinese entries). 

3679 process_list_item_children("", [c]) 

3680 elif c.kind == NodeKind.HTML: 3680 ↛ 3686line 3680 didn't jump to line 3686 because the condition on line 3680 was never true

3681 # The Descendants sections for many languages feature 

3682 # templates that generate html to add styling (e.g. using 

3683 # multiple columns) to the list, so that the actual wikitext 

3684 # list items are found within a <div>. We look within the 

3685 # children of the html node for the actual list items. 

3686 get_descendants(c) 

3687 elif c.kind == NodeKind.LIST: 

3688 get_descendants(c) 

3689 elif c.kind == NodeKind.LIST_ITEM: 

3690 # If a LIST_ITEM has subitems in a sublist, usually its 

3691 # last child is a LIST. However, sometimes after the LIST 

3692 # there is one or more trailing LIST_ITEMs, like "\n" or 

3693 # a reference template. If there is a sublist, we discard 

3694 # everything after it. 

3695 i = get_sublist_index(c) 

3696 if i is not None: 

3697 process_list_item_children(c.sarg, c.children[:i]) 

3698 get_descendants(c.children[i]) # type: ignore[arg-type] 

3699 else: 

3700 process_list_item_children(c.sarg, c.children) 

3701 

3702 # parse_descendants() actual work starts here 

3703 get_descendants(node) 

3704 

3705 # E.g. on a PIE page there may be both Derived terms and Extensions 

3706 # sections, in which case this function will be called multiple times, 

3707 # so we have to check if descendants exists first. 

3708 if "descendants" in data: 3708 ↛ 3709line 3708 didn't jump to line 3709 because the condition on line 3708 was never true

3709 data["descendants"].extend(descendants) 

3710 else: 

3711 data["descendants"] = descendants 

3712 
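
A tiny worked example of the depth computation described in process_list_item_children above: the wikitext list marker (sarg) is turned into an indentation depth simply by counting its "*" characters, so "***" is depth 3 and a bare ";" subtitle line is depth 0.

def depth_from_sarg(sarg: str) -> int:
    return sarg.count("*")

assert depth_from_sarg("***") == 3   # thrice-indented bullet
assert depth_from_sarg("**:") == 2   # ":" suffix hides the bullet, depth unchanged
assert depth_from_sarg(";") == 0     # subtitle-like line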

3713 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3714 """This recurses into a subtree in the parse tree for a page.""" 

3715 nonlocal etym_data 

3716 nonlocal pos_data 

3717 nonlocal inside_level_four 

3718 

3719 redirect_list: list[str] = [] # for `zh-see` template 

3720 

3721 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3722 """This is called for otherwise unprocessed parts of the page. 

3723 We still expand them so that e.g. Category links get captured.""" 

3724 if name in wikipedia_templates: 

3725 data = select_data() 

3726 parse_wikipedia_template(wxr, data, ht) 

3727 return None 

3728 if is_panel_template(wxr, name): 

3729 return "" 

3730 return None 

3731 

3732 for node in treenode.children: 

3733 if not isinstance(node, WikiNode): 

3734 # print(" X{}".format(repr(node)[:40])) 

3735 continue 

3736 if isinstance(node, TemplateNode): 

3737 if process_soft_redirect_template(wxr, node, redirect_list): 

3738 continue 

3739 elif node.template_name == "zh-forms": 

3740 process_zh_forms_templates(wxr, node, base_data) 

3741 

3742 if node.kind not in LEVEL_KINDS: 

3743 # XXX handle e.g. wikipedia links at the top of a language 

3744 # XXX should at least capture "also" at top of page 

3745 if node.kind in ( 

3746 NodeKind.HLINE, 

3747 NodeKind.LIST, 

3748 NodeKind.LIST_ITEM, 

3749 ): 

3750 continue 

3751 # print(" UNEXPECTED: {}".format(node)) 

3752 # Clean the node to collect category links 

3753 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3754 continue 

3755 t = clean_node( 

3756 wxr, etym_data, node.sarg if node.sarg else node.largs 

3757 ) 

3758 t = t.lower() 

3759 # XXX these counts were never implemented fully, and even this 

3760 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3761 wxr.config.section_counts[t] += 1 

3762 # print("PROCESS_CHILDREN: T:", repr(t)) 

3763 if t in IGNORED_TITLES: 

3764 pass 

3765 elif t.startswith(PRONUNCIATION_TITLE): 

3766 # Chinese Pronunciation section kludge; we demote these to 

3767 # be level 4 instead of 3 so that they're part of a larger 

3768 # etymology hierarchy; usually the data here is empty and 

3769 # acts as an intermediate level between POS and Etymology data 

3770 inside_level_four = True 

3771 if t.startswith(PRONUNCIATION_TITLE + " "): 

3772 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3773 # and each of them may have senses under Definition 

3774 push_level_four_section() 

3775 wxr.wtp.start_subsection(None) 

3776 if wxr.config.capture_pronunciation: 3776 ↛ 3858line 3776 didn't jump to line 3858 because the condition on line 3776 was always true

3777 data = select_data() 

3778 parse_pronunciation( 

3779 wxr, 

3780 node, 

3781 data, 

3782 etym_data, 

3783 have_etym, 

3784 base_data, 

3785 lang_code, 

3786 ) 

3787 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3788 push_etym() 

3789 wxr.wtp.start_subsection(None) 

3790 if wxr.config.capture_etymologies: 3790 ↛ 3858line 3790 didn't jump to line 3858 because the condition on line 3790 was always true

3791 m = re.search(r"\s(\d+)$", t) 

3792 if m: 

3793 etym_data["etymology_number"] = int(m.group(1)) 

3794 parse_etymology(etym_data, node) 

3795 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3796 data = select_data() 

3797 parse_descendants(data, node) 

3798 elif ( 3798 ↛ 3804line 3798 didn't jump to line 3804 because the condition on line 3798 was never true

3799 t in PROTO_ROOT_DERIVED_TITLES 

3800 and pos == "root" 

3801 and is_reconstruction 

3802 and wxr.config.capture_descendants 

3803 ): 

3804 data = select_data() 

3805 parse_descendants(data, node, True) 

3806 elif t == TRANSLATIONS_TITLE: 

3807 data = select_data() 

3808 parse_translations(data, node) 

3809 elif t in INFLECTION_TITLES: 

3810 parse_inflection(node, t, pos) 

3811 elif t == "alternative forms": 

3812 extract_alt_form_section(wxr, select_data(), node) 

3813 else: 

3814 lst = t.split() 

3815 while len(lst) > 1 and lst[-1].isdigit(): 3815 ↛ 3816line 3815 didn't jump to line 3816 because the condition on line 3815 was never true

3816 lst = lst[:-1] 

3817 t_no_number = " ".join(lst).lower() 

3818 if t_no_number in POS_TITLES: 

3819 push_pos() 

3820 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3821 pos = dt["pos"] or "MISSING_POS" 

3822 wxr.wtp.start_subsection(t) 

3823 if "debug" in dt: 

3824 wxr.wtp.debug( 

3825 "{} in section {}".format(dt["debug"], t), 

3826 sortid="page/2755", 

3827 ) 

3828 if "warning" in dt: 3828 ↛ 3829line 3828 didn't jump to line 3829 because the condition on line 3828 was never true

3829 wxr.wtp.warning( 

3830 "{} in section {}".format(dt["warning"], t), 

3831 sortid="page/2759", 

3832 ) 

3833 if "error" in dt: 3833 ↛ 3834line 3833 didn't jump to line 3834 because the condition on line 3833 was never true

3834 wxr.wtp.error( 

3835 "{} in section {}".format(dt["error"], t), 

3836 sortid="page/2763", 

3837 ) 

3838 # Parse word senses for the part-of-speech 

3839 parse_part_of_speech(node, pos) 

3840 if "tags" in dt: 

3841 for pdata in pos_datas: 

3842 data_extend(pdata, "tags", dt["tags"]) 

3843 elif t_no_number in LINKAGE_TITLES: 

3844 # print(f"LINKAGE_TITLES NODE {node=}") 

3845 rel = LINKAGE_TITLES[t_no_number] 

3846 data = select_data() 

3847 parse_linkage(data, rel, node) 

3848 elif t_no_number == COMPOUNDS_TITLE: 

3849 data = select_data() 

3850 if wxr.config.capture_compounds: 3850 ↛ 3858line 3850 didn't jump to line 3858 because the condition on line 3850 was always true

3851 parse_linkage(data, "derived", node) 

3852 

3853 # XXX parse interesting templates also from other sections. E.g., 

3854 # {{Letter|...}} in ===See also=== 

3855 # Also <gallery> 

3856 

3857 # Recurse to children of this node, processing subtitles therein 

3858 stack.append(t) 

3859 process_children(node, pos) 

3860 stack.pop() 

3861 

3862 if len(redirect_list) > 0: 

3863 if len(pos_data) > 0: 

3864 pos_data["redirects"] = redirect_list 

3865 if "pos" not in pos_data: 3865 ↛ 3866line 3865 didn't jump to line 3866 because the condition on line 3865 was never true

3866 pos_data["pos"] = "soft-redirect" 

3867 else: 

3868 new_page_data = copy.deepcopy(base_data) 

3869 new_page_data["redirects"] = redirect_list 

3870 if "pos" not in new_page_data: 3870 ↛ 3872line 3870 didn't jump to line 3872 because the condition on line 3870 was always true

3871 new_page_data["pos"] = "soft-redirect" 

3872 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3873 page_datas.append(new_page_data) 

3874 
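
One detail of process_children worth isolating: before a section heading is looked up in POS_TITLES or LINKAGE_TITLES, any trailing number ("Noun 2", "Adjective 3") is stripped and the title lowercased. A standalone sketch of that normalization, mirroring the small loop over lst above:

def strip_trailing_number(title: str) -> str:
    lst = title.split()
    while len(lst) > 1 and lst[-1].isdigit():
        lst = lst[:-1]
    return " ".join(lst).lower()

assert strip_trailing_number("Noun 2") == "noun"
assert strip_trailing_number("Derived terms") == "derived terms"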

3875 def extract_examples( 

3876 others: list[WikiNode], sense_base: SenseData 

3877 ) -> list[ExampleData]: 

3878 """Parses through a list of definitions and quotes to find examples. 

3879 Returns a list of example dicts to be added to sense data. Adds 

3880 meta-data, mostly categories, into sense_base.""" 

3881 assert isinstance(others, list) 

3882 examples: list[ExampleData] = [] 

3883 

3884 for sub in others: 

3885 if not sub.sarg.endswith((":", "*")): 3885 ↛ 3886line 3885 didn't jump to line 3886 because the condition on line 3885 was never true

3886 continue 

3887 for item in sub.children: 

3888 if not isinstance(item, WikiNode): 3888 ↛ 3889line 3888 didn't jump to line 3889 because the condition on line 3888 was never true

3889 continue 

3890 if item.kind != NodeKind.LIST_ITEM: 3890 ↛ 3891line 3890 didn't jump to line 3891 because the condition on line 3890 was never true

3891 continue 

3892 usex_type = None 

3893 example_template_args = [] 

3894 example_template_names = [] 

3895 taxons = set() 

3896 

3897 # Bypass this function when parsing Chinese, Japanese and 

3898 # quotation templates. 

3899 new_example_lists = extract_example_list_item( 

3900 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3901 ) 

3902 if len(new_example_lists) > 0: 

3903 examples.extend(new_example_lists) 

3904 continue 

3905 

3906 def usex_template_fn( 

3907 name: str, ht: TemplateArgs 

3908 ) -> Optional[str]: 

3909 nonlocal usex_type 

3910 if is_panel_template(wxr, name): 

3911 return "" 

3912 if name in usex_templates: 

3913 usex_type = "example" 

3914 example_template_args.append(ht) 

3915 example_template_names.append(name) 

3916 elif name in quotation_templates: 

3917 usex_type = "quotation" 

3918 elif name in taxonomy_templates: 3918 ↛ 3919line 3918 didn't jump to line 3919 because the condition on line 3918 was never true

3919 taxons.update(ht.get(1, "").split()) 

3920 for prefix in template_linkages_to_ignore_in_examples: 

3921 if re.search( 

3922 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3923 ): 

3924 return "" 

3925 return None 

3926 

3927 # bookmark 

3928 ruby: list[tuple[str, str]] = [] 

3929 contents = item.children 

3930 if lang_code == "ja": 

3931 # Capture ruby contents if this is a Japanese language 

3932 # example. 

3933 # print(contents) 

3934 if ( 3934 ↛ 3939line 3934 didn't jump to line 3939 because the condition on line 3934 was never true

3935 contents 

3936 and isinstance(contents, str) 

3937 and re.match(r"\s*$", contents[0]) 

3938 ): 

3939 contents = contents[1:] 

3940 exp = wxr.wtp.parse( 

3941 wxr.wtp.node_to_wikitext(contents), 

3942 # post_template_fn=head_post_template_fn, 

3943 expand_all=True, 

3944 ) 

3945 rub, rest = extract_ruby(wxr, exp.children) 

3946 if rub: 

3947 for rtup in rub: 

3948 ruby.append(rtup) 

3949 contents = rest 

3950 subtext = clean_node( 

3951 wxr, sense_base, contents, template_fn=usex_template_fn 

3952 ) 

3953 

3954 frozen_taxons = frozenset(taxons) 

3955 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3956 

3957 # print(f"{subtext=}") 

3958 subtext = re.sub( 

3959 r"\s*\(please add an English " 

3960 r"translation of this " 

3961 r"(example|usage example|quote)\)", 

3962 "", 

3963 subtext, 

3964 ).strip() 

3965 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3966 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3967 # print("subtext:", repr(subtext)) 

3968 

3969 lines = subtext.splitlines() 

3970 # print(lines) 

3971 

3972 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3973 lines = list( 

3974 x 

3975 for x in lines 

3976 if not re.match( 

3977 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3978 r"Synonym: |Antonym: |Hyponym: |" 

3979 r"Hypernyms: |Derived terms: |" 

3980 r"Related terms: |" 

3981 r"Hypernym: |Derived term: |" 

3982 r"Coordinate terms:|" 

3983 r"Related term: |" 

3984 r"For more quotations using )", 

3985 x, 

3986 ) 

3987 ) 

3988 tr = "" 

3989 ref = "" 

3990 roman = "" 

3991 # for line in lines: 

3992 # print("LINE:", repr(line)) 

3993 # print(classify_desc(line)) 

3994 if len(lines) == 1 and lang_code != "en": 

3995 parts = example_splitter_re.split(lines[0]) 

3996 if ( 3996 ↛ 4004line 3996 didn't jump to line 4004 because the condition on line 3996 was never true

3997 len(parts) > 2 

3998 and len(example_template_args) == 1 

3999 and any( 

4000 ("―" in s) or ("—" in s) 

4001 for s in example_template_args[0].values() 

4002 ) 

4003 ): 

4004 if nparts := synch_splits_with_args( 

4005 lines[0], example_template_args[0] 

4006 ): 

4007 parts = nparts 

4008 if ( 4008 ↛ 4013line 4008 didn't jump to line 4013 because the condition on line 4008 was never true

4009 len(example_template_args) == 1 

4010 and "lit" in example_template_args[0] 

4011 ): 

4012 # ugly brute-force kludge in case there's a lit= arg 

4013 literally = example_template_args[0].get("lit", "") 

4014 if literally: 

4015 literally = ( 

4016 " (literally, “" 

4017 + clean_value(wxr, literally) 

4018 + "”)" 

4019 ) 

4020 else: 

4021 literally = "" 

4022 if ( 4022 ↛ 4061line 4022 didn't jump to line 4061 because the condition on line 4022 was never true

4023 len(example_template_args) == 1 

4024 and len(parts) == 2 

4025 and len(example_template_args[0]) 

4026 - ( 

4027 # horrible kludge to ignore these arguments 

4028 # when calculating how many there are 

4029 sum( 

4030 s in example_template_args[0] 

4031 for s in ( 

4032 "lit", # generates text, but we handle it 

4033 "inline", 

4034 "noenum", 

4035 "nocat", 

4036 "sort", 

4037 ) 

4038 ) 

4039 ) 

4040 == 3 

4041 and clean_value( 

4042 wxr, example_template_args[0].get(2, "") 

4043 ) 

4044 == parts[0].strip() 

4045 and clean_value( 

4046 wxr, 

4047 ( 

4048 example_template_args[0].get(3) 

4049 or example_template_args[0].get("translation") 

4050 or example_template_args[0].get("t", "") 

4051 ) 

4052 + literally, # in case there's a lit= argument 

4053 ) 

4054 == parts[1].strip() 

4055 ): 

4056 # {{exampletemplate|ex|Foo bar baz|English translation}} 

4057 # is a pretty reliable 'heuristic', so we use it here 

4058 # before the others. To be extra sure the template 

4059 # doesn't do anything weird, we compare the arguments 

4060 # and the output to each other. 

4061 lines = [parts[0].strip()] 

4062 tr = parts[1].strip() 

4063 elif ( 

4064 len(parts) == 2 

4065 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

4066 ): 

4067 # These other branches just do some simple heuristics w/ 

4068 # the expanded output of the template (if applicable). 

4069 lines = [parts[0].strip()] 

4070 tr = parts[1].strip() 

4071 elif ( 4071 ↛ 4077line 4071 didn't jump to line 4077 because the condition on line 4071 was never true

4072 len(parts) == 3 

4073 and classify_desc2(parts[1]) 

4074 in ("romanization", "english") 

4075 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

4076 ): 

4077 lines = [parts[0].strip()] 

4078 roman = parts[1].strip() 

4079 tr = parts[2].strip() 

4080 else: 

4081 parts = re.split(r"\s+-\s+", lines[0]) 

4082 if ( 4082 ↛ 4086line 4082 didn't jump to line 4086 because the condition on line 4082 was never true

4083 len(parts) == 2 

4084 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

4085 ): 

4086 lines = [parts[0].strip()] 

4087 tr = parts[1].strip() 

4088 elif len(lines) > 1: 

4089 if any( 

4090 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

4091 ) and not (len(example_template_names) == 1): 

4092 refs: list[str] = [] 

4093 for i in range(len(lines)): 4093 ↛ 4099line 4093 didn't jump to line 4099 because the loop on line 4093 didn't complete

4094 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 4094 ↛ 4095line 4094 didn't jump to line 4095 because the condition on line 4094 was never true

4095 break 

4096 refs.append(lines[i].strip()) 

4097 if re.search(r"[]\d:)]\s*$", lines[i]): 

4098 break 

4099 ref = " ".join(refs) 

4100 lines = lines[i + 1 :] 

4101 if ( 

4102 lang_code != "en" 

4103 and len(lines) >= 2 

4104 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

4105 ): 

4106 i = len(lines) - 1 

4107 while ( 4107 ↛ 4112line 4107 didn't jump to line 4112 because the condition on line 4107 was never true

4108 i > 1 

4109 and classify_desc2(lines[i - 1]) 

4110 in ENGLISH_TEXTS 

4111 ): 

4112 i -= 1 

4113 tr = "\n".join(lines[i:]) 

4114 lines = lines[:i] 

4115 if len(lines) >= 2: 

4116 if classify_desc2(lines[-1]) == "romanization": 

4117 roman = lines[-1].strip() 

4118 lines = lines[:-1] 

4119 

4120 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

4121 ref = lines[0] 

4122 lines = lines[1:] 

4123 elif lang_code != "en" and len(lines) == 2: 

4124 cls1 = classify_desc2(lines[0]) 

4125 cls2 = classify_desc2(lines[1]) 

4126 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

4127 tr = lines[1] 

4128 lines = [lines[0]] 

4129 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 4129 ↛ 4130line 4129 didn't jump to line 4130 because the condition on line 4129 was never true

4130 tr = lines[0] 

4131 lines = [lines[1]] 

4132 elif ( 4132 ↛ 4139line 4132 didn't jump to line 4139 because the condition on line 4132 was never true

4133 re.match(r"^[#*]*:+", lines[1]) 

4134 and classify_desc2( 

4135 re.sub(r"^[#*:]+\s*", "", lines[1]) 

4136 ) 

4137 in ENGLISH_TEXTS 

4138 ): 

4139 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

4140 lines = [lines[0]] 

4141 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

4142 # Both were classified as English, but 

4143 # presumably one is not. Assume first is 

4144 # non-English, as that seems more common. 

4145 tr = lines[1] 

4146 lines = [lines[0]] 

4147 elif ( 

4148 usex_type != "quotation" 

4149 and lang_code != "en" 

4150 and len(lines) == 3 

4151 ): 

4152 cls1 = classify_desc2(lines[0]) 

4153 cls2 = classify_desc2(lines[1]) 

4154 cls3 = classify_desc2(lines[2]) 

4155 if ( 

4156 cls3 == "english" 

4157 and cls2 in ("english", "romanization") 

4158 and cls1 != "english" 

4159 ): 

4160 tr = lines[2].strip() 

4161 roman = lines[1].strip() 

4162 lines = [lines[0].strip()] 

4163 elif ( 4163 ↛ 4171line 4163 didn't jump to line 4171 because the condition on line 4163 was never true

4164 usex_type == "quotation" 

4165 and lang_code != "en" 

4166 and len(lines) > 2 

4167 ): 

4168 # for x in lines: 

4169 # print(" LINE: {}: {}" 

4170 # .format(classify_desc2(x), x)) 

4171 if re.match(r"^[#*]*:+\s*$", lines[1]): 

4172 ref = lines[0] 

4173 lines = lines[2:] 

4174 cls1 = classify_desc2(lines[-1]) 

4175 if cls1 == "english": 

4176 i = len(lines) - 1 

4177 while ( 

4178 i > 1 

4179 and classify_desc2(lines[i - 1]) 

4180 == ENGLISH_TEXTS 

4181 ): 

4182 i -= 1 

4183 tr = "\n".join(lines[i:]) 

4184 lines = lines[:i] 

4185 

4186 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

4187 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

4188 tr = re.sub(r"^[#*:]+\s*", "", tr) 

4189 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

4190 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

4191 ref = re.sub(r"^[#*:]+\s*", "", ref) 

4192 ref = re.sub( 

4193 r", (volume |number |page )?“?" 

4194 r"\(please specify ([^)]|\(s\))*\)”?|" 

4195 ", text here$", 

4196 "", 

4197 ref, 

4198 ) 

4199 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

4200 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

4201 subtext = "\n".join(x for x in lines if x) 

4202 if not tr and lang_code != "en": 

4203 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

4204 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 4204 ↛ 4205line 4204 didn't jump to line 4205 because the condition on line 4204 was never true

4205 tr = m.group(2) 

4206 subtext = subtext[: m.start()] + m.group(1) 

4207 elif lines: 

4208 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

4209 if ( 4209 ↛ 4213line 4209 didn't jump to line 4213 because the condition on line 4209 was never true

4210 len(parts) == 2 

4211 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

4212 ): 

4213 subtext = parts[0].strip() 

4214 tr = parts[1].strip() 

4215 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

4216 subtext = re.sub( 

4217 r"(please add an English translation of " 

4218 r"this (quote|usage example))", 

4219 "", 

4220 subtext, 

4221 ) 

4222 subtext = re.sub( 

4223 r"\s*→New International Version " "translation$", 

4224 "", 

4225 subtext, 

4226 ) # e.g. pis/Tok Pisin (Bible) 

4227 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

4228 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

4229 note = None 

4230 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

4231 if ( 4231 ↛ 4239line 4231 didn't jump to line 4239 because the condition on line 4231 was never true

4232 m is not None 

4233 and lang_code != "en" 

4234 and ( 

4235 m.group(1).startswith("with ") 

4236 or classify_desc2(m.group(1)) == "english" 

4237 ) 

4238 ): 

4239 note = m.group(1) 

4240 subtext = subtext[m.end() :] 

4241 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

4242 ref = re.sub(r",\s*→ISBN", "", ref) 

4243 ref = ref.strip() 

4244 if ref.endswith(":") or ref.endswith(","): 

4245 ref = ref[:-1].strip() 

4246 ref = re.sub(r"\s+,\s+", ", ", ref) 

4247 ref = re.sub(r"\s+", " ", ref) 

4248 if ref and not subtext: 4248 ↛ 4249line 4248 didn't jump to line 4249 because the condition on line 4248 was never true

4249 subtext = ref 

4250 ref = "" 

4251 if subtext: 

4252 dt: ExampleData = {"text": subtext} 

4253 if ref: 

4254 dt["ref"] = ref 

4255 if tr: 

4256 dt["english"] = tr 

4257 if usex_type: 

4258 dt["type"] = usex_type 

4259 if note: 4259 ↛ 4260line 4259 didn't jump to line 4260 because the condition on line 4259 was never true

4260 dt["note"] = note 

4261 if roman: 

4262 dt["roman"] = roman 

4263 if ruby: 

4264 dt["ruby"] = ruby 

4265 examples.append(dt) 

4266 

4267 return examples 

4268 
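
Much of extract_examples above is heuristics for separating example text from its English translation. The simplest case is a single non-English line of the form "<example> ― <translation>": the line is split on the long dash and, if the second part classifies as English, it becomes the translation. A rough standalone sketch of just that split (the real code goes through classify_desc and several template-argument cross-checks first):

import re

def split_example(line: str) -> tuple[str, str]:
    parts = re.split(r"\s*[―—]+\s*", line)  # same dash pattern as used above
    if len(parts) == 2:
        return parts[0].strip(), parts[1].strip()
    return line.strip(), ""

assert split_example("kissa istuu ― the cat sits") == ("kissa istuu", "the cat sits")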

4269 # Main code of parse_language() 

4270 # Process the section 

4271 stack.append(language) 

4272 process_children(langnode, None) 

4273 stack.pop() 

4274 

4275 # Finalize word entries 

4276 push_etym() 

4277 ret = [] 

4278 for data in page_datas: 

4279 merge_base(data, base_data) 

4280 ret.append(data) 

4281 

4282 # Copy all tags to word senses 

4283 for data in ret: 

4284 if "senses" not in data: 4284 ↛ 4285line 4284 didn't jump to line 4285 because the condition on line 4284 was never true

4285 continue 

4286 # WordData should not have a 'tags' field, but if it does, it's 

4287 # deleted and its contents removed and placed in each sense; 

4288 # that's why the type ignores. 

4289 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

4290 if "tags" in data: 4290 ↛ 4291line 4290 didn't jump to line 4291 because the condition on line 4290 was never true

4291 del data["tags"] # type: ignore[typeddict-item] 

4292 for sense in data["senses"]: 

4293 data_extend(sense, "tags", tags) 

4294 

4295 return ret 

4296 

4297 

4298def parse_wikipedia_template( 

4299 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

4300) -> None: 

4301 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

4302 assert isinstance(wxr, WiktextractContext) 

4303 assert isinstance(data, dict) 

4304 assert isinstance(ht, dict) 

4305 langid = clean_node(wxr, data, ht.get("lang", ())) 

4306 pagename = ( 

4307 clean_node(wxr, data, ht.get(1, ())) 

4308 or wxr.wtp.title 

4309 or "MISSING_PAGE_TITLE" 

4310 ) 

4311 if langid: 

4312 data_append(data, "wikipedia", langid + ":" + pagename) 

4313 else: 

4314 data_append(data, "wikipedia", pagename) 

4315 
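
For reference, the "wikipedia" values appended above take the form "<langid>:<pagename>" when the template carries a lang= argument and just the page name otherwise; a one-line sketch:

def wikipedia_value(langid: str, pagename: str) -> str:
    return f"{langid}:{pagename}" if langid else pagename

assert wikipedia_value("fi", "kissa") == "fi:kissa"
assert wikipedia_value("", "kissa") == "kissa"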

4316 

4317def parse_top_template( 

4318 wxr: WiktextractContext, node: WikiNode, data: WordData 

4319) -> None: 

4320 """Parses a template that occurs on the top-level in a page, before any 

4321 language subtitles.""" 

4322 assert isinstance(wxr, WiktextractContext) 

4323 assert isinstance(node, WikiNode) 

4324 assert isinstance(data, dict) 

4325 

4326 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

4327 if name in wikipedia_templates: 

4328 parse_wikipedia_template(wxr, data, ht) 

4329 return None 

4330 if is_panel_template(wxr, name): 

4331 return "" 

4332 if name in ("reconstruction",): 4332 ↛ 4333line 4332 didn't jump to line 4333 because the condition on line 4332 was never true

4333 return "" 

4334 if name.lower() == "also" or name.lower().startswith("also/"): 

4335 # XXX shows related words that might really have been the intended 

4336 # word, capture them 

4337 return "" 

4338 if name == "see also": 4338 ↛ 4340line 4338 didn't jump to line 4340 because the condition on line 4338 was never true

4339 # XXX capture 

4340 return "" 

4341 if name == "cardinalbox": 4341 ↛ 4343line 4341 didn't jump to line 4343 because the condition on line 4341 was never true

4342 # XXX capture 

4343 return "" 

4344 if name == "character info": 4344 ↛ 4346line 4344 didn't jump to line 4346 because the condition on line 4344 was never true

4345 # XXX capture 

4346 return "" 

4347 if name == "commonscat": 4347 ↛ 4349line 4347 didn't jump to line 4349 because the condition on line 4347 was never true

4348 # XXX capture link to Wikimedia commons 

4349 return "" 

4350 if name == "wrongtitle": 4350 ↛ 4353line 4350 didn't jump to line 4353 because the condition on line 4350 was never true

4351 # XXX this should be captured to replace page title with the 

4352 # correct title. E.g. ⿰亻革家 

4353 return "" 

4354 if name == "wikidata": 4354 ↛ 4355line 4354 didn't jump to line 4355 because the condition on line 4354 was never true

4355 arg = clean_node(wxr, data, ht.get(1, ())) 

4356 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

4357 data_append(data, "wikidata", arg) 

4358 return "" 

4359 wxr.wtp.debug( 

4360 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

4361 sortid="page/2870", 

4362 ) 

4363 return "" 

4364 

4365 clean_node(wxr, None, [node], template_fn=top_template_fn) 

4366 

4367 

4368def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

4369 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

4370 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

4371 that are next to each other.""" 

4372 

4373 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

4374 # section get overwritten. In this case, let's just combine the two. 

4375 

4376 # In Chinese entries, Pronunciation can be preceded on the 

4377 # same level 3 by its Etymology *and* Glyph Origin sections: 

4378 # ===Glyph Origin=== 

4379 # ===Etymology=== 

4380 # ===Pronunciation=== 

4381 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

4382 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

4383 # are now level 6 

4384 

4385 # Known lowercase PoS names are in part_of_speech_map 

4386 # Known lowercase linkage section names are in linkage_map 

4387 

4388 old = re.split( 

4389 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

4390 ) 

4391 

4392 parts = [] 

4393 npar = 4 # Number of parentheses in above expression 

4394 parts.append(old[0]) 

4395 prev_level = None 

4396 level = None 

4397 skip_level_title = False # When combining etymology sections 

4398 for i in range(1, len(old), npar + 1): 

4399 left = old[i] 

4400 right = old[i + npar - 1] 

4401 # remove Wikilinks in title 

4402 title = re.sub(r"^\[\[", "", old[i + 1]) 

4403 title = re.sub(r"\]\]$", "", title) 

4404 prev_level = level 

4405 level = len(left) 

4406 part = old[i + npar] 

4407 if level != len(right): 4407 ↛ 4408line 4407 didn't jump to line 4408 because the condition on line 4407 was never true

4408 wxr.wtp.debug( 

4409 "subtitle has unbalanced levels: " 

4410 "{!r} has {} on the left and {} on the right".format( 

4411 title, left, right 

4412 ), 

4413 sortid="page/2904", 

4414 ) 

4415 lc = title.lower() 

4416 if name_to_code(title, "en") != "": 

4417 if level > 2: 4417 ↛ 4418line 4417 didn't jump to line 4418 because the condition on line 4417 was never true

4418 wxr.wtp.debug( 

4419 "subtitle has language name {} at level {}".format( 

4420 title, level 

4421 ), 

4422 sortid="page/2911", 

4423 ) 

4424 level = 2 

4425 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

4426 if level > 3: 4426 ↛ 4427line 4426 didn't jump to line 4427 because the condition on line 4426 was never true

4427 wxr.wtp.debug( 

4428 "etymology section {} at level {}".format(title, level), 

4429 sortid="page/2917", 

4430 ) 

4431 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 

4432 # sections cheek-to-cheek 

4433 skip_level_title = True 

4434 # Modify the title of previous ("Glyph Origin") section, in 

4435 # case we have a meaningful title like "Etymology 1" 

4436 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

4437 level = 3 

4438 elif lc.startswith(PRONUNCIATION_TITLE): 

4439 # Pronunciation is now a level between POS and Etymology, so 

4440 # we need to shift everything down by one 

4441 level = 4 

4442 elif lc in POS_TITLES: 

4443 level = 5 

4444 elif lc == TRANSLATIONS_TITLE: 

4445 level = 6 

4446 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 

4447 level = 6 

4448 elif lc in INFLECTION_TITLES: 

4449 level = 6 

4450 elif lc == DESCENDANTS_TITLE: 

4451 level = 6 

4452 elif title in PROTO_ROOT_DERIVED_TITLES: 4452 ↛ 4453line 4452 didn't jump to line 4453 because the condition on line 4452 was never true

4453 level = 6 

4454 elif lc in IGNORED_TITLES: 

4455 level = 6 

4456 else: 

4457 level = 6 

4458 if skip_level_title: 

4459 skip_level_title = False 

4460 parts.append(part) 

4461 else: 

4462 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

4463 parts.append(part) 

4464 # print("=" * level, title) 

4465 # if level != len(left): 

4466 # print(" FIXED LEVEL OF {} {} -> {}" 

4467 # .format(title, len(left), level)) 

4468 

4469 text = "".join(parts) 

4470 # print(text) 

4471 return text 

4472 
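
The net effect of fix_subtitle_hierarchy is to force every heading onto a fixed level regardless of how the page nested it: language names to level 2, etymology sections to 3, pronunciation to 4, parts of speech to 5, and translations, linkages, inflection, descendants and everything else to 6. A compressed sketch of that mapping (the title sets here are tiny stand-ins for the real imported constants and for name_to_code):

POS = {"noun", "verb", "adjective"}   # stand-in for POS_TITLES
LANGS = {"english", "finnish"}        # stand-in for name_to_code(title, "en") != ""

def new_level(title: str) -> int:
    lc = title.lower()
    if lc in LANGS:
        return 2
    if lc.startswith("etymology"):
        return 3
    if lc.startswith("pronunciation"):
        return 4
    if lc in POS:
        return 5
    return 6

assert [new_level(t) for t in
        ("English", "Etymology 1", "Pronunciation", "Noun", "Synonyms")
        ] == [2, 3, 4, 5, 6]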

4473 

4474def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

4475 # Skip translation pages 

4476 if word.endswith("/" + TRANSLATIONS_TITLE): 4476 ↛ 4477line 4476 didn't jump to line 4477 because the condition on line 4476 was never true

4477 return [] 

4478 

4479 if wxr.config.verbose: 4479 ↛ 4480line 4479 didn't jump to line 4480 because the condition on line 4479 was never true

4480 logger.info(f"Parsing page: {word}") 

4481 

4482 wxr.config.word = word 

4483 wxr.wtp.start_page(word) 

4484 

4485 # Remove <noinclude> and similar tags from main pages. They 

4486 # should not appear there, but at least net/Elfdala has one and it 

4487 # is probably not the only one. 

4488 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4489 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4490 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4491 

4492 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4493 # pages that have, for example, Translations section under Linkage, or 

4494 # Translations section on the same level as Noun. Enforce a proper 

4495 # hierarchy by manipulating the subtitle levels in certain cases. 

4496 text = fix_subtitle_hierarchy(wxr, text) 

4497 

4498 # Parse the page, pre-expanding those templates that are likely to 

4499 # influence parsing 

4500 tree = wxr.wtp.parse( 

4501 text, 

4502 pre_expand=True, 

4503 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4504 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4505 ) 

4506 # from wikitextprocessor.parser import print_tree 

4507 # print("PAGE PARSE:", print_tree(tree)) 

4508 

4509 top_data: WordData = {} 

4510 

4511 # Iterate over top-level titles, which should be languages for normal 

4512 # pages 

4513 by_lang = defaultdict(list) 

4514 for langnode in tree.children: 

4515 if not isinstance(langnode, WikiNode): 

4516 continue 

4517 if langnode.kind == NodeKind.TEMPLATE: 

4518 parse_top_template(wxr, langnode, top_data) 

4519 continue 

4520 if langnode.kind == NodeKind.LINK: 

4521 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4522 continue 

4523 if langnode.kind != NodeKind.LEVEL2: 4523 ↛ 4524line 4523 didn't jump to line 4524 because the condition on line 4523 was never true

4524 wxr.wtp.debug( 

4525 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4526 ) 

4527 continue 

4528 lang = clean_node( 

4529 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4530 ) 

4531 lang_code = name_to_code(lang, "en") 

4532 if lang_code == "": 4532 ↛ 4533line 4532 didn't jump to line 4533 because the condition on line 4532 was never true

4533 wxr.wtp.debug( 

4534 f"unrecognized language name: {lang}", sortid="page/3019" 

4535 ) 

4536 if ( 

4537 wxr.config.capture_language_codes 

4538 and lang_code not in wxr.config.capture_language_codes 

4539 ): 

4540 continue 

4541 wxr.wtp.start_section(lang) 

4542 

4543 # Collect all words from the page. 

4544 # print(f"{langnode=}") 

4545 datas = parse_language(wxr, langnode, lang, lang_code) 

4546 

4547 # Propagate fields resulting from top-level templates to this 

4548 # part-of-speech. 

4549 for data in datas: 

4550 if "lang" not in data: 4550 ↛ 4551line 4550 didn't jump to line 4551 because the condition on line 4550 was never true

4551 wxr.wtp.debug( 

4552 "internal error -- no lang in data: {}".format(data), 

4553 sortid="page/3034", 

4554 ) 

4555 continue 

4556 for k, v in top_data.items(): 

4557 assert isinstance(v, (list, tuple)) 

4558 data_extend(data, k, v) 

4559 by_lang[data["lang"]].append(data) 

4560 

4561 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4562 # field. FIX OR REMOVE. 

4563 # Do some post-processing on the words. For example, we may distribute 

4564 # conjugation information to all the words. 

4565 ret = [] 

4566 for lang, lang_datas in by_lang.items(): 

4567 ret.extend(lang_datas) 

4568 

4569 for x in ret: 

4570 if x["word"] != word: 

4571 if word.startswith("Unsupported titles/"): 4571 ↛ 4577line 4571 didn't jump to line 4577 because the condition on line 4571 was always true

4572 wxr.wtp.debug( 

4573 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4574 sortid="20231101/3578page.py", 

4575 ) 

4576 else: 

4577 wxr.wtp.debug( 

4578 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'", 

4579 sortid="20231101/3582page.py", 

4580 ) 

4581 x["original_title"] = word 

4582 # validate tag data 

4583 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4584 return ret 

4585 
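
A small worked example of the <noinclude>/<onlyinclude>/<includeonly> stripping near the top of parse_page above: only the tags themselves are removed, while their contents are kept.

import re

text = "foo<noinclude>bar</noinclude>baz"
for tag in ("noinclude", "onlyinclude", "includeonly"):
    text = re.sub(rf"(?si)<(/)?{tag}\s*>", "", text)
assert text == "foobarbaz"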

4586 

4587def recursively_separate_raw_tags( 

4588 wxr: WiktextractContext, data: dict[str, Any] 

4589) -> None: 

4590 if not isinstance(data, dict): 4590 ↛ 4591line 4590 didn't jump to line 4591 because the condition on line 4590 was never true

4591 wxr.wtp.error( 

4592 "'data' is not dict; most probably " 

4593 "data has a list that contains at least one dict and " 

4594 "at least one non-dict item", 

4595 sortid="en/page-4016/20240419", 

4596 ) 

4597 return 

4598 new_tags: list[str] = [] 

4599 raw_tags: list[str] = data.get("raw_tags", []) 

4600 for field, val in data.items(): 

4601 if field == "tags": 

4602 for tag in val: 

4603 if tag not in valid_tags: 

4604 raw_tags.append(tag) 

4605 else: 

4606 new_tags.append(tag) 

4607 if isinstance(val, list): 

4608 if len(val) > 0 and isinstance(val[0], dict): 

4609 for d in val: 

4610 recursively_separate_raw_tags(wxr, d) 

4611 if "tags" in data and not new_tags: 

4612 del data["tags"] 

4613 elif new_tags: 

4614 data["tags"] = new_tags 

4615 if raw_tags: 

4616 data["raw_tags"] = raw_tags 

4617 

4618 
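
recursively_separate_raw_tags above walks the whole entry and moves any tag that is not in valid_tags out of "tags" and into "raw_tags". A toy, non-recursive illustration of that split (VALID is a stand-in for tags.valid_tags; the real function also recurses into nested lists of dicts):

VALID = {"plural", "feminine"}   # stand-in for tags.valid_tags

def separate(data: dict) -> None:
    new_tags: list[str] = []
    raw_tags: list[str] = data.get("raw_tags", [])
    for tag in data.get("tags", []):
        (new_tags if tag in VALID else raw_tags).append(tag)
    if new_tags:
        data["tags"] = new_tags
    elif "tags" in data:
        del data["tags"]
    if raw_tags:
        data["raw_tags"] = raw_tags

d = {"tags": ["plural", "dialectal Oulu"]}
separate(d)
assert d == {"tags": ["plural"], "raw_tags": ["dialectal Oulu"]}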

4619def process_soft_redirect_template( 

4620 wxr: WiktextractContext, 

4621 template_node: TemplateNode, 

4622 redirect_pages: list[str], 

4623) -> bool: 

4624 # return `True` if the template is soft redirect template 

4625 if template_node.template_name == "zh-see": 

4626 # https://en.wiktionary.org/wiki/Template:zh-see 

4627 title = clean_node( 

4628 wxr, None, template_node.template_parameters.get(1, "") 

4629 ) 

4630 if title != "": 4630 ↛ 4632line 4630 didn't jump to line 4632 because the condition on line 4630 was always true

4631 redirect_pages.append(title) 

4632 return True 

4633 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4634 # https://en.wiktionary.org/wiki/Template:ja-see 

4635 for key, value in template_node.template_parameters.items(): 

4636 if isinstance(key, int): 4636 ↛ 4635line 4636 didn't jump to line 4635 because the condition on line 4636 was always true

4637 title = clean_node(wxr, None, value) 

4638 if title != "": 4638 ↛ 4635line 4638 didn't jump to line 4635 because the condition on line 4638 was always true

4639 redirect_pages.append(title) 

4640 return True 

4641 return False 

4642 

4643 

4644def process_zh_forms_templates( 

4645 wxr: WiktextractContext, 

4646 template_node: TemplateNode, 

4647 base_data: WordData, 

4648) -> None: 

4649 # https://en.wiktionary.org/wiki/Template:zh-forms 

4650 if "forms" not in base_data: 4650 ↛ 4652line 4650 didn't jump to line 4652 because the condition on line 4650 was always true

4651 base_data["forms"] = [] 

4652 for p_name, p_value in template_node.template_parameters.items(): 

4653 if not isinstance(p_name, str): 4653 ↛ 4654line 4653 didn't jump to line 4654 because the condition on line 4653 was never true

4654 continue 

4655 if re.fullmatch(r"s\d*", p_name): 

4656 form_data: FormData = { 

4657 "form": clean_node(wxr, None, p_value), 

4658 "tags": ["Simplified Chinese"], 

4659 } 

4660 if len(form_data["form"]) > 0: 4660 ↛ 4652line 4660 didn't jump to line 4652 because the condition on line 4660 was always true

4661 base_data["forms"].append(form_data) 

4662 elif re.fullmatch(r"t\d+", p_name): 4662 ↛ 4663line 4662 didn't jump to line 4663 because the condition on line 4662 was never true

4663 form_data = { 

4664 "form": clean_node(wxr, None, p_value), 

4665 "tags": ["Traditional Chinese"], 

4666 } 

4667 if len(form_data["form"]) > 0: 

4668 base_data["forms"].append(form_data) 

4669 elif p_name == "alt": 4669 ↛ 4679line 4669 didn't jump to line 4679 because the condition on line 4669 was always true

4670 for form_text in clean_node(wxr, None, p_value).split(","): 

4671 texts = form_text.split("-") 

4672 form_data = {"form": texts[0]} 

4673 if len(texts) > 1: 

4674 # pronunciation data could be added after "-" 

4675 # see https://en.wiktionary.org/wiki/新婦 

4676 form_data["raw_tags"] = texts[1:] 

4677 if len(form_data["form"]) > 0: 4677 ↛ 4670line 4677 didn't jump to line 4670 because the condition on line 4677 was always true

4678 base_data["forms"].append(form_data) 

4679 elif p_name == "lit": 

4680 lit = clean_node(wxr, None, p_value) 

4681 if lit != "": 

4682 base_data["literal_meaning"] = lit 

4683 if len(base_data["forms"]) == 0: 

4684 del base_data["forms"]
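
The alt= branch above splits the parameter on commas, and each item may carry pronunciation qualifiers after a "-", which end up in raw_tags (see the 新婦 example cited in the comment). A standalone sketch of just that parsing:

def parse_alt(value: str) -> list[dict]:
    forms: list[dict] = []
    for form_text in value.split(","):
        texts = form_text.split("-")
        form: dict = {"form": texts[0]}
        if len(texts) > 1:
            form["raw_tags"] = texts[1:]   # qualifiers after "-"
        if form["form"]:
            forms.append(form)
    return forms

assert parse_alt("新婦-Hokkien,新妇") == [
    {"form": "新婦", "raw_tags": ["Hokkien"]},
    {"form": "新妇"},
]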