Coverage for src/wiktextract/extractor/en/page.py: 78%

1818 statements  

coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8from collections import defaultdict 

9from functools import partial 

10from typing import ( 

11 TYPE_CHECKING, 

12 Any, 

13 Iterable, 

14 Literal, 

15 Optional, 

16 Set, 

17 Union, 

18 cast, 

19) 

20 

21from mediawiki_langcodes import get_all_names, name_to_code 

22from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

23from wikitextprocessor.parser import ( 

24 LEVEL_KIND_FLAGS, 

25 GeneralNode, 

26 HTMLNode, 

27 LevelNode, 

28 NodeKind, 

29 TemplateNode, 

30 WikiNode, 

31) 

32 

33from ...clean import clean_template_args, clean_value 

34from ...datautils import ( 

35 data_append, 

36 data_extend, 

37 ns_title_prefix_tuple, 

38) 

39from ...page import ( 

40 LEVEL_KINDS, 

41 clean_node, 

42 is_panel_template, 

43 recursively_extract, 

44) 

45from ...tags import valid_tags 

46from ...wxr_context import WiktextractContext 

47from ...wxr_logging import logger 

48from ..ruby import extract_ruby, parse_ruby 

49from ..share import strip_nodes 

50from .descendant import extract_descendant_section 

51from .example import extract_example_list_item, extract_template_zh_x 

52from .form_descriptions import ( 

53 classify_desc, 

54 decode_tags, 

55 distw, 

56 parse_alt_or_inflection_of, 

57 parse_sense_qualifier, 

58 parse_word_head, 

59) 

60from .inflection import TableContext, parse_inflection_section 

61from .info_templates import ( 

62 INFO_TEMPLATE_FUNCS, 

63 parse_info_template_arguments, 

64 parse_info_template_node, 

65) 

66from .linkages import ( 

67 extract_alt_form_section, 

68 parse_linkage, 

69) 

70from .parts_of_speech import PARTS_OF_SPEECH 

71from .section_titles import ( 

72 COMPOUNDS_TITLE, 

73 DESCENDANTS_TITLE, 

74 ETYMOLOGY_TITLES, 

75 IGNORED_TITLES, 

76 INFLECTION_TITLES, 

77 LINKAGE_TITLES, 

78 POS_TITLES, 

79 PRONUNCIATION_TITLE, 

80 PROTO_ROOT_DERIVED_TITLES, 

81 TRANSLATIONS_TITLE, 

82) 

83from .translations import parse_translation_item_text 

84from .type_utils import ( 

85 AttestationData, 

86 ExampleData, 

87 FormData, 

88 LinkageData, 

89 ReferenceData, 

90 SenseData, 

91 SoundData, 

92 TemplateData, 

93 WordData, 

94) 

95from .unsupported_titles import unsupported_title_map 

96 

97# When determining whether a string is 'english', classify_desc 

98# might return 'taxonomic' which is English text 99% of the time. 

99ENGLISH_TEXTS = ("english", "taxonomic") 

100 

101# Matches head tag 

102HEAD_TAG_RE = re.compile( 

103 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

104 r"hangul-symbol|syllable-hangul)$|" 

105 + r"^(latin|" 

106 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

107 + r")-(" 

108 + "|".join( 

109 [ 

110 "abbr", 

111 "adj", 

112 "adjective", 

113 "adjective form", 

114 "adjective-form", 

115 "adv", 

116 "adverb", 

117 "affix", 

118 "animal command", 

119 "art", 

120 "article", 

121 "aux", 

122 "bound pronoun", 

123 "bound-pronoun", 

124 "Buyla", 

125 "card num", 

126 "card-num", 

127 "cardinal", 

128 "chunom", 

129 "classifier", 

130 "clitic", 

131 "cls", 

132 "cmene", 

133 "cmavo", 

134 "colloq-verb", 

135 "colverbform", 

136 "combining form", 

137 "combining-form", 

138 "comparative", 

139 "con", 

140 "concord", 

141 "conj", 

142 "conjunction", 

143 "conjug", 

144 "cont", 

145 "contr", 

146 "converb", 

147 "daybox", 

148 "decl", 

149 "decl noun", 

150 "def", 

151 "dem", 

152 "det", 

153 "determ", 

154 "Deva", 

155 "ending", 

156 "entry", 

157 "form", 

158 "fuhivla", 

159 "gerund", 

160 "gismu", 

161 "hanja", 

162 "hantu", 

163 "hanzi", 

164 "head", 

165 "ideophone", 

166 "idiom", 

167 "inf", 

168 "indef", 

169 "infixed pronoun", 

170 "infixed-pronoun", 

171 "infl", 

172 "inflection", 

173 "initialism", 

174 "int", 

175 "interfix", 

176 "interj", 

177 "interjection", 

178 "jyut", 

179 "latin", 

180 "letter", 

181 "locative", 

182 "lujvo", 

183 "monthbox", 

184 "mutverb", 

185 "name", 

186 "nisba", 

187 "nom", 

188 "noun", 

189 "noun form", 

190 "noun-form", 

191 "noun plural", 

192 "noun-plural", 

193 "nounprefix", 

194 "num", 

195 "number", 

196 "numeral", 

197 "ord", 

198 "ordinal", 

199 "par", 

200 "part", 

201 "part form", 

202 "part-form", 

203 "participle", 

204 "particle", 

205 "past", 

206 "past neg", 

207 "past-neg", 

208 "past participle", 

209 "past-participle", 

210 "perfect participle", 

211 "perfect-participle", 

212 "personal pronoun", 

213 "personal-pronoun", 

214 "pref", 

215 "prefix", 

216 "phrase", 

217 "pinyin", 

218 "plural noun", 

219 "plural-noun", 

220 "pos", 

221 "poss-noun", 

222 "post", 

223 "postp", 

224 "postposition", 

225 "PP", 

226 "pp", 

227 "ppron", 

228 "pred", 

229 "predicative", 

230 "prep", 

231 "prep phrase", 

232 "prep-phrase", 

233 "preposition", 

234 "present participle", 

235 "present-participle", 

236 "pron", 

237 "prondem", 

238 "pronindef", 

239 "pronoun", 

240 "prop", 

241 "proper noun", 

242 "proper-noun", 

243 "proper noun form", 

244 "proper-noun form", 

245 "proper noun-form", 

246 "proper-noun-form", 

247 "prov", 

248 "proverb", 

249 "prpn", 

250 "prpr", 

251 "punctuation mark", 

252 "punctuation-mark", 

253 "regnoun", 

254 "rel", 

255 "rom", 

256 "romanji", 

257 "root", 

258 "sign", 

259 "suff", 

260 "suffix", 

261 "syllable", 

262 "symbol", 

263 "verb", 

264 "verb form", 

265 "verb-form", 

266 "verbal noun", 

267 "verbal-noun", 

268 "verbnec", 

269 "vform", 

270 ] 

271 ) 

272 + r")(-|/|\+|$)" 

273) 
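# --- Illustrative sketch, not part of the original module ------------------
# Rough demonstration of what HEAD_TAG_RE is meant to accept: either a
# generic head template name such as "head", or a language-prefixed one such
# as "en-verb" (assuming "en" is among the codes yielded by
# get_all_names("en") above). A hypothetical self-check:
def _head_tag_re_examples() -> None:  # hypothetical, never called
    assert HEAD_TAG_RE.search("head") is not None           # generic head
    assert HEAD_TAG_RE.search("en-verb") is not None         # "<lang>-<pos>"
    assert HEAD_TAG_RE.search("en-verb-form") is not None    # trailing "-" ok
    assert HEAD_TAG_RE.search("en-etymology") is None        # not a head name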

274 

275# Head-templates causing problems (like newlines) that can be squashed into 

276# an empty string in the template handler while saving their template 

277# data for later. 

278WORD_LEVEL_HEAD_TEMPLATES = {"term-label", "tlb"} 

279 

280FLOATING_TABLE_TEMPLATES: set[str] = { 

281 # az-suffix-form creates a style=floatright div that is otherwise 

282 # deleted; if it is not pre-expanded, we can intercept the template 

283 # so we add this set into do_not_pre_expand, and intercept the 

284 # templates in parse_part_of_speech 

285 "az-suffix-forms", 

286 "az-inf-p", 

287 "kk-suffix-forms", 

288 "ky-suffix-forms", 

289 "tr-inf-p", 

290 "tr-suffix-forms", 

291 "tt-suffix-forms", 

292 "uz-suffix-forms", 

293} 

294# These two should contain template names that should always be 

295# pre-expanded when *first* processing the tree, or not pre-expanded 

296# so that the templates are left in place with their identifying 

297# name intact for later filtering. 

298 

299DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

300DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

301 

302# Additional templates to be expanded in the pre-expand phase 

303ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

304 "multitrans", 

305 "multitrans-nowiki", 

306 "trans-top", 

307 "trans-top-also", 

308 "trans-bottom", 

309 "checktrans-top", 

310 "checktrans-bottom", 

311 "col", 

312 "col1", 

313 "col2", 

314 "col3", 

315 "col4", 

316 "col5", 

317 "col1-u", 

318 "col2-u", 

319 "col3-u", 

320 "col4-u", 

321 "col5-u", 

322 "check deprecated lang param usage", 

323 "deprecated code", 

324 "ru-verb-alt-ё", 

325 "ru-noun-alt-ё", 

326 "ru-adj-alt-ё", 

327 "ru-proper noun-alt-ё", 

328 "ru-pos-alt-ё", 

329 "ru-alt-ё", 

330 "inflection of", 

331 "no deprecated lang param usage", 

332 "transclude", # these produce sense entries (or other lists) 

333 "tcl", 

334} 

335 

336# Inverse linkage for those that have them 

337linkage_inverses: dict[str, str] = { 

338 # XXX this is not currently used, move to post-processing 

339 "synonyms": "synonyms", 

340 "hypernyms": "hyponyms", 

341 "hyponyms": "hypernyms", 

342 "holonyms": "meronyms", 

343 "meronyms": "holonyms", 

344 "derived": "derived_from", 

345 "coordinate_terms": "coordinate_terms", 

346 "troponyms": "hypernyms", 

347 "antonyms": "antonyms", 

348 "instances": "instance_of", 

349 "related": "related", 

350} 

351 

352# Templates that are used to form panels on pages and that 

353# should be ignored in various positions 

354PANEL_TEMPLATES: set[str] = { 

355 "Character info", 

356 "CJKV", 

357 "French personal pronouns", 

358 "French possessive adjectives", 

359 "French possessive pronouns", 

360 "Han etym", 

361 "Japanese demonstratives", 

362 "Latn-script", 

363 "LDL", 

364 "MW1913Abbr", 

365 "Number-encoding", 

366 "Nuttall", 

367 "Spanish possessive adjectives", 

368 "Spanish possessive pronouns", 

369 "USRegionDisputed", 

370 "Webster 1913", 

371 "ase-rfr", 

372 "attention", 

373 "attn", 

374 "beer", 

375 "broken ref", 

376 "ca-compass", 

377 "character info", 

378 "character info/var", 

379 "checksense", 

380 "compass-fi", 

381 "copyvio suspected", 

382 "delete", 

383 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

384 "etystub", 

385 "examples", 

386 "hu-corr", 

387 "hu-suff-pron", 

388 "interwiktionary", 

389 "ja-kanjitab", 

390 "ja-kt", 

391 "ko-hanja-search", 

392 "look", 

393 "maintenance box", 

394 "maintenance line", 

395 "mediagenic terms", 

396 "merge", 

397 "missing template", 

398 "morse links", 

399 "move", 

400 "multiple images", 

401 "no inline", 

402 "picdic", 

403 "picdicimg", 

404 "picdiclabel", 

405 "polyominoes", 

406 "predidential nomics", 

407 "punctuation", # This actually gets pre-expanded 

408 "reconstructed", 

409 "request box", 

410 "rf-sound example", 

411 "rfaccents", 

412 "rfap", 

413 "rfaspect", 

414 "rfc", 

415 "rfc-auto", 

416 "rfc-header", 

417 "rfc-level", 

418 "rfc-pron-n", 

419 "rfc-sense", 

420 "rfclarify", 

421 "rfd", 

422 "rfd-redundant", 

423 "rfd-sense", 

424 "rfdate", 

425 "rfdatek", 

426 "rfdef", 

427 "rfe", 

428 "rfe/dowork", 

429 "rfex", 

430 "rfexp", 

431 "rfform", 

432 "rfgender", 

433 "rfi", 

434 "rfinfl", 

435 "rfm", 

436 "rfm-sense", 

437 "rfp", 

438 "rfp-old", 

439 "rfquote", 

440 "rfquote-sense", 

441 "rfquotek", 

442 "rfref", 

443 "rfscript", 

444 "rft2", 

445 "rftaxon", 

446 "rftone", 

447 "rftranslit", 

448 "rfv", 

449 "rfv-etym", 

450 "rfv-pron", 

451 "rfv-quote", 

452 "rfv-sense", 

453 "selfref", 

454 "split", 

455 "stroke order", # XXX consider capturing this? 

456 "stub entry", 

457 "t-needed", 

458 "tbot entry", 

459 "tea room", 

460 "tea room sense", 

461 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

462 "unblock", 

463 "unsupportedpage", 

464 "video frames", 

465 "was wotd", 

466 "wrongtitle", 

467 "zh-forms", 

468 "zh-hanzi-box", 

469 "no entry", 

470} 

471 

472# Template name prefixes used for language-specific panel templates (i.e., 

473# templates that create side boxes or notice boxes or that should generally 

474# be ignored). 

475PANEL_PREFIXES: set[str] = { 

476 "list:compass points/", 

477 "list:Gregorian calendar months/", 

478 "RQ:", 

479} 

480 

481# Templates used for wikipedia links. 

482wikipedia_templates: set[str] = { 

483 "wikipedia", 

484 "slim-wikipedia", 

485 "w", 

486 "W", 

487 "swp", 

488 "wiki", 

489 "Wikipedia", 

490 "wtorw", 

491} 

492for x in PANEL_PREFIXES & wikipedia_templates:  492 ↛ 493

493 print( 

494 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

495 x 

496 ) 

497 ) 

498 

499# Mapping from a template name (without language prefix) for the main word 

500# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

501# it could validly occur. This is used as just a sanity check to give 

502# warnings about probably incorrect coding in Wiktionary. 

503template_allowed_pos_map: dict[str, list[str]] = { 

504 "abbr": ["abbrev"], 

505 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

506 "plural noun": ["noun", "name"], 

507 "plural-noun": ["noun", "name"], 

508 "proper noun": ["noun", "name"], 

509 "proper-noun": ["name", "noun"], 

510 "prop": ["name", "noun"], 

511 "verb": ["verb", "phrase"], 

512 "gerund": ["verb"], 

513 "particle": ["adv", "particle"], 

514 "adj": ["adj", "adj_noun"], 

515 "pron": ["pron", "noun"], 

516 "name": ["name", "noun"], 

517 "adv": ["adv", "intj", "conj", "particle"], 

518 "phrase": ["phrase", "prep_phrase"], 

519 "noun phrase": ["phrase"], 

520 "ordinal": ["num"], 

521 "number": ["num"], 

522 "pos": ["affix", "name", "num"], 

523 "suffix": ["suffix", "affix"], 

524 "character": ["character"], 

525 "letter": ["character"], 

526 "kanji": ["character"], 

527 "cont": ["abbrev"], 

528 "interj": ["intj"], 

529 "con": ["conj"], 

530 "part": ["particle"], 

531 "prep": ["prep", "postp"], 

532 "postp": ["postp"], 

533 "misspelling": ["noun", "adj", "verb", "adv"], 

534 "part-form": ["verb"], 

535} 

536for k, v in template_allowed_pos_map.items(): 

537 for x in v: 

538 if x not in PARTS_OF_SPEECH:  538 ↛ 539

539 print( 

540 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

541 "".format(x, k, v) 

542 ) 

543 assert False 
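# --- Illustrative sketch, not part of the original module ------------------
# template_allowed_pos_map is only consulted as a sanity check: a head
# template whose name ends in one of these keys is expected to appear under
# one of the listed parts-of-speech. A hypothetical check (the real logic
# lives in the head-parsing code) could look like this:
def _example_head_template_pos_ok(template_suffix: str, pos: str) -> bool:
    allowed = template_allowed_pos_map.get(template_suffix)
    # Unknown suffixes are unrestricted; known ones must list the POS.
    return allowed is None or pos in allowed
# _example_head_template_pos_ok("noun", "noun") -> True
# _example_head_template_pos_ok("noun", "adv")  -> False (would warn)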

544 

545 

546# Templates ignored during etymology extraction, i.e., these will not be listed 

547# in the extracted etymology templates. 

548ignored_etymology_templates: list[str] = [ 

549 "...", 

550 "IPAchar", 

551 "ipachar", 

552 "ISBN", 

553 "isValidPageName", 

554 "redlink category", 

555 "deprecated code", 

556 "check deprecated lang param usage", 

557 "para", 

558 "p", 

559 "cite", 

560 "Cite news", 

561 "Cite newsgroup", 

562 "cite paper", 

563 "cite MLLM 1976", 

564 "cite journal", 

565 "cite news/documentation", 

566 "cite paper/documentation", 

567 "cite video game", 

568 "cite video game/documentation", 

569 "cite newsgroup", 

570 "cite newsgroup/documentation", 

571 "cite web/documentation", 

572 "cite news", 

573 "Cite book", 

574 "Cite-book", 

575 "cite book", 

576 "cite web", 

577 "cite-usenet", 

578 "cite-video/documentation", 

579 "Cite-journal", 

580 "rfe", 

581 "catlangname", 

582 "cln", 

583 "langname-lite", 

584 "no deprecated lang param usage", 

585 "mention", 

586 "m", 

587 "m-self", 

588 "link", 

589 "l", 

590 "ll", 

591 "l-self", 

592] 

593# Regexp for matching ignored etymology template names. This adds certain 

594# prefixes to the names listed above. 

595ignored_etymology_templates_re = re.compile( 

596 r"^((cite-|R:|RQ:).*|" 

597 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

598 + r")$" 

599) 
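# --- Illustrative sketch, not part of the original module ------------------
# The compiled regex accepts either an exact name from the list above or any
# name starting with "cite-", "R:" or "RQ:":
def _ignored_etymology_re_examples() -> None:  # hypothetical, never called
    assert ignored_etymology_templates_re.match("R:OED") is not None
    assert ignored_etymology_templates_re.match("cite-book") is not None
    assert ignored_etymology_templates_re.match("m") is not None
    assert ignored_etymology_templates_re.match("inh") is None  # kept in output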

600 

601# Regexp for matching ignored descendants template names. Right now we just 

602# copy the ignored etymology templates 

603ignored_descendants_templates_re = ignored_etymology_templates_re 

604 

605# Set of template names that are used to define usage examples. If the usage 

606# example contains one of these templates, then its type is set to 

607# "example" 

608usex_templates: set[str] = { 

609 "afex", 

610 "affixusex", 

611 "co", # {{collocation}} acts like an example template, specifically for 

612 # pairs of combinations of words that are more common than you'd 

613 # expect by chance; hlavní#Czech 

614 "coi", 

615 "collocation", 

616 "el-example", 

617 "el-x", 

618 "example", 

619 "examples", 

620 "he-usex", 

621 "he-x", 

622 "hi-usex", 

623 "hi-x", 

624 "ja-usex-inline", 

625 "ja-usex", 

626 "ja-x", 

627 "jbo-example", 

628 "jbo-x", 

629 "km-usex", 

630 "km-x", 

631 "ko-usex", 

632 "ko-x", 

633 "lo-usex", 

634 "lo-x", 

635 "ne-x", 

636 "ne-usex", 

637 "prefixusex", 

638 "ryu-usex", 

639 "ryu-x", 

640 "shn-usex", 

641 "shn-x", 

642 "suffixusex", 

643 "th-usex", 

644 "th-x", 

645 "ur-usex", 

646 "ur-x", 

647 "usex", 

648 "usex-suffix", 

649 "ux", 

650 "uxi", 

651} 

652 

653stop_head_at_these_templates: set[str] = { 

654 "category", 

655 "cat", 

656 "topics", 

657 "catlangname", 

658 "c", 

659 "C", 

660 "top", 

661 "cln", 

662} 

663 

664# Set of template names that are used to define quotation examples. If the 

665# usage example contains one of these templates, then its type is set to 

666# "quotation". 

667quotation_templates: set[str] = { 

668 "collapse-quote", 

669 "quote-av", 

670 "quote-book", 

671 "quote-GYLD", 

672 "quote-hansard", 

673 "quotei", 

674 "quote-journal", 

675 "quotelite", 

676 "quote-mailing list", 

677 "quote-meta", 

678 "quote-newsgroup", 

679 "quote-song", 

680 "quote-text", 

681 "quote", 

682 "quote-us-patent", 

683 "quote-video game", 

684 "quote-web", 

685 "quote-wikipedia", 

686 "wikiquote", 

687 "Wikiquote", 

688} 

689 

690taxonomy_templates = { 

691 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

692 "taxfmt", 

693 "taxlink", 

694 "taxlink2", 

695 "taxlinknew", 

696 "taxlook", 

697} 

698 

699# Template names; this was extracted from template_linkage_mappings, 

700# because the code using template_linkage_mappings was actually not used 

701# (but not removed). 

702template_linkages_to_ignore_in_examples: set[str] = { 

703 "syn", 

704 "synonyms", 

705 "ant", 

706 "antonyms", 

707 "hyp", 

708 "hyponyms", 

709 "der", 

710 "derived terms", 

711 "coordinate terms", 

712 "cot", 

713 "rel", 

714 "col", 

715 "inline alt forms", 

716 "alti", 

717 "comeronyms", 

718 "holonyms", 

719 "holo", 

720 "hypernyms", 

721 "hyper", 

722 "meronyms", 

723 "mero", 

724 "troponyms", 

725 "perfectives", 

726 "pf", 

727 "imperfectives", 

728 "impf", 

729 "syndiff", 

730 "synsee", 

731 # not linkage nor example templates 

732 "sense", 

733 "s", 

734 "color panel", 

735 "colour panel", 

736} 

737 

738# Maps template name used in a word sense to a linkage field that it adds. 

739sense_linkage_templates: dict[str, str] = { 

740 "syn": "synonyms", 

741 "synonyms": "synonyms", 

742 "synsee": "synonyms", 

743 "syndiff": "synonyms", 

744 "hyp": "hyponyms", 

745 "hyponyms": "hyponyms", 

746 "ant": "antonyms", 

747 "antonyms": "antonyms", 

748 "alti": "related", 

749 "inline alt forms": "related", 

750 "coordinate terms": "coordinate_terms", 

751 "cot": "coordinate_terms", 

752 "comeronyms": "related", 

753 "holonyms": "holonyms", 

754 "holo": "holonyms", 

755 "hypernyms": "hypernyms", 

756 "hyper": "hypernyms", 

757 "meronyms": "meronyms", 

758 "mero": "meronyms", 

759 "troponyms": "troponyms", 

760 "perfectives": "related", 

761 "pf": "related", 

762 "imperfectives": "related", 

763 "impf": "related", 

764} 

765 

766sense_linkage_templates_tags: dict[str, list[str]] = { 

767 "alti": ["alternative"], 

768 "inline alt forms": ["alternative"], 

769 "comeronyms": ["comeronym"], 

770 "perfectives": ["perfective"], 

771 "pf": ["perfective"], 

772 "imperfectives": ["imperfective"], 

773 "impf": ["imperfective"], 

774} 

775 

776 

777def decode_html_entities(v: Union[str, int]) -> str: 

778 """Decodes HTML entities from a value, converting them to the respective 

779 Unicode characters/strings.""" 

780 if isinstance(v, int): 

781 # I changed this to return str(v) instead of v = str(v), 

782 # but there might have been the intention to have more logic 

783 # here. html.unescape would not do anything special with an integer, 

784 # it needs html escape symbols (&xx;). 

785 return str(v) 

786 return html.unescape(v) 
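# --- Illustrative usage, not part of the original module -------------------
# The function defers to html.unescape() for strings and stringifies ints:
def _decode_html_entities_examples() -> None:  # hypothetical, never called
    assert decode_html_entities("R&amp;D") == "R&D"
    assert decode_html_entities("caf&eacute;") == "café"
    assert decode_html_entities(5) == "5"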

787 

788 

789def parse_sense_linkage( 

790 wxr: WiktextractContext, 

791 data: SenseData, 

792 name: str, 

793 ht: TemplateArgs, 

794 pos: str, 

795) -> None: 

796 """Parses a linkage (synonym, etc) specified in a word sense.""" 

797 assert isinstance(wxr, WiktextractContext) 

798 assert isinstance(data, dict) 

799 assert isinstance(name, str) 

800 assert isinstance(ht, dict) 

801 field = sense_linkage_templates[name] 

802 field_tags = sense_linkage_templates_tags.get(name, []) 

803 for i in range(2, 20): 

804 w = ht.get(i) or "" 

805 w = clean_node(wxr, data, w) 

806 is_thesaurus = False 

807 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

808 if w.startswith(alias):  808 ↛ 809

809 is_thesaurus = True 

810 w = w[len(alias) :] 

811 if w != wxr.wtp.title: 

812 from ...thesaurus import search_thesaurus 

813 

814 lang_code = clean_node(wxr, None, ht.get(1, "")) 

815 for t_data in search_thesaurus( 

816 wxr.thesaurus_db_conn, # type: ignore 

817 w, 

818 lang_code, 

819 pos, 

820 field, # type: ignore 

821 ): 

822 l_data: LinkageData = { 

823 "word": t_data.term, 

824 "source": "Thesaurus:" + w, 

825 } 

826 if len(t_data.tags) > 0: 

827 l_data["tags"] = t_data.tags 

828 if len(t_data.raw_tags) > 0: 

829 l_data["raw_tags"] = t_data.raw_tags 

830 data_append(data, field, l_data) 

831 break 

832 if not w: 

833 break 

834 if is_thesaurus:  834 ↛ 835

835 continue 

836 tags: list[str] = [] 

837 topics: list[str] = [] 

838 english: Optional[str] = None 

839 # Try to find qualifiers for this synonym 

840 q = ht.get("q{}".format(i - 1)) 

841 if q: 

842 cls = classify_desc(q) 

843 if cls == "tags": 

844 tagsets1, topics1 = decode_tags(q) 

845 for ts in tagsets1: 

846 tags.extend(ts) 

847 topics.extend(topics1) 

848 elif cls == "english":  848 ↛ 854

849 if english:  849 ↛ 850

850 english += "; " + q 

851 else: 

852 english = q 

853 # Try to find English translation for this synonym 

854 t = ht.get("t{}".format(i - 1)) 

855 if t:  855 ↛ 856

856 if english: 

857 english += "; " + t 

858 else: 

859 english = t 

860 

861 # See if the linkage contains a parenthesized alt 

862 alt = None 

863 m = re.search(r"\(([^)]+)\)$", w) 

864 if m:  864 ↛ 865

865 w = w[: m.start()].strip() 

866 alt = m.group(1) 

867 

868 dt = {"word": w} 

869 if field_tags:  869 ↛ 870

870 data_extend(dt, "tags", field_tags) 

871 if tags: 

872 data_extend(dt, "tags", tags) 

873 if topics:  873 ↛ 874

874 data_extend(dt, "topics", topics) 

875 if english: 

876 dt["english"] = english # DEPRECATED for "translation" 

877 dt["translation"] = english 

878 if alt:  878 ↛ 879

879 dt["alt"] = alt 

880 data_append(data, field, dt) 

881 
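# --- Illustrative sketch, not part of the original module ------------------
# Roughly how a sense-level linkage template is handled: for a gloss line
# containing {{syn|en|automobile|motorcar}}, the template handler would call
# parse_sense_linkage() with name="syn" and ht={1: "en", 2: "automobile",
# 3: "motorcar"}. The expected effect, assuming a live WiktextractContext
# `wxr`, is sketched below (hypothetical, never called):
def _parse_sense_linkage_example(wxr: WiktextractContext) -> None:
    sense: SenseData = {}
    parse_sense_linkage(
        wxr, sense, "syn", {1: "en", 2: "automobile", 3: "motorcar"}, "noun"
    )
    # Expected, roughly: sense["synonyms"] == [{"word": "automobile"},
    #                                          {"word": "motorcar"}]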

882 

883EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

884example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

885captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 

886 

887 

888def synch_splits_with_args( 

889 line: str, targs: TemplateArgs 

890) -> Optional[list[str]]: 

891 """If it looks like there's something weird with how a line of example 

892 text has been split, this function will do the splitting after counting 

893 occurrences of the splitting regex inside the two main template arguments 

894 containing the string data for the original language example and the 

895 English translations. 

896 """ 

897 # Previously, we split without capturing groups, but here we want to 

898 # keep the original splitting hyphen regex intact. 

899 fparts = captured_splitters_re.split(line) 

900 new_parts = [] 

901 # ["First", " – ", "second", " – ", "third..."] from OL argument 

902 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 

903 new_parts.append("".join(fparts[:first])) 

904 # Translation argument 

905 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

906 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

907 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

908 new_parts.append("".join(fparts[first + 1 : second])) 

909 

910 if all(new_parts): # no empty strings from the above spaghetti 

911 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

912 return new_parts 

913 else: 

914 return None 
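# --- Illustrative sketch, not part of the original module ------------------
# Intended behaviour: when the original-language argument itself contains a
# splitting dash, a naive split of the rendered line yields too many parts;
# this function re-joins them by counting dashes inside the template args.
def _synch_splits_example() -> None:  # hypothetical, never called
    line = "foo ― bar ― some gloss"
    targs: TemplateArgs = {1: "cs", 2: "foo ― bar", 3: "some gloss"}
    assert synch_splits_with_args(line, targs) == ["foo ― bar", "some gloss"]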

915 

916 

917QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

918QUALIFIERS_RE = re.compile(QUALIFIERS) 

919# (...): ... or (...(...)...): ... 
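# --- Illustrative sketch, not part of the original module ------------------
# QUALIFIERS_RE strips a leading parenthesized qualifier (one nesting level
# allowed) plus an optional colon and trailing whitespace:
def _qualifiers_re_examples() -> None:  # hypothetical, never called
    m = QUALIFIERS_RE.match("(informal, chiefly UK): rest of the line")
    assert m is not None and m.group(1) == "informal, chiefly UK"
    assert QUALIFIERS_RE.match("no leading qualifier here") is None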

920 

921 

922def parse_language( 

923 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

924) -> list[WordData]: 

925 """Iterates over the text of the page, returning words (parts-of-speech) 

926 defined on the page one at a time. (Individual word senses for the 

927 same part-of-speech are typically encoded in the same entry.)""" 

928 # imported here to avoid circular import 

929 from .pronunciation import parse_pronunciation 

930 

931 assert isinstance(wxr, WiktextractContext) 

932 assert isinstance(langnode, WikiNode) 

933 assert isinstance(language, str) 

934 assert isinstance(lang_code, str) 

935 # print("parse_language", language) 

936 

937 is_reconstruction = False 

938 word: str = wxr.wtp.title # type: ignore[assignment] 

939 unsupported_prefix = "Unsupported titles/" 

940 if word.startswith(unsupported_prefix): 

941 w = word[len(unsupported_prefix) :] 

942 if w in unsupported_title_map:  942 ↛ 945

943 word = unsupported_title_map[w] 

944 else: 

945 wxr.wtp.error( 

946 "Unimplemented unsupported title: {}".format(word), 

947 sortid="page/870", 

948 ) 

949 word = w 

950 elif word.startswith("Reconstruction:"): 

951 word = word[word.find("/") + 1 :] 

952 is_reconstruction = True 

953 

954 base_data: WordData = { 

955 "word": word, 

956 "lang": language, 

957 "lang_code": lang_code, 

958 } 

959 if is_reconstruction: 

960 data_append(base_data, "tags", "reconstruction") 

961 sense_data: SenseData = {} 

962 pos_data: WordData = {} # For a current part-of-speech 

963 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

964 etym_data: WordData = {} # For one etymology 

965 sense_datas: list[SenseData] = [] 

966 sense_ordinal = 0 # The recursive sense parsing messes up the ordering 

967 # Never reset, do not use as data 

968 level_four_datas: list[WordData] = [] 

969 etym_datas: list[WordData] = [] 

970 page_datas: list[WordData] = [] 

971 have_etym = False 

972 inside_level_four = False # This is for checking if the etymology section 

973 # or article has a Pronunciation section, for Chinese mostly; because 

974 # Chinese articles can have three level three sections (two etymology 

975 # sections and pronunciation sections) one after another, we need a kludge 

976 # to better keep track of whether we're in a normal "etym" or inside a 

977 # "level four" (which is what we've turned the level three Pron sections 

978 # into in fix_subtitle_hierarchy()); all other sections are demoted by 

979 # a step. 

980 stack: list[str] = [] # names of items on the "stack" 

981 

982 def merge_base(data: WordData, base: WordData) -> None: 

983 for k, v in base.items(): 

984 # Copy the value to ensure that we don't share lists or 

985 # dicts between structures (even nested ones). 

986 v = copy.deepcopy(v) 

987 if k not in data: 

988 # The list was copied above, so this will not create shared ref 

989 data[k] = v # type: ignore[literal-required] 

990 continue 

991 if data[k] == v: # type: ignore[literal-required] 

992 continue 

993 if (  993 ↛ 1001

994 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

995 or isinstance( 

996 v, 

997 (list, tuple), # Should this be "and"? 

998 ) 

999 ): 

1000 data[k] = list(data[k]) + list(v) # type: ignore 

1001 elif data[k] != v: # type: ignore[literal-required] 

1002 wxr.wtp.warning( 

1003 "conflicting values for {} in merge_base: " 

1004 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

1005 sortid="page/904", 

1006 ) 

1007 

1008 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

1009 """Remove unnecessary keys from dict values 

1010 in a list comprehension...""" 

1011 if key in pron: 

1012 pron.pop(key) # type: ignore 

1013 return pron 

1014 

1015 # If the result has sounds, eliminate sounds that have a prefix that 

1016 # does not match "word" or one of "forms" 

1017 if "sounds" in data and "word" in data: 

1018 accepted = [data["word"]] 

1019 accepted.extend(f["form"] for f in data.get("forms", dict())) 

1020 data["sounds"] = list( 

1021 s 

1022 for s in data["sounds"] 

1023 if "form" not in s or s["form"] in accepted 

1024 ) 

1025 # If the result has sounds, eliminate sounds that have a pos that 

1026 # does not match "pos" 

1027 if "sounds" in data and "pos" in data: 

1028 data["sounds"] = list( 

1029 complementary_pop(s, "pos") 

1030 for s in data["sounds"] 

1031 # "pos" is not a field of SoundData, correctly, so we're 

1032 # removing it here. It's a kludge on a kludge on a kludge. 

1033 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

1034 ) 
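    # --- Illustrative sketch, not part of the original module --------------
    # merge_base() copies keys that are missing from `data` and concatenates
    # list-valued keys present in both (hypothetical, never called):
    def _merge_base_example() -> None:
        d: WordData = {"word": "talo", "tags": ["a"]}
        merge_base(d, {"lang": "Finnish", "tags": ["b"]})
        assert d == {"word": "talo", "tags": ["a", "b"], "lang": "Finnish"}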

1035 

1036 def push_sense(sorting_ordinal: int | None = None) -> bool: 

1037 """Starts collecting data for a new word sense. This returns True 

1038 if a sense was added.""" 

1039 nonlocal sense_data 

1040 if sorting_ordinal is None: 

1041 sorting_ordinal = sense_ordinal 

1042 tags = sense_data.get("tags", ()) 

1043 if ( 

1044 not sense_data.get("glosses") 

1045 and "translation-hub" not in tags 

1046 and "no-gloss" not in tags 

1047 ): 

1048 return False 

1049 

1050 if (  1050 ↛ 1060

1051 ( 

1052 "participle" in sense_data.get("tags", ()) 

1053 or "infinitive" in sense_data.get("tags", ()) 

1054 ) 

1055 and "alt_of" not in sense_data 

1056 and "form_of" not in sense_data 

1057 and "etymology_text" in etym_data 

1058 and etym_data["etymology_text"] != "" 

1059 ): 

1060 etym = etym_data["etymology_text"] 

1061 etym = etym.split(". ")[0] 

1062 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

1063 if ret is not None: 

1064 tags, lst = ret 

1065 assert isinstance(lst, (list, tuple)) 

1066 if "form-of" in tags: 

1067 data_extend(sense_data, "form_of", lst) 

1068 data_extend(sense_data, "tags", tags) 

1069 elif "alt-of" in tags: 

1070 data_extend(sense_data, "alt_of", lst) 

1071 data_extend(sense_data, "tags", tags) 

1072 

1073 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(  1073 ↛ 1076

1074 "tags", () 

1075 ): 

1076 data_append(sense_data, "tags", "no-gloss") 

1077 

1078 sense_data["__temp_sense_sorting_ordinal"] = sorting_ordinal # type: ignore 

1079 sense_datas.append(sense_data) 

1080 sense_data = {} 

1081 return True 

1082 

1083 def push_pos(sorting_ordinal: int | None = None) -> None: 

1084 """Starts collecting data for a new part-of-speech.""" 

1085 nonlocal pos_data 

1086 nonlocal sense_datas 

1087 push_sense(sorting_ordinal) 

1088 if wxr.wtp.subsection: 

1089 data: WordData = {"senses": sense_datas} 

1090 merge_base(data, pos_data) 

1091 level_four_datas.append(data) 

1092 pos_data = {} 

1093 sense_datas = [] 

1094 wxr.wtp.start_subsection(None) 

1095 

1096 def push_level_four_section(clear_sound_data: bool) -> None: 

1097 """Starts collecting data for a new level four section, which 

1098 is usually virtual and empty, unless the article has Chinese 

1099 'Pronunciation' sections that are etymology-section-like but 

1100 under etymology, and at the same level in the source. We modify 

1101 the source to demote Pronunciation sections like that to level 

1102 4, and other sections one step lower.""" 

1103 nonlocal level_four_data 

1104 nonlocal level_four_datas 

1105 nonlocal etym_datas 

1106 push_pos() 

1107 # print(f"======\n{etym_data=}") 

1108 # print(f"======\n{etym_datas=}") 

1109 # print(f"======\n{level_four_data=}") 

1110 # print(f"======\n{level_four_datas=}") 

1111 for data in level_four_datas: 

1112 merge_base(data, level_four_data) 

1113 etym_datas.append(data) 

1114 for data in etym_datas: 

1115 merge_base(data, etym_data) 

1116 page_datas.append(data) 

1117 if clear_sound_data: 

1118 level_four_data = {} 

1119 level_four_datas = [] 

1120 etym_datas = [] 

1121 

1122 def push_etym() -> None: 

1123 """Starts collecting data for a new etymology.""" 

1124 nonlocal etym_data 

1125 nonlocal etym_datas 

1126 nonlocal have_etym 

1127 nonlocal inside_level_four 

1128 have_etym = True 

1129 push_level_four_section(False) 

1130 inside_level_four = False 

1131 # the etymology section could be under a pronunciation section 

1132 etym_data = ( 

1133 copy.deepcopy(level_four_data) if len(level_four_data) > 0 else {} 

1134 ) 

1135 

1136 def select_data() -> WordData: 

1137 """Selects where to store data (pos or etym) based on whether we 

1138 are inside a pos (part-of-speech).""" 

1139 # print(f"{wxr.wtp.subsection=}") 

1140 # print(f"{stack=}") 

1141 if wxr.wtp.subsection is not None: 

1142 return pos_data 

1143 if inside_level_four: 

1144 return level_four_data 

1145 if stack[-1] == language: 

1146 return base_data 

1147 return etym_data 

1148 

1149 term_label_templates: list[TemplateData] = [] 

1150 

1151 def head_post_template_fn( 

1152 name: str, ht: TemplateArgs, expansion: str 

1153 ) -> Optional[str]: 

1154 """Handles special templates in the head section of a word. Head 

1155 section is the text after part-of-speech subtitle and before word 

1156 sense list. Typically it generates the bold line for the word, but 

1157 may also contain other useful information that often ends up in 

1158 side boxes. We want to capture some of that additional information.""" 

1159 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1160 if is_panel_template(wxr, name):  1160 ↛ 1163

1161 # Completely ignore these templates (not even recorded in 

1162 # head_templates) 

1163 return "" 

1164 if name == "head": 

1165 # XXX are these also captured in forms? Should this special case 

1166 # be removed? 

1167 t = ht.get(2, "") 

1168 if t == "pinyin":  1168 ↛ 1169

1169 data_append(pos_data, "tags", "Pinyin") 

1170 elif t == "romanization":  1170 ↛ 1171

1171 data_append(pos_data, "tags", "romanization") 

1172 if ( 

1173 HEAD_TAG_RE.search(name) is not None 

1174 or name in WORD_LEVEL_HEAD_TEMPLATES 

1175 ): 

1176 args_ht = clean_template_args(wxr, ht) 

1177 cleaned_expansion = clean_node(wxr, None, expansion) 

1178 dt: TemplateData = { 

1179 "name": name, 

1180 "args": args_ht, 

1181 "expansion": cleaned_expansion, 

1182 } 

1183 data_append(pos_data, "head_templates", dt) 

1184 if name in WORD_LEVEL_HEAD_TEMPLATES: 

1185 term_label_templates.append(dt) 

1186 # Squash these, their tags are applied to the whole word, 

1187 # and some cause problems like "term-label" 

1188 return "" 

1189 

1190 # The following are both captured in head_templates and parsed 

1191 # separately 

1192 

1193 if name in wikipedia_templates: 

1194 # Note: various places expect to have content from wikipedia 

1195 # templates, so cannot convert this to empty 

1196 parse_wikipedia_template(wxr, pos_data, ht) 

1197 return None 

1198 

1199 if name == "number box":  1199 ↛ 1201

1200 # XXX extract numeric value? 

1201 return "" 

1202 if name == "enum": 

1203 # XXX extract? 

1204 return "" 

1205 if name == "cardinalbox":  1205 ↛ 1208

1206 # XXX extract similar to enum? 

1207 # XXX this can also occur in top-level under language 

1208 return "" 

1209 if name == "Han simplified forms":  1209 ↛ 1211

1210 # XXX extract? 

1211 return "" 

1212 # if name == "ja-kanji forms": 

1213 # # XXX extract? 

1214 # return "" 

1215 # if name == "vi-readings": 

1216 # # XXX extract? 

1217 # return "" 

1218 # if name == "ja-kanji": 

1219 # # XXX extract? 

1220 # return "" 

1221 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":  1221 ↛ 1223

1222 # XXX extract? 

1223 return "" 

1224 

1225 return None 

1226 

1227 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1228 """Parses the subsection for a part-of-speech under a language on 

1229 a page.""" 

1230 assert isinstance(posnode, WikiNode) 

1231 assert isinstance(pos, str) 

1232 # print("parse_part_of_speech", pos) 

1233 pos_data["pos"] = pos 

1234 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1235 lists: list[list[WikiNode]] = [[]] # list of lists 

1236 first_para = True 

1237 first_head_tmplt = True 

1238 collecting_head = True 

1239 start_of_paragraph = True 

1240 

1241 # XXX extract templates from posnode with recursively_extract 

1242 # that break stuff, like ja-kanji or az-suffix-form. 

1243 # Do the extraction with a list of template names, combined from 

1244 # different lists, then separate out them into different lists 

1245 # that are handled at different points of the POS section. 

1246 # First, extract az-suffix-form, put it in `inflection`, 

1247 # and parse `inflection`'s content when appropriate later. 

1248 # The contents of az-suffix-form (and ja-kanji) that generate 

1249 # divs with "floatright" in their style gets deleted by 

1250 # clean_value, so templates that slip through from here won't 

1251 # break anything. 

1252 # XXX bookmark 

1253 # print("===================") 

1254 # print(posnode.children) 

1255 

1256 floaters, poschildren = recursively_extract( 

1257 posnode.children, 

1258 lambda x: ( 

1259 isinstance(x, WikiNode) 

1260 and ( 

1261 ( 

1262 isinstance(x, TemplateNode) 

1263 and x.template_name in FLOATING_TABLE_TEMPLATES 

1264 ) 

1265 or ( 

1266 x.kind == NodeKind.LINK 

1267 # Need to check for stringiness because some links are 

1268 # broken; for example, if a template is missing an 

1269 # argument, a link might look like `[[{{{1}}}...]]` 

1270 and len(x.largs) > 0 

1271 and len(x.largs[0]) > 0 

1272 and isinstance(x.largs[0][0], str) 

1273 and x.largs[0][0].lower().startswith("file:") # type:ignore[union-attr] 

1274 ) 

1275 ) 

1276 ), 

1277 ) 

1278 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1279 tempnode.largs = [["Inflection"]] 

1280 tempnode.children = floaters 

1281 parse_inflection(tempnode, "Floating Div", pos) 

1282 # print(poschildren) 

1283 # XXX new above 

1284 

1285 if not poschildren:  1285 ↛ 1286

1286 if not floaters: 

1287 wxr.wtp.debug( 

1288 "PoS section without contents", 

1289 sortid="en/page/1051/20230612", 

1290 ) 

1291 else: 

1292 wxr.wtp.debug( 

1293 "PoS section without contents except for a floating table", 

1294 sortid="en/page/1056/20230612", 

1295 ) 

1296 return 

1297 

1298 for node in poschildren: 

1299 if isinstance(node, str): 

1300 for m in re.finditer(r"\n+|[^\n]+", node): 

1301 p = m.group(0) 

1302 if p.startswith("\n\n") and pre: 

1303 first_para = False 

1304 start_of_paragraph = True 

1305 break 

1306 if p and collecting_head: 

1307 pre[-1].append(p) 

1308 continue 

1309 assert isinstance(node, WikiNode) 

1310 kind = node.kind 

1311 if kind == NodeKind.LIST: 

1312 lists[-1].append(node) 

1313 collecting_head = False 

1314 start_of_paragraph = True 

1315 continue 

1316 elif kind in LEVEL_KINDS: 

1317 # Stop parsing section if encountering any kind of 

1318 # level header (like ===Noun=== or ====Further Reading====). 

1319 # At a quick glance, this should be the default behavior, 

1320 # but if some kinds of source articles have sub-sub-sections 

1321 # that should be parsed XXX it should be handled by changing 

1322 # this break. 

1323 break 

1324 elif collecting_head and kind == NodeKind.LINK: 

1325 # We might collect relevant links as they are often pictures 

1326 # relating to the word 

1327 if len(node.largs[0]) >= 1 and isinstance(  1327 ↛ 1342

1328 node.largs[0][0], str 

1329 ): 

1330 if node.largs[0][0].startswith(  1330 ↛ 1336

1331 ns_title_prefix_tuple(wxr, "Category") 

1332 ): 

1333 # [[Category:...]] 

1334 # We're at the end of the file, probably, so stop 

1335 # here. Otherwise the head will get garbage. 

1336 break 

1337 if node.largs[0][0].startswith(  1337 ↛ 1342

1338 ns_title_prefix_tuple(wxr, "File") 

1339 ): 

1340 # Skips file links 

1341 continue 

1342 start_of_paragraph = False 

1343 pre[-1].extend(node.largs[-1]) 

1344 elif kind == NodeKind.HTML: 

1345 if node.sarg == "br": 

1346 if pre[-1]:  1346 ↛ 1298

1347 pre.append([]) # Switch to next head 

1348 lists.append([]) # Lists parallels pre 

1349 collecting_head = True 

1350 start_of_paragraph = True 

1351 elif collecting_head and node.sarg not in (  1351 ↛ 1357

1352 "gallery", 

1353 "ref", 

1354 "cite", 

1355 "caption", 

1356 ): 

1357 start_of_paragraph = False 

1358 pre[-1].append(node) 

1359 else: 

1360 start_of_paragraph = False 

1361 elif isinstance(node, TemplateNode): 

1362 # XXX Insert code here that disambiguates between 

1363 # templates that generate word heads and templates 

1364 # that don't. 

1365 # There's head_tag_re that seems like a regex meant 

1366 # to identify head templates. Too bad it's None. 

1367 

1368 # ignore {{category}}, {{cat}}... etc. 

1369 if node.template_name in stop_head_at_these_templates: 

1370 # we've reached a template that should be at the end, 

1371 continue 

1372 

1373 # skip these templates; panel_templates is already used 

1374 # to skip certain templates elsewhere, but it also applies to 

1375 # head parsing quite well. 

1376 # node.largs[0][0] should always be str, but can't type-check 

1377 # that. 

1378 if is_panel_template(wxr, node.template_name): 

1379 continue 

1380 # skip these templates 

1381 # if node.largs[0][0] in skip_these_templates_in_head: 

1382 # first_head_tmplt = False # no first_head_tmplt at all 

1383 # start_of_paragraph = False 

1384 # continue 

1385 

1386 if first_head_tmplt and pre[-1]: 

1387 first_head_tmplt = False 

1388 start_of_paragraph = False 

1389 pre[-1].append(node) 

1390 elif pre[-1] and start_of_paragraph: 

1391 pre.append([]) # Switch to the next head 

1392 lists.append([]) # lists parallel pre 

1393 collecting_head = True 

1394 start_of_paragraph = False 

1395 pre[-1].append(node) 

1396 else: 

1397 pre[-1].append(node) 

1398 elif first_para: 

1399 start_of_paragraph = False 

1400 if collecting_head:  1400 ↛ 1298

1401 pre[-1].append(node) 

1402 # XXX use template_fn in clean_node to check that the head macro 

1403 # is compatible with the current part-of-speech and generate warning 

1404 # if not. Use template_allowed_pos_map. 

1405 

1406 # Clean up empty pairs, and fix messes with extra newlines that 

1407 # separate templates that are followed by lists wiktextract issue #314 

1408 

1409 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1410 cleaned_lists: list[list[WikiNode]] = [] 

1411 pairless_pre_index = None 

1412 

1413 for pre1, ls in zip(pre, lists): 

1414 if pre1 and not ls: 

1415 pairless_pre_index = len(cleaned_pre) 

1416 if not pre1 and not ls:  1416 ↛ 1418

1417 # skip [] + [] 

1418 continue 

1419 if not ls and all( 

1420 (isinstance(x, str) and not x.strip()) for x in pre1 

1421 ): 

1422 # skip ["\n", " "] + [] 

1423 continue 

1424 if ls and not pre1: 

1425 if pairless_pre_index is not None:  1425 ↛ 1426

1426 cleaned_lists[pairless_pre_index] = ls 

1427 pairless_pre_index = None 

1428 continue 

1429 cleaned_pre.append(pre1) 

1430 cleaned_lists.append(ls) 

1431 

1432 pre = cleaned_pre 

1433 lists = cleaned_lists 

1434 

1435 there_are_many_heads = len(pre) > 1 

1436 header_tags: list[str] = [] 

1437 header_topics: list[str] = [] 

1438 previous_head_had_list = False 

1439 

1440 if not any(g for g in lists): 

1441 process_gloss_without_list( 

1442 poschildren, pos, pos_data, header_tags, header_topics 

1443 ) 

1444 else: 

1445 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1446 # if len(ls) == 0: 

1447 # # don't have gloss list 

1448 # # XXX add code here to filter out 'garbage', like text 

1449 # # that isn't a head template or head. 

1450 # continue 

1451 

1452 if all(not sl for sl in lists[i:]): 

1453 if i == 0:  1453 ↛ 1454

1454 if isinstance(node, str): 

1455 wxr.wtp.debug( 

1456 "first head without list of senses," 

1457 "string: '{}[...]', {}/{}".format( 

1458 node[:20], word, language 

1459 ), 

1460 sortid="page/1689/20221215", 

1461 ) 

1462 if isinstance(node, WikiNode): 

1463 if node.largs and node.largs[0][0] in [ 

1464 "Han char", 

1465 ]: 

1466 # just ignore these templates 

1467 pass 

1468 else: 

1469 wxr.wtp.debug( 

1470 "first head without " 

1471 "list of senses, " 

1472 "template node " 

1473 "{}, {}/{}".format( 

1474 node.largs, word, language 

1475 ), 

1476 sortid="page/1694/20221215", 

1477 ) 

1478 else: 

1479 wxr.wtp.debug( 

1480 "first head without list of senses, " 

1481 "{}/{}".format(word, language), 

1482 sortid="page/1700/20221215", 

1483 ) 

1484 # no break here so that the first head always 

1485 # gets processed. 

1486 else: 

1487 if isinstance(node, str):  1487 ↛ 1488

1488 wxr.wtp.debug( 

1489 "later head without list of senses," 

1490 "string: '{}[...]', {}/{}".format( 

1491 node[:20], word, language 

1492 ), 

1493 sortid="page/1708/20221215", 

1494 ) 

1495 if isinstance(node, WikiNode):  1495 ↛ 1507

1496 wxr.wtp.debug( 

1497 "later head without list of senses," 

1498 "template node " 

1499 "{}, {}/{}".format( 

1500 node.sarg if node.sarg else node.largs, 

1501 word, 

1502 language, 

1503 ), 

1504 sortid="page/1713/20221215", 

1505 ) 

1506 else: 

1507 wxr.wtp.debug( 

1508 "later head without list of senses, " 

1509 "{}/{}".format(word, language), 

1510 sortid="page/1719/20221215", 

1511 ) 

1512 break 

1513 head_group = i + 1 if there_are_many_heads else None 

1514 # print("parse_part_of_speech: {}: {}: pre={}" 

1515 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1516 

1517 if previous_head_had_list: 

1518 # We use a boolean flag here because we want to be able to 

1519 # let the header_tags data pass through after the loop 

1520 # is over without accidentally emptying it, if there are 

1521 # no pos_datas and we need a dummy entry. 

1522 header_tags.clear() 

1523 header_topics.clear() 

1524 

1525 process_gloss_header( 

1526 pre1, pos, head_group, pos_data, header_tags, header_topics 

1527 ) 

1528 for ln in ls: 

1529 # Parse each list associated with this head. 

1530 for node in ln.children: 

1531 # Parse nodes in l.children recursively. 

1532 # The recursion function uses push_sense() to 

1533 # add stuff into sense_datas, and returns True or 

1534 # False if something is added, which bubbles upward. 

1535 # If the bubble is "True", then higher levels of 

1536 # the recursion will not push_sense(), because 

1537 # the data is already pushed into a sub-gloss 

1538 # downstream, unless the higher level has examples 

1539 # that need to be put somewhere. 

1540 common_data: SenseData = { 

1541 "tags": list(header_tags), 

1542 "topics": list(header_topics), 

1543 } 

1544 if head_group: 

1545 common_data["head_nr"] = head_group 

1546 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1547 

1548 if len(ls) > 0: 

1549 previous_head_had_list = True 

1550 else: 

1551 previous_head_had_list = False 

1552 

1553 # If there are no senses extracted, add a dummy sense. We want to 

1554 # keep tags extracted from the head for the dummy sense. 

1555 push_sense() # Make sure unfinished data pushed, and start clean sense 

1556 if len(sense_datas) == 0: 

1557 data_extend(sense_data, "tags", header_tags) 

1558 data_extend(sense_data, "topics", header_topics) 

1559 data_append(sense_data, "tags", "no-gloss") 

1560 push_sense() 

1561 

1562 sense_datas.sort(key=lambda x: x.get("__temp_sense_sorting_ordinal", 0)) # type: ignore 

1563 

1564 for sd in sense_datas: 

1565 if "__temp_sense_sorting_ordinal" in sd:  1565 ↛ 1564

1566 del sd["__temp_sense_sorting_ordinal"] # type: ignore 

1567 

1568 def process_gloss_header( 

1569 header_nodes: list[Union[WikiNode, str]], 

1570 pos_type: str, 

1571 header_group: Optional[int], 

1572 pos_data: WordData, 

1573 header_tags: list[str], 

1574 header_topics: list[str], 

1575 ) -> None: 

1576 ruby = [] 

1577 links: list[str] = [] 

1578 

1579 # process template parse nodes here 

1580 new_nodes = [] 

1581 info_template_data = [] 

1582 for node in header_nodes: 

1583 # print(f"{node=}") 

1584 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1585 if info_data or info_out: 

1586 if info_data:  1586 ↛ 1588

1587 info_template_data.append(info_data) 

1588 if info_out: # including just the original node  1588 ↛ 1589

1589 new_nodes.append(info_out) 

1590 else: 

1591 new_nodes.append(node) 

1592 header_nodes = new_nodes 

1593 

1594 if info_template_data: 

1595 if "info_templates" not in pos_data:  1595 ↛ 1598

1596 pos_data["info_templates"] = info_template_data 

1597 else: 

1598 pos_data["info_templates"].extend(info_template_data) 

1599 

1600 if not word.isalnum(): 

1601 # `-` is kosher, add more of these if needed. 

1602 if word.replace("-", "").isalnum(): 

1603 pass 

1604 else: 

1605 # if the word contains non-letter or -number characters, it 

1606 # might have something that messes with split-at-semi-comma; we 

1607 # collect links so that we can skip splitting them. 

1608 exp = wxr.wtp.parse( 

1609 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1610 ) 

1611 link_nodes, _ = recursively_extract( 

1612 exp.children, 

1613 lambda x: isinstance(x, WikiNode) 

1614 and x.kind == NodeKind.LINK, 

1615 ) 

1616 for ln in link_nodes: 

1617 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1618 if not ltext.isalnum(): 

1619 links.append(ltext) 

1620 if word not in links:  1620 ↛ 1623

1621 links.append(word) 

1622 

1623 if lang_code == "ja": 

1624 exp = wxr.wtp.parse( 

1625 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1626 ) 

1627 rub, _ = recursively_extract( 

1628 exp.children, 

1629 lambda x: isinstance(x, WikiNode) 

1630 and x.kind == NodeKind.HTML 

1631 and x.sarg == "ruby", 

1632 ) 

1633 if rub is not None:  1633 ↛ 1678

1634 for r in rub: 

1635 if TYPE_CHECKING: 

1636 # we know the lambda above in recursively_extract 

1637 # returns only WikiNodes in rub 

1638 assert isinstance(r, WikiNode) 

1639 rt = parse_ruby(wxr, r) 

1640 if rt is not None:  1640 ↛ 1634

1641 ruby.append(rt) 

1642 elif lang_code == "vi": 

1643 # Handle vi-readings templates that have a weird structure for 

1644 # Chu Nom Vietnamese character heads 

1645 # https://en.wiktionary.org/wiki/Template:vi-readings 

1646 new_header_nodes = [] 

1647 related_readings: list[LinkageData] = [] 

1648 for node in header_nodes: 

1649 if (  1649 ↛ 1673

1650 isinstance(node, TemplateNode) 

1651 and node.template_name == "vi-readings" 

1652 ): 

1653 print(node.template_parameters) 

1654 for parameter, tag in ( 

1655 ("hanviet", "han-viet-reading"), 

1656 ("nom", "nom-reading"), 

1657 # we ignore the fanqie parameter "phienthiet" 

1658 ): 

1659 arg = node.template_parameters.get(parameter) 

1660 if arg is not None:  1660 ↛ 1654

1661 text = clean_node(wxr, None, arg) 

1662 for w in text.split(","): 

1663 # ignore - separated references 

1664 if "-" in w: 

1665 w = w[: w.index("-")] 

1666 w = w.strip() 

1667 related_readings.append( 

1668 LinkageData(word=w, tags=[tag]) 

1669 ) 

1670 continue 

1671 

1672 # Skip the vi-reading template for the rest of the head parsing 

1673 new_header_nodes.append(node) 

1674 if len(related_readings) > 0:  1674 ↛ 1678

1675 data_extend(pos_data, "related", related_readings) 

1676 header_nodes = new_header_nodes 

1677 

1678 header_text = clean_node( 

1679 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1680 ) 

1681 

1682 if not header_text.strip(): 

1683 return 

1684 

1685 term_label_tags: list[str] = [] 

1686 term_label_topics: list[str] = [] 

1687 if len(term_label_templates) > 0: 

1688 # parse term label templates; if there are other similar kinds 

1689 # of templates in headers that you want to squash and apply as 

1690 # tags, you can add them to WORD_LEVEL_HEAD_TEMPLATES 

1691 for templ_data in term_label_templates: 

1692 # print(templ_data) 

1693 expan = templ_data.get("expansion", "").strip("().,; ") 

1694 if not expan:  1694 ↛ 1695

1695 continue 

1696 tlb_tagsets, tlb_topics = decode_tags(expan) 

1697 for tlb_tags in tlb_tagsets: 

1698 if len(tlb_tags) > 0 and not any( 

1699 t.startswith("error-") for t in tlb_tags 

1700 ): 

1701 term_label_tags.extend(tlb_tags) 

1702 term_label_topics.extend(tlb_topics) 

1703 # print(f"{tlb_tagsets=}, {tlb_topicsets=}") 

1704 

1705 header_text = re.sub(r"\s+", " ", header_text) 

1706 # print(f"{header_text=}") 

1707 parse_word_head( 

1708 wxr, 

1709 pos_type, 

1710 header_text, 

1711 pos_data, 

1712 is_reconstruction, 

1713 header_group, 

1714 ruby=ruby, 

1715 links=links, 

1716 ) 

1717 if "tags" in pos_data: 

1718 # pos_data can get "tags" data from some source; type-checkers 

1719 # don't like it, so let's ignore it. 

1720 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1721 del pos_data["tags"] # type: ignore[typeddict-item] 

1722 if len(term_label_tags) > 0: 

1723 header_tags.extend(term_label_tags) 

1724 if len(term_label_topics) > 0: 

1725 header_topics.extend(term_label_topics) 

1726 

1727 def process_gloss_without_list( 

1728 nodes: list[Union[WikiNode, str]], 

1729 pos_type: str, 

1730 pos_data: WordData, 

1731 header_tags: list[str], 

1732 header_topics: list[str], 

1733 ) -> None: 

1734 # gloss text might not be inside a list 

1735 header_nodes: list[Union[str, WikiNode]] = [] 

1736 gloss_nodes: list[Union[str, WikiNode]] = [] 

1737 for node in strip_nodes(nodes): 

1738 if isinstance(node, WikiNode): 

1739 if isinstance(node, TemplateNode): 

1740 if node.template_name in ( 

1741 "zh-see", 

1742 "ja-see", 

1743 "ja-see-kango", 

1744 ): 

1745 continue # soft redirect 

1746 elif ( 

1747 node.template_name == "head" 

1748 or node.template_name.startswith(f"{lang_code}-") 

1749 ): 

1750 header_nodes.append(node) 

1751 continue 

1752 elif node.kind in LEVEL_KINDS: # following nodes are not gloss 1752 ↛ 1754line 1752 didn't jump to line 1754 because the condition on line 1752 was always true

1753 break 

1754 gloss_nodes.append(node) 

1755 

1756 if len(header_nodes) > 0: 

1757 process_gloss_header( 

1758 header_nodes, 

1759 pos_type, 

1760 None, 

1761 pos_data, 

1762 header_tags, 

1763 header_topics, 

1764 ) 

1765 if len(gloss_nodes) > 0: 

1766 process_gloss_contents( 

1767 gloss_nodes, 

1768 pos_type, 

1769 {"tags": list(header_tags), "topics": list(header_topics)}, 

1770 ) 

1771 

1772 def parse_sense_node( 

1773 node: Union[str, WikiNode], # never receives str 

1774 sense_base: SenseData, 

1775 pos: str, 

1776 ) -> bool: 

1777 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1778 Uses push_sense() to attempt adding data to pos_data in the scope 

1779 of parse_language() when the recursion reaches deep enough. push_sense() 

1780 returns True if it succeeds, and that is bubbled up the stack; if 

1781 a sense was added downstream, the higher levels (whose shared data 

1782 was already added by a subsense) do not push_sense(), unless they 

1783 have examples that need to be put somewhere. 

1784 """ 
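# Rough sketch of the bubbling described in the docstring above (only
# parse_sense_node/push_sense are real names; the rest are placeholders):
#
#     added = False
#     for child_list_item in subentries:
#         added |= parse_sense_node(child_list_item, sense_base, pos)
#     if not added:
#         added |= push_sense()  # push here only if nothing was pushed below
#     return added
#
# so the shared data is emitted once, at the deepest level that succeeds.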

1785 assert isinstance(sense_base, dict) # Added to every sense deeper in the recursion 

1786 

1787 nonlocal sense_ordinal 

1788 my_ordinal = sense_ordinal # copies, not a reference 

1789 sense_ordinal += 1 # only use for sorting 

1790 

1791 if not isinstance(node, WikiNode): 1791 ↛ 1793line 1791 didn't jump to line 1793 because the condition on line 1791 was never true

1792 # This doesn't seem to ever happen in practice. 

1793 wxr.wtp.debug( 

1794 "{}: parse_sense_node called with " 

1795 "something that isn't a WikiNode".format(pos), 

1796 sortid="page/1287/20230119", 

1797 ) 

1798 return False 

1799 

1800 if node.kind != NodeKind.LIST_ITEM: 1800 ↛ 1801line 1800 didn't jump to line 1801 because the condition on line 1800 was never true

1801 wxr.wtp.debug( 

1802 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1803 ) 

1804 return False 

1805 

1806 if node.sarg == ":": 

1807 # Skip example entries at the highest level, ones without 

1808 # a sense ("...#") above them. 

1809 # If node.sarg is exactly and only ":", then it's at 

1810 # the highest level; lower levels would have more 

1811 # "indentation", like "#:" or "##:" 

1812 return False 

1813 

1814 # If a recursion call succeeds in push_sense(), bubble it up with 

1815 # `added`. 

1816 # Use added |= push_sense() or added |= parse_sense_node(...) to OR it in. 

1817 added = False 

1818 

1819 gloss_template_args: set[str] = set() 

1820 

1821 # For LISTs and LIST_ITEMS, their argument is something like 

1822 # "##" or "##:", and using that we can roughly determine 

1823 # list 'depth' if need be, and also what kind of list or 

1824 # entry it is; # is for normal glosses, : for examples (indent) 

1825 # and * is used for quotations on wiktionary. 

1826 current_depth = node.sarg 

1827 

1828 children = node.children 

1829 

1830 # subentries, (presumably) a list 

1831 # of subglosses below this. The list's 

1832 # argument ends with #, and its depth should 

1833 # be greater than the parent node's. 

1834 subentries = [ 

1835 x 

1836 for x in children 

1837 if isinstance(x, WikiNode) 

1838 and x.kind == NodeKind.LIST 

1839 and x.sarg == current_depth + "#" 

1840 ] 

1841 

1842 # sublists of examples and quotations. .sarg 

1843 # does not end with "#". 

1844 others = [ 

1845 x 

1846 for x in children 

1847 if isinstance(x, WikiNode) 

1848 and x.kind == NodeKind.LIST 

1849 and x.sarg != current_depth + "#" 

1850 ] 
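# Illustrative example for the two comprehensions above: if this node's
# sarg (current_depth) is "##", a child LIST with sarg "###" lands in
# `subentries` (a deeper gloss list), while child LISTs with sargs such as
# "##:" (examples) or "##*" (quotations) land in `others`.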

1851 

1852 # the actual contents of this particular node. 

1853 # can be a gloss (or a template that expands into 

1854 # many glosses which we can't easily pre-expand) 

1855 # or could be an "outer gloss" with more specific 

1856 # subglosses, or could be a qualifier for the subglosses. 

1857 contents = [ 

1858 x 

1859 for x in children 

1860 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1861 ] 

1862 # If this entry has sublists of entries, we should combine 

1863 # gloss information from both the "outer" and sublist content. 

1864 # Sometimes the outer gloss 

1865 # is more of a non-gloss description or tags, sometimes it is a coarse sense 

1866 # and the inner glosses are more specific. The outer one 

1867 # does not seem to have qualifiers. 

1868 

1869 # If we have one sublist with one element, treat it 

1870 # specially as it may be a Wiktionary error; raise 

1871 # that nested element to the same level. 

1872 # XXX If need be, this block can be easily removed in 

1873 # the current recursive logic and the result is one sense entry 

1874 # with both glosses in the glosses list, as you would 

1875 # expect. If the higher entry has examples, there will 

1876 # be a higher entry with some duplicated data. 

1877 if len(subentries) == 1: 

1878 slc = subentries[0].children 

1879 if len(slc) == 1: 

1880 # copy current node and modify it so it doesn't 

1881 # loop infinitely. 

1882 cropped_node = copy.copy(node) 

1883 cropped_node.children = [ 

1884 x 

1885 for x in children 

1886 if not ( 

1887 isinstance(x, WikiNode) 

1888 and x.kind == NodeKind.LIST 

1889 and x.sarg == current_depth + "#" 

1890 ) 

1891 ] 

1892 added |= parse_sense_node(cropped_node, sense_base, pos) 

1893 nonlocal sense_data # kludge: raw_glosses data gets 

1894 # duplicated if this is not done; 

1895 # if the top-level (cropped_node) 

1896 # does not push_sense() properly or 

1897 # parse_sense_node() returns early, 

1898 # sense_data is not reset. This happens 

1899 # for example when you have a no-gloss 

1900 # string like "(intransitive)": 

1901 # no gloss, push_sense() returns early 

1902 # and sense_data has duplicate data with 

1903 # sense_base 

1904 sense_data = {} 

1905 added |= parse_sense_node(slc[0], sense_base, pos) 

1906 return added 

1907 

1908 return process_gloss_contents( 

1909 contents, 

1910 pos, 

1911 sense_base, 

1912 subentries, 

1913 others, 

1914 gloss_template_args, 

1915 added, 

1916 my_ordinal, 

1917 ) 

1918 

1919 def process_gloss_contents( 

1920 contents: list[Union[str, WikiNode]], 

1921 pos: str, 

1922 sense_base: SenseData, 

1923 subentries: list[WikiNode] = [], 

1924 others: list[WikiNode] = [], 

1925 gloss_template_args: Set[str] = set(), 

1926 added: bool = False, 

1927 sorting_ordinal: int | None = None, 

1928 ) -> bool: 

1929 def sense_template_fn( 

1930 name: str, ht: TemplateArgs, is_gloss: bool = False 

1931 ) -> Optional[str]: 

1932 # print(f"sense_template_fn: {name}, {ht}") 

1933 if name in wikipedia_templates: 

1934 # parse_wikipedia_template(wxr, pos_data, ht) 

1935 return None 

1936 if is_panel_template(wxr, name): 

1937 return "" 

1938 if name in INFO_TEMPLATE_FUNCS: 

1939 info_data, info_exp = parse_info_template_arguments( 

1940 wxr, name, ht, "sense" 

1941 ) 

1942 if info_data or info_exp: 1942 ↛ 1948line 1942 didn't jump to line 1948 because the condition on line 1942 was always true

1943 if info_data: 1943 ↛ 1945line 1943 didn't jump to line 1945 because the condition on line 1943 was always true

1944 data_append(sense_base, "info_templates", info_data) 

1945 if info_exp and isinstance(info_exp, str): 1945 ↛ 1947line 1945 didn't jump to line 1947 because the condition on line 1945 was always true

1946 return info_exp 

1947 return "" 

1948 if name in ("defdate",): 

1949 date = clean_node(wxr, None, ht.get(1, ())) 

1950 if part_two := ht.get(2): 1950 ↛ 1952line 1950 didn't jump to line 1952 because the condition on line 1950 was never true

1951 # Unicode en dash, not '-' 

1952 date += "–" + clean_node(wxr, None, part_two) 

1953 refs: dict[str, ReferenceData] = {} 

1954 # ref, refn, ref2, ref2n, ref3, ref3n 

1955 # ref1 not valid 

1956 for k, v in sorted( 

1957 (k, v) for k, v in ht.items() if isinstance(k, str) 

1958 ): 

1959 if m := re.match(r"ref(\d?)(n?)", k): 1959 ↛ 1956line 1959 didn't jump to line 1956 because the condition on line 1959 was always true

1960 ref_v = clean_node(wxr, None, v) 

1961 if m.group(1) not in refs: # empty string or digit 

1962 refs[m.group(1)] = ReferenceData() 

1963 if m.group(2): 

1964 refs[m.group(1)]["refn"] = ref_v 

1965 else: 

1966 refs[m.group(1)]["text"] = ref_v 

1967 data_append( 

1968 sense_base, 

1969 "attestations", 

1970 AttestationData(date=date, references=list(refs.values())), 

1971 ) 

1972 return "" 
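# Illustrative example for the defdate handling above (the reference text
# is hypothetical): {{defdate|1997|ref=some citation}} yields date == "1997"
# and one ReferenceData whose "text" field is the cleaned ref argument; a
# "refn" / "ref2n" style parameter would instead fill the "refn" field of
# the corresponding reference.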

1973 if name == "senseid": 

1974 langid = clean_node(wxr, None, ht.get(1, ())) 

1975 arg = clean_node(wxr, sense_base, ht.get(2, ())) 

1976 if re.match(r"Q\d+$", arg): 

1977 data_append(sense_base, "wikidata", arg) 

1978 data_append(sense_base, "senseid", langid + ":" + arg) 

1979 if name in sense_linkage_templates: 

1980 # print(f"SENSE_TEMPLATE_FN: {name}") 

1981 parse_sense_linkage(wxr, sense_base, name, ht, pos) 

1982 return "" 

1983 if name == "†" or name == "zh-obsolete": 

1984 data_append(sense_base, "tags", "obsolete") 

1985 return "" 

1986 if name in { 

1987 "ux", 

1988 "uxi", 

1989 "usex", 

1990 "afex", 

1991 "prefixusex", 

1992 "ko-usex", 

1993 "ko-x", 

1994 "hi-x", 

1995 "ja-usex-inline", 

1996 "ja-x", 

1997 "quotei", 

1998 "he-x", 

1999 "hi-x", 

2000 "km-x", 

2001 "ne-x", 

2002 "shn-x", 

2003 "th-x", 

2004 "ur-x", 

2005 }: 

2006 # Usage examples are captured separately below. We don't 

2007 # want to expand them into glosses even when unusual coding 

2008 # is used in the entry. 

2009 # These templates may slip through inside another item, but 

2010 # currently we're separating out example entries (..#:) 

2011 # well enough that there seems to be very little contamination. 

2012 if is_gloss: 

2013 wxr.wtp.wiki_notice( 

2014 "Example template is used for gloss text", 

2015 sortid="extractor.en.page.sense_template_fn/1415", 

2016 ) 

2017 else: 

2018 return "" 

2019 if name == "w": 2019 ↛ 2020line 2019 didn't jump to line 2020 because the condition on line 2019 was never true

2020 if ht.get(2) == "Wp": 

2021 return "" 

2022 for v in ht.values(): 

2023 v = v.strip() 

2024 if v and "<" not in v: 

2025 gloss_template_args.add(v) 

2026 return None 

2027 

2028 def extract_link_texts(item: GeneralNode) -> None: 

2029 """Recursively extracts link texts from the gloss source. This 

2030 information is used to decide whether to remove a final "." from 

2031 form_of/alt_of (e.g., ihm/Hunsrik).""" 

2032 if isinstance(item, (list, tuple)): 

2033 for x in item: 

2034 extract_link_texts(x) 

2035 return 

2036 if isinstance(item, str): 

2037 # There seem to be HTML sections that may further contain 

2038 # unparsed links. 

2039 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 2039 ↛ 2040line 2039 didn't jump to line 2040 because the loop on line 2039 never started

2040 # print("ITER:", m.group(0)) 

2041 v = m.group(1).split("|")[-1].strip() 

2042 if v: 

2043 gloss_template_args.add(v) 

2044 return 

2045 if not isinstance(item, WikiNode): 2045 ↛ 2046line 2045 didn't jump to line 2046 because the condition on line 2045 was never true

2046 return 

2047 if item.kind == NodeKind.LINK: 

2048 v = item.largs[-1] 

2049 if ( 2049 ↛ 2055line 2049 didn't jump to line 2055 because the condition on line 2049 was always true

2050 isinstance(v, list) 

2051 and len(v) == 1 

2052 and isinstance(v[0], str) 

2053 ): 

2054 gloss_template_args.add(v[0].strip()) 

2055 for x in item.children: 

2056 extract_link_texts(x) 

2057 

2058 extract_link_texts(contents) 

2059 

2060 # get the raw text of non-list contents of this node, and other stuff 

2061 # like tag and category data added to sense_base 

2062 # cast = no-op type-setter for the type-checker 

2063 partial_template_fn = cast( 

2064 TemplateFnCallable, 

2065 partial(sense_template_fn, is_gloss=True), 

2066 ) 

2067 rawgloss = clean_node( 

2068 wxr, 

2069 sense_base, 

2070 contents, 

2071 template_fn=partial_template_fn, 

2072 collect_links=True, 

2073 ) 

2074 

2075 if not rawgloss: 2075 ↛ 2076line 2075 didn't jump to line 2076 because the condition on line 2075 was never true

2076 return False 

2077 

2078 # remove manually typed ordered list text at the start ("1. ") 

2079 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 

2080 

2081 # get stuff like synonyms and categories from "others", 

2082 # maybe examples and quotations 

2083 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

2084 

2085 # The gloss could contain templates that produce more list items. 

2086 # This happens commonly with, e.g., {{inflection of|...}}. Split 

2087 # to parts. However, e.g. Interlingua generates multiple glosses 

2088 # in HTML directly without Wikitext markup, so we must also split 

2089 # by just newlines. 

2090 subglosses = rawgloss.splitlines() 

2091 

2092 if len(subglosses) == 0: 2092 ↛ 2093line 2092 didn't jump to line 2093 because the condition on line 2092 was never true

2093 return False 

2094 

2095 if any(s.startswith("#") for s in subglosses): 

2096 subtree = wxr.wtp.parse(rawgloss) 

2097 # from wikitextprocessor.parser import print_tree 

2098 # print("SUBTREE GENERATED BY TEMPLATE:") 

2099 # print_tree(subtree) 

2100 new_subentries = [ 

2101 x 

2102 for x in subtree.children 

2103 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2104 ] 

2105 

2106 new_others = [ 

2107 x 

2108 for x in subtree.children 

2109 if isinstance(x, WikiNode) 

2110 and x.kind == NodeKind.LIST 

2111 and not x.sarg.endswith("#") 

2112 ] 

2113 

2114 new_contents = [ 

2115 clean_node(wxr, [], x) 

2116 for x in subtree.children 

2117 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2118 ] 

2119 

2120 subentries = subentries or new_subentries 

2121 others = others or new_others 

2122 subglosses = new_contents 

2123 rawgloss = "".join(subglosses) 

2124 # Generate no gloss for translation hub pages, but add the 

2125 # "translation-hub" tag for them 

2126 if rawgloss == "(This entry is a translation hub.)": 2126 ↛ 2127line 2126 didn't jump to line 2127 because the condition on line 2126 was never true

2127 data_append(sense_data, "tags", "translation-hub") 

2128 return push_sense(sorting_ordinal) 

2129 

2130 # Remove certain substrings specific to outer glosses 

2131 strip_ends = [", particularly:"] 

2132 for x in strip_ends: 

2133 if rawgloss.endswith(x): 

2134 rawgloss = rawgloss[: -len(x)].strip() 

2135 break 

2136 

2137 # A single gloss, or possibly an outer gloss. 

2138 # Check if the possible outer gloss starts with 

2139 # parenthesized tags/topics 

2140 

2141 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 

2142 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

2143 m = QUALIFIERS_RE.match(rawgloss) 

2144 # (...): ... or (...(...)...): ... 

2145 if m: 

2146 q = m.group(1) 

2147 rawgloss = rawgloss[m.end() :].strip() 

2148 parse_sense_qualifier(wxr, q, sense_base) 

2149 if rawgloss == "A pejorative:": 2149 ↛ 2150line 2149 didn't jump to line 2150 because the condition on line 2149 was never true

2150 data_append(sense_base, "tags", "pejorative") 

2151 rawgloss = "" 

2152 elif rawgloss == "Short forms.": 2152 ↛ 2153line 2152 didn't jump to line 2153 because the condition on line 2152 was never true

2153 data_append(sense_base, "tags", "abbreviation") 

2154 rawgloss = "" 

2155 elif rawgloss == "Technical or specialized senses.": 2155 ↛ 2156line 2155 didn't jump to line 2156 because the condition on line 2155 was never true

2156 rawgloss = "" 

2157 elif rawgloss.startswith("inflection of "): 

2158 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

2159 if parsed is not None: 2159 ↛ 2168line 2159 didn't jump to line 2168 because the condition on line 2159 was always true

2160 tags, origins = parsed 

2161 if origins is not None: 2161 ↛ 2163line 2161 didn't jump to line 2163 because the condition on line 2161 was always true

2162 data_extend(sense_base, "form_of", origins) 

2163 if tags is not None: 2163 ↛ 2166line 2163 didn't jump to line 2166 because the condition on line 2163 was always true

2164 data_extend(sense_base, "tags", tags) 

2165 else: 

2166 data_append(sense_base, "tags", "form-of") 

2167 else: 

2168 data_append(sense_base, "tags", "form-of") 

2169 if rawgloss: 2169 ↛ 2200line 2169 didn't jump to line 2200 because the condition on line 2169 was always true

2170 # This code duplicates a lot of clean-up operations from later in 

2171 # this block. We want to clean up the "supergloss" as much as 

2172 # possible, in almost the same way as a normal gloss. 

2173 supergloss = rawgloss 

2174 

2175 if supergloss.startswith("; "): 2175 ↛ 2176line 2175 didn't jump to line 2176 because the condition on line 2175 was never true

2176 supergloss = supergloss[1:].strip() 

2177 

2178 if supergloss.startswith(("^†", "†")): 

2179 data_append(sense_base, "tags", "obsolete") 

2180 supergloss = supergloss[2:].strip() 

2181 elif supergloss.startswith("^‡"): 2181 ↛ 2182line 2181 didn't jump to line 2182 because the condition on line 2181 was never true

2182 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

2183 supergloss = supergloss[2:].strip() 

2184 

2185 # remove [14th century...] style brackets at the end 

2186 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 

2187 

2188 if supergloss.startswith((",", ":")): 

2189 supergloss = supergloss[1:] 

2190 supergloss = supergloss.strip() 

2191 if supergloss.startswith("N. of "): 2191 ↛ 2192line 2191 didn't jump to line 2192 because the condition on line 2191 was never true

2192 supergloss = "Name of " + supergloss[6:] 

2193 supergloss = supergloss[2:] 

2194 data_append(sense_base, "glosses", supergloss) 

2195 if supergloss in ("A person:",): 

2196 data_append(sense_base, "tags", "g-person") 

2197 

2198 # The main recursive call (except for the exceptions at the 

2199 # start of this function). 

2200 for sublist in subentries: 

2201 if not ( 2201 ↛ 2204line 2201 didn't jump to line 2204 because the condition on line 2201 was never true

2202 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

2203 ): 

2204 wxr.wtp.debug( 

2205 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

2206 f" with items that are not LISTs", 

2207 sortid="page/1511/20230119", 

2208 ) 

2209 continue 

2210 for item in sublist.children: 

2211 if not ( 2211 ↛ 2215line 2211 didn't jump to line 2215 because the condition on line 2211 was never true

2212 isinstance(item, WikiNode) 

2213 and item.kind == NodeKind.LIST_ITEM 

2214 ): 

2215 continue 

2216 # copy sense_base to prevent cross-contamination between 

2217 # sibling subglosses, and between subglosses and superglosses 

2218 sense_base2 = copy.deepcopy(sense_base) 

2219 if parse_sense_node(item, sense_base2, pos): 2219 ↛ 2210line 2219 didn't jump to line 2210 because the condition on line 2219 was always true

2220 added = True 

2221 

2222 # Capture examples. 

2223 # This is called after the recursive calls above so that 

2224 # sense_base is not contaminated with meta-data from 

2225 # example entries for *this* gloss. 

2226 examples = [] 

2227 if wxr.config.capture_examples: 2227 ↛ 2231line 2227 didn't jump to line 2231 because the condition on line 2227 was always true

2228 examples = extract_examples(others, sense_base) 

2229 

2230 # push_sense() succeeded somewhere down-river, so skip this level 

2231 if added: 

2232 if examples: 

2233 # this higher-up gloss has examples that we do not want to skip 

2234 wxr.wtp.debug( 

2235 "'{}[...]' gloss has examples we want to keep, " 

2236 "but there are subglosses.".format(repr(rawgloss[:30])), 

2237 sortid="page/1498/20230118", 

2238 ) 

2239 else: 

2240 return True 

2241 

2242 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2243 # after the gloss, but these sentences don't seem to be intended 

2244 # as glosses. Skip them. 

2245 indexed_subglosses = list( 

2246 (i, gl) 

2247 for i, gl in enumerate(subglosses) 

2248 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2249 ) 

2250 

2251 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2251 ↛ 2252line 2251 didn't jump to line 2252 because the condition on line 2251 was never true

2252 gl = indexed_subglosses[0][1].strip() 

2253 if gl.endswith(":"): 

2254 gl = gl[:-1].strip() 

2255 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2256 if parsed is not None: 

2257 infl_tags, infl_dts = parsed 

2258 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2259 # Interpret others as a particular form under 

2260 # "inflection of" 

2261 data_extend(sense_base, "tags", infl_tags) 

2262 data_extend(sense_base, "form_of", infl_dts) 

2263 indexed_subglosses = indexed_subglosses[1:] 

2264 elif not infl_dts: 

2265 data_extend(sense_base, "tags", infl_tags) 

2266 indexed_subglosses = indexed_subglosses[1:] 

2267 

2268 # Create senses for remaining subglosses 

2269 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2270 gloss = gloss.strip() 

2271 if not gloss and len(indexed_subglosses) > 1: 2271 ↛ 2272line 2271 didn't jump to line 2272 because the condition on line 2271 was never true

2272 continue 

2273 # Push a new sense (if the last one is not empty) 

2274 if push_sense(sorting_ordinal): 2274 ↛ 2275line 2274 didn't jump to line 2275 because the condition on line 2274 was never true

2275 added = True 

2276 # if gloss not in sense_data.get("raw_glosses", ()): 

2277 # data_append(sense_data, "raw_glosses", gloss) 

2278 if i == 0 and examples: 

2279 # In a multi-line gloss, associate examples 

2280 # with only one of them. 

2281 # XXX or you could use gloss_i == len(indexed_subglosses) 

2282 # to associate examples with the *last* one. 

2283 data_extend(sense_data, "examples", examples) 

2284 if gloss.startswith("; ") and gloss_i > 0: 2284 ↛ 2285line 2284 didn't jump to line 2285 because the condition on line 2284 was never true

2285 gloss = gloss[1:].strip() 

2286 # If the gloss starts with †, mark as obsolete 

2287 if gloss.startswith("^†"): 2287 ↛ 2288line 2287 didn't jump to line 2288 because the condition on line 2287 was never true

2288 data_append(sense_data, "tags", "obsolete") 

2289 gloss = gloss[2:].strip() 

2290 elif gloss.startswith("^‡"): 2290 ↛ 2291line 2290 didn't jump to line 2291 because the condition on line 2290 was never true

2291 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2292 gloss = gloss[2:].strip() 

2293 # Copy data for all senses to this sense 

2294 for k, v in sense_base.items(): 

2295 if isinstance(v, (list, tuple)): 

2296 if k != "tags": 

2297 # Tags handled below (countable/uncountable special) 

2298 data_extend(sense_data, k, v) 

2299 else: 

2300 assert k not in ("tags", "categories", "topics") 

2301 sense_data[k] = v # type:ignore[literal-required] 

2302 # Parse the gloss for this particular sense 

2303 m = QUALIFIERS_RE.match(gloss) 

2304 # (...): ... or (...(...)...): ... 

2305 if m: 

2306 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2307 gloss = gloss[m.end() :].strip() 

2308 

2309 # Remove common suffix "[from 14th c.]" and similar 

2310 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2311 

2312 # Check to make sure we don't have unhandled list items in gloss 

2313 ofs = max(gloss.find("#"), gloss.find("* ")) 

2314 if ofs > 10 and "(#)" not in gloss: 

2315 wxr.wtp.debug( 

2316 "gloss may contain unhandled list items: {}".format(gloss), 

2317 sortid="page/1412", 

2318 ) 

2319 elif "\n" in gloss: 2319 ↛ 2320line 2319 didn't jump to line 2320 because the condition on line 2319 was never true

2320 wxr.wtp.debug( 

2321 "gloss contains newline: {}".format(gloss), 

2322 sortid="page/1416", 

2323 ) 

2324 

2325 # Kludge, some glosses have a comma after initial qualifiers in 

2326 # parentheses 

2327 if gloss.startswith((",", ":")): 

2328 gloss = gloss[1:] 

2329 gloss = gloss.strip() 

2330 if gloss.endswith(":"): 

2331 gloss = gloss[:-1].strip() 

2332 if gloss.startswith("N. of "): 2332 ↛ 2333line 2332 didn't jump to line 2333 because the condition on line 2332 was never true

2333 gloss = "Name of " + gloss[6:] 

2334 if gloss.startswith("†"): 2334 ↛ 2335line 2334 didn't jump to line 2335 because the condition on line 2334 was never true

2335 data_append(sense_data, "tags", "obsolete") 

2336 gloss = gloss[1:] 

2337 elif gloss.startswith("^†"): 2337 ↛ 2338line 2337 didn't jump to line 2338 because the condition on line 2337 was never true

2338 data_append(sense_data, "tags", "obsolete") 

2339 gloss = gloss[2:] 

2340 

2341 # Copy tags from sense_base if any. This will not copy 

2342 # countable/uncountable if either was specified in the sense, 

2343 # as sometimes both are specified in word head but only one 

2344 # in individual senses. 
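# Illustrative example for the countability handling below: if the word
# head supplied both "countable" and "uncountable" (so both sit in
# sense_base) but this particular sense already carries "uncountable",
# neither of the head's countability tags is copied, keeping the
# sense-specific one.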

2345 countability_tags = [] 

2346 base_tags = sense_base.get("tags", ()) 

2347 sense_tags = sense_data.get("tags", ()) 

2348 for tag in base_tags: 

2349 if tag in ("countable", "uncountable"): 

2350 if tag not in countability_tags: 2350 ↛ 2352line 2350 didn't jump to line 2352 because the condition on line 2350 was always true

2351 countability_tags.append(tag) 

2352 continue 

2353 if tag not in sense_tags: 

2354 data_append(sense_data, "tags", tag) 

2355 if countability_tags: 

2356 if ( 2356 ↛ 2365line 2356 didn't jump to line 2365 because the condition on line 2356 was always true

2357 "countable" not in sense_tags 

2358 and "uncountable" not in sense_tags 

2359 ): 

2360 data_extend(sense_data, "tags", countability_tags) 

2361 

2362 # If outer gloss specifies a form-of ("inflection of", see 

2363 # aquamarine/German), try to parse the inner glosses as 

2364 # tags for an inflected form. 

2365 if "form-of" in sense_base.get("tags", ()): 

2366 parsed = parse_alt_or_inflection_of( 

2367 wxr, gloss, gloss_template_args 

2368 ) 

2369 if parsed is not None: 2369 ↛ 2375line 2369 didn't jump to line 2375 because the condition on line 2369 was always true

2370 infl_tags, infl_dts = parsed 

2371 if not infl_dts and infl_tags: 2371 ↛ 2375line 2371 didn't jump to line 2375 because the condition on line 2371 was always true

2372 # Interpret as a particular form under "inflection of" 

2373 data_extend(sense_data, "tags", infl_tags) 

2374 

2375 if not gloss: 2375 ↛ 2376line 2375 didn't jump to line 2376 because the condition on line 2375 was never true

2376 data_append(sense_data, "tags", "empty-gloss") 

2377 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 

2378 if ( 2378 ↛ 2389line 2378 didn't jump to line 2389 because the condition on line 2378 was always true

2379 gloss_i == 0 

2380 and len(sense_data.get("glosses", tuple())) >= 1 

2381 ): 

2382 # If we added a "high-level gloss" from rawgloss, but this 

2383 # is that same gloss_i, add this instead of the raw_gloss 

2384 # from before if they're different: the rawgloss was not 

2385 # cleaned exactly the same as this later gloss 

2386 sense_data["glosses"][-1] = gloss 

2387 else: 

2388 # Add the gloss for the sense. 

2389 data_append(sense_data, "glosses", gloss) 

2390 

2391 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2392 # two abbreviations in the same sense, both generated by the 

2393 # {{abbreviation of|...}} template. Handle these with some magic. 

2394 position = 0 

2395 split_glosses = [] 

2396 for m in re.finditer(r"Abbreviation of ", gloss): 

2397 if m.start() != position: 2397 ↛ 2396line 2397 didn't jump to line 2396 because the condition on line 2397 was always true

2398 split_glosses.append(gloss[position : m.start()]) 

2399 position = m.start() 

2400 split_glosses.append(gloss[position:]) 
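# Illustrative example for the split above (placeholder words): a gloss
# such as "Abbreviation of foo. Abbreviation of bar." is split into
# ["Abbreviation of foo. ", "Abbreviation of bar."], so each abbreviation
# then gets its own alt_of/form_of parse in the loop below.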

2401 for gloss in split_glosses: 

2402 # Check if this gloss describes an alt-of or inflection-of 

2403 if ( 

2404 lang_code != "en" 

2405 and " " not in gloss 

2406 and distw([word], gloss) < 0.3 

2407 ): 

2408 # Don't try to parse gloss if it is one word 

2409 # that is close to the word itself for non-English words 

2410 # (probable translations of a tag/form name) 

2411 continue 

2412 parsed = parse_alt_or_inflection_of( 

2413 wxr, gloss, gloss_template_args 

2414 ) 

2415 if parsed is None: 

2416 continue 

2417 tags, dts = parsed 

2418 if not dts and tags: 

2419 data_extend(sense_data, "tags", tags) 

2420 continue 

2421 for dt in dts: # type:ignore[union-attr] 

2422 ftags = list(tag for tag in tags if tag != "form-of") 

2423 if "alt-of" in tags: 

2424 data_extend(sense_data, "tags", ftags) 

2425 data_append(sense_data, "alt_of", dt) 

2426 elif "compound-of" in tags: 2426 ↛ 2427line 2426 didn't jump to line 2427 because the condition on line 2426 was never true

2427 data_extend(sense_data, "tags", ftags) 

2428 data_append(sense_data, "compound_of", dt) 

2429 elif "synonym-of" in tags: 2429 ↛ 2430line 2429 didn't jump to line 2430 because the condition on line 2429 was never true

2430 data_extend(dt, "tags", ftags) 

2431 data_append(sense_data, "synonyms", dt) 

2432 elif tags and dt.get("word", "").startswith("of "): 2432 ↛ 2433line 2432 didn't jump to line 2433 because the condition on line 2432 was never true

2433 dt["word"] = dt["word"][3:] 

2434 data_append(sense_data, "tags", "form-of") 

2435 data_extend(sense_data, "tags", ftags) 

2436 data_append(sense_data, "form_of", dt) 

2437 elif "form-of" in tags: 2437 ↛ 2421line 2437 didn't jump to line 2421 because the condition on line 2437 was always true

2438 data_extend(sense_data, "tags", tags) 

2439 data_append(sense_data, "form_of", dt) 

2440 

2441 if len(sense_data) == 0: 

2442 if len(sense_base.get("tags", [])) == 0: 2442 ↛ 2444line 2442 didn't jump to line 2444 because the condition on line 2442 was always true

2443 del sense_base["tags"] 

2444 sense_data.update(sense_base) 

2445 if push_sense(sorting_ordinal): 2445 ↛ 2449line 2445 didn't jump to line 2449 because the condition on line 2445 was always true

2446 # push_sense succeeded in adding a sense to pos_data 

2447 added = True 

2448 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2449 return added 

2450 

2451 def parse_inflection( 

2452 node: WikiNode, section: str, pos: Optional[str] 

2453 ) -> None: 

2454 """Parses inflection data (declension, conjugation) from the given 

2455 page. This retrieves the actual inflection template 

2456 parameters, which are very useful for applications that need 

2457 to learn the inflection classes and generate inflected 

2458 forms.""" 

2459 assert isinstance(node, WikiNode) 

2460 assert isinstance(section, str) 

2461 assert pos is None or isinstance(pos, str) 

2462 # print("parse_inflection:", node) 

2463 

2464 if pos is None: 2464 ↛ 2465line 2464 didn't jump to line 2465 because the condition on line 2464 was never true

2465 wxr.wtp.debug( 

2466 "inflection table outside part-of-speech", sortid="page/1812" 

2467 ) 

2468 return 

2469 

2470 def inflection_template_fn( 

2471 name: str, ht: TemplateArgs 

2472 ) -> Optional[str]: 

2473 # print("decl_conj_template_fn", name, ht) 

2474 if is_panel_template(wxr, name): 2474 ↛ 2475line 2474 didn't jump to line 2475 because the condition on line 2474 was never true

2475 return "" 

2476 if name in ("is-u-mutation",): 2476 ↛ 2479line 2476 didn't jump to line 2479 because the condition on line 2476 was never true

2477 # These are not to be captured, as an exception to the 

2478 # generic code below 

2479 return None 

2480 m = re.search( 

2481 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2482 r"declension|inflection|mut|mutation)($|-)", 

2483 name, 

2484 ) 

2485 if m: 

2486 args_ht = clean_template_args(wxr, ht) 

2487 dt = {"name": name, "args": args_ht} 

2488 data_append(pos_data, "inflection_templates", dt) 

2489 

2490 return None 

2491 

2492 # Convert the subtree back to Wikitext, then expand all and parse, 

2493 # capturing templates in the process 

2494 text = wxr.wtp.node_to_wikitext(node.children) 

2495 

2496 # Split text into separate sections for each top-level template 

2497 brace_matches = re.split(r"((?:^|\n)\s*{\||\n\s*\|}|{{+|}}+)", text) 

2498 # ["{{", "template", "}}"] or ["^{|", "table contents", "\n|}"] 

2499 # The (?:...) creates a non-capturing regex group; if it were capturing, 

2500 # like the group around it, it would create elements in brace_matches, 

2501 # including None if it doesn't match. 

2502 # 20250114: Added {| and |} into the regex because tables were being 

2503 # cut into pieces by this code. Issue #973, introduction of two-part 

2504 # book-end templates similar to trans-top and trans-bottom. 
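# Illustrative example of the split above (template names are placeholders):
# for text like "{{xx-conj|a}}\nnotes\n{{xx-decl|b}}", re.split() returns
# pieces such as
# ["", "{{", "xx-conj|a", "}}", "\nnotes\n", "{{", "xx-decl|b", "}}", ""],
# which the loop below regroups into one section per top-level template
# or table.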

2505 template_sections = [] 

2506 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2507 # Because there is the possibility of triple curly braces 

2508 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2509 # count nesting depth using pairs of two brackets, but 

2510 # instead use singular braces ("{ }"). 

2511 # Because template delimiters should be balanced, regardless 

2512 # of whether {{ or {{{ is used, and because we only care 

2513 # about the outer-most delimiters (the highest level template) 

2514 # we can just count the single braces when those single 

2515 # braces are part of a group. 

2516 table_nesting = 0 

2517 # However, a stray table ({| ... |}) should always 

2518 # be its own section, and should prevent templates from cutting it 

2519 # into sections. 

2520 

2521 # print(f"Parse inflection: {text=}") 

2522 # print(f"Brace matches: {repr('///'.join(brace_matches))}") 

2523 if len(brace_matches) > 1: 

2524 tsection: list[str] = [] 

2525 after_templates = False # kludge to keep any text 

2526 # before first template 

2527 # with the first template; 

2528 # otherwise, text 

2529 # goes with preceding template 

2530 for m in brace_matches: 

2531 if m.startswith("\n; ") and after_templates: 2531 ↛ 2532line 2531 didn't jump to line 2532 because the condition on line 2531 was never true

2532 after_templates = False 

2533 template_sections.append(tsection) 

2534 tsection = [] 

2535 tsection.append(m) 

2536 elif m.startswith("{{") or m.endswith("{|"): 

2537 if ( 

2538 template_nesting == 0 

2539 and after_templates 

2540 and table_nesting == 0 

2541 ): 

2542 template_sections.append(tsection) 

2543 tsection = [] 

2544 # start new section 

2545 after_templates = True 

2546 if m.startswith("{{"): 

2547 template_nesting += 1 

2548 else: 

2549 # m.endswith("{|") 

2550 table_nesting += 1 

2551 tsection.append(m) 

2552 elif m.startswith("}}") or m.endswith("|}"): 

2553 if m.startswith("}}"): 

2554 template_nesting -= 1 

2555 if template_nesting < 0: 2555 ↛ 2556line 2555 didn't jump to line 2556 because the condition on line 2555 was never true

2556 wxr.wtp.error( 

2557 "Negatively nested braces, " 

2558 "couldn't split inflection templates, " 

2559 "{}/{} section {}".format( 

2560 word, language, section 

2561 ), 

2562 sortid="page/1871", 

2563 ) 

2564 template_sections = [] # use whole text 

2565 break 

2566 else: 

2567 table_nesting -= 1 

2568 if table_nesting < 0: 2568 ↛ 2569line 2568 didn't jump to line 2569 because the condition on line 2568 was never true

2569 wxr.wtp.error( 

2570 "Negatively nested table braces, " 

2571 "couldn't split inflection section, " 

2572 "{}/{} section {}".format( 

2573 word, language, section 

2574 ), 

2575 sortid="page/20250114", 

2576 ) 

2577 template_sections = [] # use whole text 

2578 break 

2579 tsection.append(m) 

2580 else: 

2581 tsection.append(m) 

2582 if tsection: # dangling tsection 2582 ↛ 2590line 2582 didn't jump to line 2590 because the condition on line 2582 was always true

2583 template_sections.append(tsection) 

2584 # Why do it this way around? The parser has a preference 

2585 # to associate bits outside of templates with the preceding 

2586 # template (the `after_templates` variable), so a new tsection begins 

2587 # at {{ and everything before it belongs to the previous 

2588 # template. 

2589 

2590 texts = [] 

2591 if not template_sections: 

2592 texts = [text] 

2593 else: 

2594 for tsection in template_sections: 

2595 texts.append("".join(tsection)) 

2596 if template_nesting != 0: 2596 ↛ 2597line 2596 didn't jump to line 2597 because the condition on line 2596 was never true

2597 wxr.wtp.error( 

2598 "Template nesting error: " 

2599 "template_nesting = {} " 

2600 "couldn't split inflection templates, " 

2601 "{}/{} section {}".format( 

2602 template_nesting, word, language, section 

2603 ), 

2604 sortid="page/1896", 

2605 ) 

2606 texts = [text] 

2607 for text in texts: 

2608 tree = wxr.wtp.parse( 

2609 text, expand_all=True, template_fn=inflection_template_fn 

2610 ) 

2611 

2612 if not text.strip(): 

2613 continue 

2614 

2615 # Parse inflection tables from the section. The data is stored 

2616 # under "forms". 

2617 if wxr.config.capture_inflections: 2617 ↛ 2607line 2617 didn't jump to line 2607 because the condition on line 2617 was always true

2618 tablecontext = None 

2619 m = re.search(r"{{([^}{|]+)\|?", text) 

2620 if m: 

2621 template_name = m.group(1) 

2622 tablecontext = TableContext(template_name) 

2623 

2624 parse_inflection_section( 

2625 wxr, 

2626 pos_data, 

2627 word, 

2628 language, 

2629 pos, 

2630 section, 

2631 tree, 

2632 tablecontext=tablecontext, 

2633 ) 

2634 

2635 def get_subpage_section( 

2636 title: str, subtitle: str, seqs: list[Union[list[str], tuple[str, ...]]] 

2637 ) -> Optional[Union[WikiNode, str]]: 

2638 """Loads a subpage of the given page, and finds the section 

2639 for the given language, part-of-speech, and section title. This 

2640 is used for finding translations and other sections on subpages.""" 
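# Illustrative example: callers below pass seqs such as
# [[language, "Noun", TRANSLATIONS_TITLE]], i.e. each seq is a chain of
# section headings that recurse() follows (case-insensitively) down the
# parsed word/<subtitle> subpage until the wanted section node is found.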

2641 assert isinstance(language, str) 

2642 assert isinstance(title, str) 

2643 assert isinstance(subtitle, str) 

2644 assert isinstance(seqs, (list, tuple)) 

2645 for seq in seqs: 

2646 for x in seq: 

2647 assert isinstance(x, str) 

2648 subpage_title = word + "/" + subtitle 

2649 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2650 if subpage_content is None: 

2651 wxr.wtp.error( 

2652 "/translations not found despite " 

2653 "{{see translation subpage|...}}", 

2654 sortid="page/1934", 

2655 ) 

2656 return None 

2657 

2658 def recurse( 

2659 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2660 ) -> Optional[Union[str, WikiNode]]: 

2661 # print(f"seq: {seq}") 

2662 if not seq: 

2663 return node 

2664 if not isinstance(node, WikiNode): 

2665 return None 

2666 # print(f"node.kind: {node.kind}") 

2667 if node.kind in LEVEL_KINDS: 

2668 t = clean_node(wxr, None, node.largs[0]) 

2669 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2670 if t.lower() == seq[0].lower(): 

2671 seq = seq[1:] 

2672 if not seq: 

2673 return node 

2674 for n in node.children: 

2675 ret = recurse(n, seq) 

2676 if ret is not None: 

2677 return ret 

2678 return None 

2679 

2680 tree = wxr.wtp.parse( 

2681 subpage_content, 

2682 pre_expand=True, 

2683 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2684 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2685 ) 

2686 assert tree.kind == NodeKind.ROOT 

2687 for seq in seqs: 

2688 ret = recurse(tree, seq) 

2689 if ret is None: 

2690 wxr.wtp.debug( 

2691 "Failed to find subpage section {}/{} seq {}".format( 

2692 title, subtitle, seq 

2693 ), 

2694 sortid="page/1963", 

2695 ) 

2696 return ret 

2697 

2698 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

2699 """Parses translations for a word. This may also pull in translations 

2700 from separate translation subpages.""" 

2701 assert isinstance(data, dict) 

2702 assert isinstance(xlatnode, WikiNode) 

2703 # print("===== PARSE_TRANSLATIONS {} {} {}" 

2704 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

2705 # print("parse_translations xlatnode={}".format(xlatnode)) 

2706 if not wxr.config.capture_translations: 2706 ↛ 2707line 2706 didn't jump to line 2707 because the condition on line 2706 was never true

2707 return 

2708 sense_parts: list[Union[WikiNode, str]] = [] 

2709 sense: Optional[str] = None 

2710 

2711 def parse_translation_item( 

2712 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

2713 ) -> None: 

2714 nonlocal sense 

2715 assert isinstance(contents, list) 

2716 assert lang is None or isinstance(lang, str) 

2717 # print("PARSE_TRANSLATION_ITEM:", contents) 

2718 

2719 langcode: Optional[str] = None 

2720 if sense is None: 

2721 sense = clean_node(wxr, data, sense_parts).strip() 

2722 # print("sense <- clean_node: ", sense) 

2723 idx = sense.find("See also translations at") 

2724 if idx > 0: 2724 ↛ 2725line 2724 didn't jump to line 2725 because the condition on line 2724 was never true

2725 wxr.wtp.debug( 

2726 "Skipping translation see also: {}".format(sense), 

2727 sortid="page/2361", 

2728 ) 

2729 sense = sense[:idx].strip() 

2730 if sense.endswith(":"): 2730 ↛ 2731line 2730 didn't jump to line 2731 because the condition on line 2730 was never true

2731 sense = sense[:-1].strip() 

2732 if sense.endswith("—"): 2732 ↛ 2733line 2732 didn't jump to line 2733 because the condition on line 2732 was never true

2733 sense = sense[:-1].strip() 

2734 translations_from_template: list[str] = [] 

2735 

2736 def translation_item_template_fn( 

2737 name: str, ht: TemplateArgs 

2738 ) -> Optional[str]: 

2739 nonlocal langcode 

2740 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

2741 if is_panel_template(wxr, name): 

2742 return "" 

2743 if name in ("t+check", "t-check", "t-needed"): 

2744 # We ignore these templates. They seem to have outright 

2746 # garbage in some entries, and widely varying formatting in 

2746 # others. These should be transitory and unreliable 

2747 # anyway. 

2748 return "__IGNORE__" 

2749 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

2750 code = ht.get(1) 

2751 if code: 2751 ↛ 2761line 2751 didn't jump to line 2761 because the condition on line 2751 was always true

2752 if langcode and code != langcode: 

2753 wxr.wtp.debug( 

2754 "inconsistent language codes {} vs " 

2755 "{} in translation item: {!r} {}".format( 

2756 langcode, code, name, ht 

2757 ), 

2758 sortid="page/2386", 

2759 ) 

2760 langcode = code 

2761 tr = ht.get(2) 

2762 if tr: 

2763 tr = clean_node(wxr, None, [tr]) 

2764 translations_from_template.append(tr) 

2765 return None 
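# Illustrative example for the branch above: a translation item built with
# something like {{t+|fr|chien|m}} passes ht.get(1) == "fr" (kept as
# langcode) and ht.get(2) == "chien", which is cleaned and appended to
# translations_from_template for the later item-text parse.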

2766 if name == "t-egy": 

2767 langcode = "egy" 

2768 return None 

2769 if name == "ttbc": 

2770 code = ht.get(1) 

2771 if code: 2771 ↛ 2773line 2771 didn't jump to line 2773 because the condition on line 2771 was always true

2772 langcode = code 

2773 return None 

2774 if name == "trans-see": 2774 ↛ 2775line 2774 didn't jump to line 2775 because the condition on line 2774 was never true

2775 wxr.wtp.error( 

2776 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

2777 ) 

2778 return "" 

2779 if name.endswith("-top"): 2779 ↛ 2780line 2779 didn't jump to line 2780 because the condition on line 2779 was never true

2780 return "" 

2781 if name.endswith("-bottom"): 2781 ↛ 2782line 2781 didn't jump to line 2782 because the condition on line 2781 was never true

2782 return "" 

2783 if name.endswith("-mid"): 2783 ↛ 2784line 2783 didn't jump to line 2784 because the condition on line 2783 was never true

2784 return "" 

2785 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

2786 # .format(name), 

2787 # sortid="page/2414") 

2788 return None 

2789 

2790 sublists = list( 

2791 x 

2792 for x in contents 

2793 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2794 ) 

2795 contents = list( 

2796 x 

2797 for x in contents 

2798 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2799 ) 

2800 

2801 item = clean_node( 

2802 wxr, data, contents, template_fn=translation_item_template_fn 

2803 ) 

2804 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

2805 

2806 # Parse the translation item. 

2807 if item: 2807 ↛ exitline 2807 didn't return from function 'parse_translation_item' because the condition on line 2807 was always true

2808 lang = parse_translation_item_text( 

2809 wxr, 

2810 word, 

2811 data, 

2812 item, 

2813 sense, 

2814 lang, 

2815 langcode, 

2816 translations_from_template, 

2817 is_reconstruction, 

2818 ) 

2819 

2820 # Handle sublists. They are frequently used for different 

2821 # scripts for the language and different variants of the 

2822 # language. We will include the lower-level header as a 

2823 # tag in those cases. 

2824 for listnode in sublists: 

2825 assert listnode.kind == NodeKind.LIST 

2826 for node in listnode.children: 

2827 if not isinstance(node, WikiNode): 2827 ↛ 2828line 2827 didn't jump to line 2828 because the condition on line 2827 was never true

2828 continue 

2829 if node.kind == NodeKind.LIST_ITEM: 2829 ↛ 2826line 2829 didn't jump to line 2826 because the condition on line 2829 was always true

2830 parse_translation_item(node.children, lang=lang) 

2831 

2832 def parse_translation_template(node: WikiNode) -> None: 

2833 assert isinstance(node, WikiNode) 

2834 

2835 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

2836 nonlocal sense_parts 

2837 nonlocal sense 

2838 if is_panel_template(wxr, name): 

2839 return "" 

2840 if name == "see also": 

2841 # XXX capture 

2842 # XXX for example, "/" has top-level list containing 

2843 # see also items. So we should also parse those. 

2844 return "" 

2845 if name == "trans-see": 

2846 # XXX capture 

2847 return "" 

2848 if name == "see translation subpage": 2848 ↛ 2849line 2848 didn't jump to line 2849 because the condition on line 2848 was never true

2849 sense_parts = [] 

2850 sense = None 

2851 sub = ht.get(1, "") 

2852 if sub: 

2853 m = re.match( 

2854 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

2855 ) 

2856 else: 

2857 m = None 

2858 etym = "" 

2859 etym_numbered = "" 

2860 pos = "" 

2861 if m: 

2862 etym_numbered = m.group(1) 

2863 etym = m.group(2) 

2864 pos = m.group(3) 

2865 if not sub: 

2866 wxr.wtp.debug( 

2867 "no part-of-speech in " 

2868 "{{see translation subpage|...}}, " 

2869 "defaulting to just wxr.wtp.section " 

2870 "(= language)", 

2871 sortid="page/2468", 

2872 ) 

2873 # seq sent to get_subpage_section without sub and pos 

2874 seq = [ 

2875 language, 

2876 TRANSLATIONS_TITLE, 

2877 ] 

2878 elif ( 

2879 m 

2880 and etym.lower().strip() in ETYMOLOGY_TITLES 

2881 and pos.lower() in POS_TITLES 

2882 ): 

2883 seq = [ 

2884 language, 

2885 etym_numbered, 

2886 pos, 

2887 TRANSLATIONS_TITLE, 

2888 ] 

2889 elif sub.lower() in POS_TITLES: 

2890 # seq with sub but not pos 

2891 seq = [ 

2892 language, 

2893 sub, 

2894 TRANSLATIONS_TITLE, 

2895 ] 

2896 else: 

2897 # seq with sub and pos 

2898 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

2899 if pos.lower() not in POS_TITLES: 

2900 wxr.wtp.debug( 

2901 "unhandled see translation subpage: " 

2902 "language={} sub={} " 

2903 "wxr.wtp.subsection={}".format( 

2904 language, sub, wxr.wtp.subsection 

2905 ), 

2906 sortid="page/2478", 

2907 ) 

2908 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

2909 subnode = get_subpage_section( 

2910 wxr.wtp.title or "MISSING_TITLE", 

2911 TRANSLATIONS_TITLE, 

2912 [seq], 

2913 ) 

2914 if subnode is None or not isinstance(subnode, WikiNode): 

2915 # Failed to find the normal subpage section 

2916 # seq with sub and pos 

2917 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

2918 # print(f"{language=}, {pos=}, {TRANSLATIONS_TITLE=}") 

2919 seqs: list[list[str] | tuple[str, ...]] = [ 

2920 [TRANSLATIONS_TITLE], 

2921 [language, pos], 

2922 ] 

2923 subnode = get_subpage_section( 

2924 wxr.wtp.title or "MISSING_TITLE", 

2925 TRANSLATIONS_TITLE, 

2926 seqs, 

2927 ) 

2928 if subnode is not None and isinstance(subnode, WikiNode): 

2929 parse_translations(data, subnode) 

2930 return "" 

2931 if name in ( 

2932 "c", 

2933 "C", 

2934 "categorize", 

2935 "cat", 

2936 "catlangname", 

2937 "topics", 

2938 "top", 

2939 "qualifier", 

2940 "cln", 

2941 ): 

2942 # These are expanded in the default way 

2943 return None 

2944 if name in ( 

2945 "trans-top", 

2946 "trans-top-see", 

2947 ): 

2948 # XXX capture id from trans-top? Capture sense here 

2949 # instead of trying to parse it from expanded content? 

2950 if ht.get(1): 

2951 sense_parts = [] 

2952 sense = ht.get(1) 

2953 else: 

2954 sense_parts = [] 

2955 sense = None 

2956 return None 

2957 if name in ( 

2958 "trans-bottom", 

2959 "trans-mid", 

2960 "checktrans-mid", 

2961 "checktrans-bottom", 

2962 ): 

2963 return None 

2964 if name == "checktrans-top": 

2965 sense_parts = [] 

2966 sense = None 

2967 return "" 

2968 if name == "trans-top-also": 

2969 # XXX capture? 

2970 sense_parts = [] 

2971 sense = None 

2972 return "" 

2973 wxr.wtp.error( 

2974 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

2975 name, ht 

2976 ), 

2977 sortid="page/2517", 

2978 ) 

2979 return "" 

2980 

2981 wxr.wtp.expand( 

2982 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

2983 ) 

2984 

2985 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

2986 nonlocal sense 

2987 nonlocal sense_parts 

2988 for node in xlatnode.children: 

2989 # print(node) 

2990 if isinstance(node, str): 

2991 if sense: 

2992 if not node.isspace(): 

2993 wxr.wtp.debug( 

2994 "skipping string in the middle of " 

2995 "translations: {}".format(node), 

2996 sortid="page/2530", 

2997 ) 

2998 continue 

2999 # Add a part to the sense 

3000 sense_parts.append(node) 

3001 sense = None 

3002 continue 

3003 assert isinstance(node, WikiNode) 

3004 kind = node.kind 

3005 if kind == NodeKind.LIST: 

3006 for item in node.children: 

3007 if not isinstance(item, WikiNode): 3007 ↛ 3008line 3007 didn't jump to line 3008 because the condition on line 3007 was never true

3008 continue 

3009 if item.kind != NodeKind.LIST_ITEM: 3009 ↛ 3010line 3009 didn't jump to line 3010 because the condition on line 3009 was never true

3010 continue 

3011 if item.sarg == ":": 3011 ↛ 3012line 3011 didn't jump to line 3012 because the condition on line 3011 was never true

3012 continue 

3013 parse_translation_item(item.children) 

3014 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3014 ↛ 3018line 3014 didn't jump to line 3018 because the condition on line 3014 was never true

3015 # Silently skip list items that are just indented; these 

3016 # are used for text between translations, such as indicating 

3017 # translations that need to be checked. 

3018 pass 

3019 elif kind == NodeKind.TEMPLATE: 

3020 parse_translation_template(node) 

3021 elif kind in ( 3021 ↛ 3026line 3021 didn't jump to line 3026 because the condition on line 3021 was never true

3022 NodeKind.TABLE, 

3023 NodeKind.TABLE_ROW, 

3024 NodeKind.TABLE_CELL, 

3025 ): 

3026 parse_translation_recurse(node) 

3027 elif kind == NodeKind.HTML: 

3028 if node.attrs.get("class") == "NavFrame": 3028 ↛ 3034line 3028 didn't jump to line 3034 because the condition on line 3028 was never true

3029 # Reset ``sense_parts`` (and force recomputing 

3030 # by clearing ``sense``) as each NavFrame specifies 

3031 # its own sense. This helps eliminate garbage coming 

3032 # from text at the beginning of the translations 

3033 # section. 

3034 sense_parts = [] 

3035 sense = None 

3036 # for item in node.children: 

3037 # if not isinstance(item, WikiNode): 

3038 # continue 

3039 # parse_translation_recurse(item) 

3040 parse_translation_recurse(node) 

3041 elif kind in LEVEL_KINDS: 3041 ↛ 3043line 3041 didn't jump to line 3043 because the condition on line 3041 was never true

3042 # Sub-levels will be recursed elsewhere 

3043 pass 

3044 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 

3045 parse_translation_recurse(node) 

3046 elif kind == NodeKind.PREFORMATTED: 3046 ↛ 3047line 3046 didn't jump to line 3047 because the condition on line 3046 was never true

3047 print("parse_translation_recurse: PREFORMATTED:", node) 

3048 elif kind == NodeKind.LINK: 3048 ↛ 3102line 3048 didn't jump to line 3102 because the condition on line 3048 was always true

3049 arg0 = node.largs[0] 

3050 # Kludge: I've seen occasional normal links to translation 

3051 # subpages from main pages (e.g., language/English/Noun 

3052 # in July 2021) instead of the normal 

3053 # {{see translation subpage|...}} template. This should 

3054 # handle them. Note: must be careful not to read other 

3055 # links, particularly things like in "human being": 

3056 # "a human being -- see [[man/translations]]" (group title) 

3057 if ( 3057 ↛ 3065line 3057 didn't jump to line 3065 because the condition on line 3057 was never true

3058 isinstance(arg0, (list, tuple)) 

3059 and arg0 

3060 and isinstance(arg0[0], str) 

3061 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3062 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3063 == wxr.wtp.title 

3064 ): 

3065 wxr.wtp.debug( 

3066 "translations subpage link found on main " 

3067 "page instead " 

3068 "of normal {{see translation subpage|...}}", 

3069 sortid="page/2595", 

3070 ) 

3071 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3072 if sub.lower() in POS_TITLES: 

3073 seq = [ 

3074 language, 

3075 sub, 

3076 TRANSLATIONS_TITLE, 

3077 ] 

3078 subnode = get_subpage_section( 

3079 wxr.wtp.title, 

3080 TRANSLATIONS_TITLE, 

3081 [seq], 

3082 ) 

3083 if subnode is not None and isinstance( 

3084 subnode, WikiNode 

3085 ): 

3086 parse_translations(data, subnode) 

3087 else: 

3088 wxr.wtp.error( 

3089 "/translations link outside part-of-speech" 

3090 ) 

3091 

3092 if ( 

3093 len(arg0) >= 1 

3094 and isinstance(arg0[0], str) 

3095 and not arg0[0].lower().startswith("category:") 

3096 ): 

3097 for x in node.largs[-1]: 

3098 if isinstance(x, str): 3098 ↛ 3101line 3098 didn't jump to line 3101 because the condition on line 3098 was always true

3099 sense_parts.append(x) 

3100 else: 

3101 parse_translation_recurse(x) 

3102 elif not sense: 

3103 sense_parts.append(node) 

3104 else: 

3105 wxr.wtp.debug( 

3106 "skipping text between translation items/senses: " 

3107 "{}".format(node), 

3108 sortid="page/2621", 

3109 ) 

3110 

3111 # Main code of parse_translations(). We want ``sense`` to be assigned 

3112 # regardless of recursion levels, and thus the code is structured 

3113 # to define it at this level and recurse in parse_translation_recurse(). 

3114 parse_translation_recurse(xlatnode) 

3115 

3116 def parse_etymology(data: WordData, node: LevelNode) -> None: 

3117 """Parses an etymology section.""" 

3118 assert isinstance(data, dict) 

3119 assert isinstance(node, WikiNode) 

3120 

3121 templates: list[TemplateData] = [] 

3122 

3123 # Counter for preventing the capture of etymology templates 

3124 # when we are inside templates that we want to ignore (i.e., 

3125 # not capture). 

3126 ignore_count = 0 
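# Illustrative note on ignore_count (the template names are hypothetical):
# when an ignored template such as {{ignored-outer|{{inner|...}}}} is
# expanded, etym_template_fn raises the counter to 1 first, so the
# ignore_count == 0 check in etym_post_template_fn below skips recording
# {{inner}}; the matching decrement happens only when the outer, ignored
# template's own post-expansion call runs.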

3127 

3128 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3129 nonlocal ignore_count 

3130 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3131 return "" 

3132 if re.match(ignored_etymology_templates_re, name): 

3133 ignore_count += 1 

3134 return None 

3135 

3136 # CONTINUE_HERE 

3137 

3138 def etym_post_template_fn( 

3139 name: str, ht: TemplateArgs, expansion: str 

3140 ) -> None: 

3141 nonlocal ignore_count 

3142 if name in wikipedia_templates: 

3143 parse_wikipedia_template(wxr, data, ht) 

3144 return None 

3145 if re.match(ignored_etymology_templates_re, name): 

3146 ignore_count -= 1 

3147 return None 

3148 if ignore_count == 0: 3148 ↛ 3154line 3148 didn't jump to line 3154 because the condition on line 3148 was always true

3149 ht = clean_template_args(wxr, ht) 

3150 expansion = clean_node(wxr, None, expansion) 

3151 templates.append( 

3152 {"name": name, "args": ht, "expansion": expansion} 

3153 ) 

3154 return None 

3155 

3156 # Remove any subsections 

3157 contents = list( 

3158 x 

3159 for x in node.children 

3160 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3161 ) 

3162 # Convert to text, also capturing templates using post_template_fn 

3163 text = clean_node( 

3164 wxr, 

3165 None, 

3166 contents, 

3167 template_fn=etym_template_fn, 

3168 post_template_fn=etym_post_template_fn, 

3169 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3170 # Save the collected information. 

3171 if len(text) > 0: 

3172 data["etymology_text"] = text 

3173 if len(templates) > 0: 

3174        # Some etymology templates, like Template:root, do not generate

3175 # text, so they should be added here. Elsewhere, we check 

3176 # for Template:root and add some text to the expansion to please 

3177 # the validation. 

3178 data["etymology_templates"] = templates 

3179 

3180 for child_node in node.find_child_recursively( 3180 ↛ exitline 3180 didn't return from function 'parse_etymology' because the loop on line 3180 didn't complete

3181 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3182 ): 

3183 if child_node.kind in LEVEL_KIND_FLAGS: 

3184 break 

3185 elif isinstance( 3185 ↛ 3188line 3185 didn't jump to line 3188 because the condition on line 3185 was never true

3186 child_node, TemplateNode 

3187 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3188 if "etymology_examples" not in data: 

3189 data["etymology_examples"] = [] 

3190 data["etymology_examples"].extend( 

3191 extract_template_zh_x( 

3192 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3193 ) 

3194 ) 

3195 
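    # A minimal illustration of what parse_etymology() leaves behind (the
    # template arguments below are hypothetical, not taken from a real page):
    #   data["etymology_text"] = "From Middle English ..."
    #   data["etymology_templates"] = [
    #       {"name": "inh", "args": {"1": "en", "2": "enm", "3": "word"},
    #        "expansion": "Middle English word"},
    #   ]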

3196 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3197 """This recurses into a subtree in the parse tree for a page.""" 

3198 nonlocal etym_data 

3199 nonlocal pos_data 

3200 nonlocal inside_level_four 

3201 

3202 redirect_list: list[str] = [] # for `zh-see` template 

3203 

3204 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3205 """This is called for otherwise unprocessed parts of the page. 

3206 We still expand them so that e.g. Category links get captured.""" 

3207 if name in wikipedia_templates: 

3208 data = select_data() 

3209 parse_wikipedia_template(wxr, data, ht) 

3210 return None 

3211 if is_panel_template(wxr, name): 

3212 return "" 

3213 return None 

3214 

3215 for node in treenode.children: 

3216 if not isinstance(node, WikiNode): 

3217 # print(" X{}".format(repr(node)[:40])) 

3218 continue 

3219 if isinstance(node, TemplateNode): 

3220 if process_soft_redirect_template(wxr, node, redirect_list): 

3221 continue 

3222 elif node.template_name == "zh-forms": 

3223 extract_zh_forms_template(wxr, node, select_data()) 

3224 elif ( 

3225 node.template_name.endswith("-kanjitab") 

3226 or node.template_name == "ja-kt" 

3227 ): 

3228 extract_ja_kanjitab_template(wxr, node, select_data()) 

3229 

3230 if not isinstance(node, LevelNode): 

3231 # XXX handle e.g. wikipedia links at the top of a language 

3232 # XXX should at least capture "also" at top of page 

3233 if node.kind in ( 

3234 NodeKind.HLINE, 

3235 NodeKind.LIST, 

3236 NodeKind.LIST_ITEM, 

3237 ): 

3238 continue 

3239 # print(" UNEXPECTED: {}".format(node)) 

3240 # Clean the node to collect category links 

3241 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3242 continue 

3243 t = clean_node( 

3244 wxr, etym_data, node.sarg if node.sarg else node.largs 

3245 ) 

3246 t = t.lower() 

3247 # XXX these counts were never implemented fully, and even this 

3248 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3249 wxr.config.section_counts[t] += 1 

3250 # print("PROCESS_CHILDREN: T:", repr(t)) 

3251 if t in IGNORED_TITLES: 

3252 pass 

3253 elif t.startswith(PRONUNCIATION_TITLE): 

3254 # Chinese Pronunciation section kludge; we demote these to 

3255 # be level 4 instead of 3 so that they're part of a larger 

3256 # etymology hierarchy; usually the data here is empty and 

3257            # acts as an intermediate level between POS and Etymology data

3258 if lang_code in ("zh",): 

3259 inside_level_four = True 

3260 if t.startswith(PRONUNCIATION_TITLE + " "): 

3261 # Pronunciation 1, etc, are used in Chinese Glyphs, 

3262 # and each of them may have senses under Definition 

3263 push_level_four_section(True) 

3264 wxr.wtp.start_subsection(None) 

3265 if wxr.config.capture_pronunciation: 3265 ↛ 3373line 3265 didn't jump to line 3373 because the condition on line 3265 was always true

3266 data = select_data() 

3267 parse_pronunciation( 

3268 wxr, 

3269 node, 

3270 data, 

3271 etym_data, 

3272 have_etym, 

3273 base_data, 

3274 lang_code, 

3275 ) 

3276 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3277 push_etym() 

3278 wxr.wtp.start_subsection(None) 

3279 if wxr.config.capture_etymologies: 3279 ↛ 3373line 3279 didn't jump to line 3373 because the condition on line 3279 was always true

3280 m = re.search(r"\s(\d+)$", t) 

3281 if m: 

3282 etym_data["etymology_number"] = int(m.group(1)) 

3283 parse_etymology(etym_data, node) 

3284 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 

3285 data = select_data() 

3286 extract_descendant_section(wxr, data, node, False) 

3287 elif ( 

3288 t in PROTO_ROOT_DERIVED_TITLES 

3289 and pos == "root" 

3290 and is_reconstruction 

3291 and wxr.config.capture_descendants 

3292 ): 

3293 data = select_data() 

3294 extract_descendant_section(wxr, data, node, True) 

3295 elif t == TRANSLATIONS_TITLE: 

3296 data = select_data() 

3297 parse_translations(data, node) 

3298 elif t in INFLECTION_TITLES: 

3299 parse_inflection(node, t, pos) 

3300 elif t == "alternative forms": 

3301 extract_alt_form_section(wxr, select_data(), node) 

3302 else: 

3303 lst = t.split() 

3304 while len(lst) > 1 and lst[-1].isdigit(): 3304 ↛ 3305line 3304 didn't jump to line 3305 because the condition on line 3304 was never true

3305 lst = lst[:-1] 

3306 t_no_number = " ".join(lst).lower() 

3307 if t_no_number in POS_TITLES: 

3308 push_pos() 

3309 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3310 pos = dt["pos"] or "MISSING_POS" 

3311 wxr.wtp.start_subsection(t) 

3312 if "debug" in dt: 

3313 wxr.wtp.debug( 

3314 "{} in section {}".format(dt["debug"], t), 

3315 sortid="page/2755", 

3316 ) 

3317 if "warning" in dt: 3317 ↛ 3318line 3317 didn't jump to line 3318 because the condition on line 3317 was never true

3318 wxr.wtp.wiki_notice( 

3319 "{} in section {}".format(dt["warning"], t), 

3320 sortid="page/2759", 

3321 ) 

3322 if "error" in dt: 3322 ↛ 3323line 3322 didn't jump to line 3323 because the condition on line 3322 was never true

3323 wxr.wtp.error( 

3324 "{} in section {}".format(dt["error"], t), 

3325 sortid="page/2763", 

3326 ) 

3327 if "note" in dt: 3327 ↛ 3328line 3327 didn't jump to line 3328 because the condition on line 3327 was never true

3328 wxr.wtp.note( 

3329 "{} in section {}".format(dt["note"], t), 

3330 sortid="page/20251017a", 

3331 ) 

3332 if "wiki_notice" in dt: 3332 ↛ 3333line 3332 didn't jump to line 3333 because the condition on line 3332 was never true

3333 wxr.wtp.wiki_notice( 

3334 "{} in section {}".format(dt["wiki_notices"], t), 

3335 sortid="page/20251017b", 

3336 ) 

3337 # Parse word senses for the part-of-speech 

3338 parse_part_of_speech(node, pos) 

3339 if "tags" in dt: 

3340 for pdata in sense_datas: 

3341 data_extend(pdata, "tags", dt["tags"]) 

3342 elif t_no_number in LINKAGE_TITLES: 

3343 # print(f"LINKAGE_TITLES NODE {node=}") 

3344 rel = LINKAGE_TITLES[t_no_number] 

3345 data = select_data() 

3346 parse_linkage( 

3347 wxr, 

3348 data, 

3349 rel, 

3350 node, 

3351 word, 

3352 sense_datas, 

3353 is_reconstruction, 

3354 ) 

3355 elif t_no_number == COMPOUNDS_TITLE: 

3356 data = select_data() 

3357 if wxr.config.capture_compounds: 3357 ↛ 3373line 3357 didn't jump to line 3373 because the condition on line 3357 was always true

3358 parse_linkage( 

3359 wxr, 

3360 data, 

3361 "derived", 

3362 node, 

3363 word, 

3364 sense_datas, 

3365 is_reconstruction, 

3366 ) 

3367 

3368 # XXX parse interesting templates also from other sections. E.g., 

3369 # {{Letter|...}} in ===See also=== 

3370 # Also <gallery> 

3371 

3372 # Recurse to children of this node, processing subtitles therein 

3373 stack.append(t) 

3374 process_children(node, pos) 

3375 stack.pop() 

3376 

3377 if len(redirect_list) > 0: 

3378 if len(pos_data) > 0: 

3379 pos_data["redirects"] = redirect_list 

3380 if "pos" not in pos_data: 3380 ↛ 3381line 3380 didn't jump to line 3381 because the condition on line 3380 was never true

3381 pos_data["pos"] = "soft-redirect" 

3382 else: 

3383 new_page_data = copy.deepcopy(base_data) 

3384 new_page_data["redirects"] = redirect_list 

3385 if "pos" not in new_page_data: 3385 ↛ 3387line 3385 didn't jump to line 3387 because the condition on line 3385 was always true

3386 new_page_data["pos"] = "soft-redirect" 

3387 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3388 page_datas.append(new_page_data) 

3389 

3390 def extract_examples( 

3391 others: list[WikiNode], sense_base: SenseData 

3392 ) -> list[ExampleData]: 

3393 """Parses through a list of definitions and quotes to find examples. 

3394 Returns a list of example dicts to be added to sense data. Adds 

3395 meta-data, mostly categories, into sense_base.""" 

3396 assert isinstance(others, list) 

3397 examples: list[ExampleData] = [] 

3398 

3399 for sub in others: 

3400 if not sub.sarg.endswith((":", "*")): 3400 ↛ 3401line 3400 didn't jump to line 3401 because the condition on line 3400 was never true

3401 continue 

3402 for item in sub.children: 

3403 if not isinstance(item, WikiNode): 3403 ↛ 3404line 3403 didn't jump to line 3404 because the condition on line 3403 was never true

3404 continue 

3405 if item.kind != NodeKind.LIST_ITEM: 3405 ↛ 3406line 3405 didn't jump to line 3406 because the condition on line 3405 was never true

3406 continue 

3407 usex_type = None 

3408 example_template_args = [] 

3409 example_template_names = [] 

3410 taxons = set() 

3411 

3412 # Bypass this function when parsing Chinese, Japanese and 

3413 # quotation templates. 

3414 new_example_lists = extract_example_list_item( 

3415 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3416 ) 

3417 if len(new_example_lists) > 0: 

3418 examples.extend(new_example_lists) 

3419 continue 

3420 

3421 def usex_template_fn( 

3422 name: str, ht: TemplateArgs 

3423 ) -> Optional[str]: 

3424 nonlocal usex_type 

3425 if is_panel_template(wxr, name): 

3426 return "" 

3427 if name in usex_templates: 

3428 usex_type = "example" 

3429 example_template_args.append(ht) 

3430 example_template_names.append(name) 

3431 elif name in quotation_templates: 

3432 usex_type = "quotation" 

3433 elif name in taxonomy_templates: 3433 ↛ 3434line 3433 didn't jump to line 3434 because the condition on line 3433 was never true

3434 taxons.update(ht.get(1, "").split()) 

3435 for prefix in template_linkages_to_ignore_in_examples: 

3436 if re.search( 

3437 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3438 ): 

3439 return "" 

3440 return None 

3441 

3442 # bookmark 

3443 ruby: list[tuple[str, str]] = [] 

3444 contents = item.children 

3445 if lang_code == "ja": 

3446 # Capture ruby contents if this is a Japanese language 

3447 # example. 

3448 # print(contents) 

3449 if ( 3449 ↛ 3454line 3449 didn't jump to line 3454 because the condition on line 3449 was never true

3450 contents 

3451                     and isinstance(contents[0], str)

3452 and re.match(r"\s*$", contents[0]) 

3453 ): 

3454 contents = contents[1:] 

3455 exp = wxr.wtp.parse( 

3456 wxr.wtp.node_to_wikitext(contents), 

3457 # post_template_fn=head_post_template_fn, 

3458 expand_all=True, 

3459 ) 

3460 rub, rest = extract_ruby(wxr, exp.children) 

3461 if rub: 

3462 for rtup in rub: 

3463 ruby.append(rtup) 

3464 contents = rest 

3465 subtext = clean_node( 

3466 wxr, sense_base, contents, template_fn=usex_template_fn 

3467 ) 

3468 

3469 frozen_taxons = frozenset(taxons) 

3470 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3471 

3472 # print(f"{subtext=}") 

3473 subtext = re.sub( 

3474 r"\s*\(please add an English " 

3475 r"translation of this " 

3476 r"(example|usage example|quote)\)", 

3477 "", 

3478 subtext, 

3479 ).strip() 

3480 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3481 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3482 # print("subtext:", repr(subtext)) 

3483 

3484 lines = subtext.splitlines() 

3485 # print(lines) 

3486 

3487 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3488 lines = list( 

3489 x 

3490 for x in lines 

3491 if not re.match( 

3492 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3493 r"Synonym: |Antonym: |Hyponym: |" 

3494 r"Hypernyms: |Derived terms: |" 

3495 r"Related terms: |" 

3496 r"Hypernym: |Derived term: |" 

3497 r"Coordinate terms:|" 

3498 r"Related term: |" 

3499 r"For more quotations using )", 

3500 x, 

3501 ) 

3502 ) 

3503 tr = "" 

3504 ref = "" 

3505 roman = "" 

3506 # for line in lines: 

3507 # print("LINE:", repr(line)) 

3508 # print(classify_desc(line)) 

3509 if len(lines) == 1 and lang_code != "en": 

3510 parts = example_splitter_re.split(lines[0]) 

3511 if ( 3511 ↛ 3519line 3511 didn't jump to line 3519 because the condition on line 3511 was never true

3512 len(parts) > 2 

3513 and len(example_template_args) == 1 

3514 and any( 

3515 ("―" in s) or ("—" in s) 

3516 for s in example_template_args[0].values() 

3517 ) 

3518 ): 

3519 if nparts := synch_splits_with_args( 

3520 lines[0], example_template_args[0] 

3521 ): 

3522 parts = nparts 

3523 if ( 3523 ↛ 3528line 3523 didn't jump to line 3528 because the condition on line 3523 was never true

3524 len(example_template_args) == 1 

3525 and "lit" in example_template_args[0] 

3526 ): 

3527 # ugly brute-force kludge in case there's a lit= arg 

3528 literally = example_template_args[0].get("lit", "") 

3529 if literally: 

3530 literally = ( 

3531 " (literally, “" 

3532 + clean_value(wxr, literally) 

3533 + "”)" 

3534 ) 

3535 else: 

3536 literally = "" 

3537 if ( 3537 ↛ 3576line 3537 didn't jump to line 3576 because the condition on line 3537 was never true

3538 len(example_template_args) == 1 

3539 and len(parts) == 2 

3540 and len(example_template_args[0]) 

3541 - ( 

3542 # horrible kludge to ignore these arguments 

3543 # when calculating how many there are 

3544 sum( 

3545 s in example_template_args[0] 

3546 for s in ( 

3547 "lit", # generates text, but we handle it 

3548 "inline", 

3549 "noenum", 

3550 "nocat", 

3551 "sort", 

3552 ) 

3553 ) 

3554 ) 

3555 == 3 

3556 and clean_value( 

3557 wxr, example_template_args[0].get(2, "") 

3558 ) 

3559 == parts[0].strip() 

3560 and clean_value( 

3561 wxr, 

3562 ( 

3563 example_template_args[0].get(3) 

3564 or example_template_args[0].get("translation") 

3565 or example_template_args[0].get("t", "") 

3566 ) 

3567 + literally, # in case there's a lit= argument 

3568 ) 

3569 == parts[1].strip() 

3570 ): 

3571 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3572 # is a pretty reliable 'heuristic', so we use it here 

3573 # before the others. To be extra sure the template 

3574 # doesn't do anything weird, we compare the arguments 

3575 # and the output to each other. 

3576 lines = [parts[0].strip()] 

3577 tr = parts[1].strip() 

3578 elif ( 

3579 len(parts) == 2 

3580 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3581 ): 

3582 # These other branches just do some simple heuristics w/ 

3583 # the expanded output of the template (if applicable). 

3584 lines = [parts[0].strip()] 

3585 tr = parts[1].strip() 

3586 elif ( 3586 ↛ 3592line 3586 didn't jump to line 3592 because the condition on line 3586 was never true

3587 len(parts) == 3 

3588 and classify_desc2(parts[1]) 

3589 in ("romanization", "english") 

3590 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3591 ): 

3592 lines = [parts[0].strip()] 

3593 roman = parts[1].strip() 

3594 tr = parts[2].strip() 

3595 else: 

3596 parts = re.split(r"\s+-\s+", lines[0]) 

3597 if ( 3597 ↛ 3601line 3597 didn't jump to line 3601 because the condition on line 3597 was never true

3598 len(parts) == 2 

3599 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3600 ): 

3601 lines = [parts[0].strip()] 

3602 tr = parts[1].strip() 

3603 elif len(lines) > 1: 

3604 if any( 

3605 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3606 ) and not (len(example_template_names) == 1): 

3607 refs: list[str] = [] 

3608 for i in range(len(lines)): 3608 ↛ 3614line 3608 didn't jump to line 3614 because the loop on line 3608 didn't complete

3609 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 3609 ↛ 3610line 3609 didn't jump to line 3610 because the condition on line 3609 was never true

3610 break 

3611 refs.append(lines[i].strip()) 

3612 if re.search(r"[]\d:)]\s*$", lines[i]): 

3613 break 

3614 ref = " ".join(refs) 

3615 lines = lines[i + 1 :] 

3616 if ( 

3617 lang_code != "en" 

3618 and len(lines) >= 2 

3619 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3620 ): 

3621 i = len(lines) - 1 

3622 while ( 3622 ↛ 3627line 3622 didn't jump to line 3627 because the condition on line 3622 was never true

3623 i > 1 

3624 and classify_desc2(lines[i - 1]) 

3625 in ENGLISH_TEXTS 

3626 ): 

3627 i -= 1 

3628 tr = "\n".join(lines[i:]) 

3629 lines = lines[:i] 

3630 if len(lines) >= 2: 

3631 if classify_desc2(lines[-1]) == "romanization": 

3632 roman = lines[-1].strip() 

3633 lines = lines[:-1] 

3634 

3635 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 

3636 ref = lines[0] 

3637 lines = lines[1:] 

3638 elif lang_code != "en" and len(lines) == 2: 

3639 cls1 = classify_desc2(lines[0]) 

3640 cls2 = classify_desc2(lines[1]) 

3641 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3642 tr = lines[1] 

3643 lines = [lines[0]] 

3644 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 3644 ↛ 3645line 3644 didn't jump to line 3645 because the condition on line 3644 was never true

3645 tr = lines[0] 

3646 lines = [lines[1]] 

3647 elif ( 3647 ↛ 3654line 3647 didn't jump to line 3654 because the condition on line 3647 was never true

3648 re.match(r"^[#*]*:+", lines[1]) 

3649 and classify_desc2( 

3650 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3651 ) 

3652 in ENGLISH_TEXTS 

3653 ): 

3654 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3655 lines = [lines[0]] 

3656 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3657 # Both were classified as English, but 

3658 # presumably one is not. Assume first is 

3659 # non-English, as that seems more common. 

3660 tr = lines[1] 

3661 lines = [lines[0]] 

3662 elif ( 

3663 usex_type != "quotation" 

3664 and lang_code != "en" 

3665 and len(lines) == 3 

3666 ): 

3667 cls1 = classify_desc2(lines[0]) 

3668 cls2 = classify_desc2(lines[1]) 

3669 cls3 = classify_desc2(lines[2]) 

3670 if ( 

3671 cls3 == "english" 

3672 and cls2 in ("english", "romanization") 

3673 and cls1 != "english" 

3674 ): 

3675 tr = lines[2].strip() 

3676 roman = lines[1].strip() 

3677 lines = [lines[0].strip()] 

3678 elif ( 3678 ↛ 3686line 3678 didn't jump to line 3686 because the condition on line 3678 was never true

3679 usex_type == "quotation" 

3680 and lang_code != "en" 

3681 and len(lines) > 2 

3682 ): 

3683 # for x in lines: 

3684 # print(" LINE: {}: {}" 

3685 # .format(classify_desc2(x), x)) 

3686 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3687 ref = lines[0] 

3688 lines = lines[2:] 

3689 cls1 = classify_desc2(lines[-1]) 

3690 if cls1 == "english": 

3691 i = len(lines) - 1 

3692 while ( 

3693 i > 1 

3694 and classify_desc2(lines[i - 1]) 

3695                             in ENGLISH_TEXTS

3696 ): 

3697 i -= 1 

3698 tr = "\n".join(lines[i:]) 

3699 lines = lines[:i] 

3700 

3701 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3702 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3703 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3704 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3705 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3706 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3707 ref = re.sub( 

3708 r", (volume |number |page )?“?" 

3709 r"\(please specify ([^)]|\(s\))*\)”?|" 

3710 ", text here$", 

3711 "", 

3712 ref, 

3713 ) 

3714 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3715 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3716 subtext = "\n".join(x for x in lines if x) 

3717 if not tr and lang_code != "en": 

3718 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3719 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 3719 ↛ 3720line 3719 didn't jump to line 3720 because the condition on line 3719 was never true

3720 tr = m.group(2) 

3721 subtext = subtext[: m.start()] + m.group(1) 

3722 elif lines: 

3723 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

3724 if ( 3724 ↛ 3728line 3724 didn't jump to line 3728 because the condition on line 3724 was never true

3725 len(parts) == 2 

3726 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3727 ): 

3728 subtext = parts[0].strip() 

3729 tr = parts[1].strip() 

3730 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

3731 subtext = re.sub( 

3732 r"(please add an English translation of " 

3733 r"this (quote|usage example))", 

3734 "", 

3735 subtext, 

3736 ) 

3737 subtext = re.sub( 

3738 r"\s*→New International Version " "translation$", 

3739 "", 

3740 subtext, 

3741 ) # e.g. pis/Tok Pisin (Bible) 

3742 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

3743 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

3744 note = None 

3745 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

3746 if ( 3746 ↛ 3754line 3746 didn't jump to line 3754 because the condition on line 3746 was never true

3747 m is not None 

3748 and lang_code != "en" 

3749 and ( 

3750 m.group(1).startswith("with ") 

3751 or classify_desc2(m.group(1)) == "english" 

3752 ) 

3753 ): 

3754 note = m.group(1) 

3755 subtext = subtext[m.end() :] 

3756 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

3757 ref = re.sub(r",\s*→ISBN", "", ref) 

3758 ref = ref.strip() 

3759 if ref.endswith(":") or ref.endswith(","): 

3760 ref = ref[:-1].strip() 

3761 ref = re.sub(r"\s+,\s+", ", ", ref) 

3762 ref = re.sub(r"\s+", " ", ref) 

3763 if ref and not subtext: 3763 ↛ 3764line 3763 didn't jump to line 3764 because the condition on line 3763 was never true

3764 subtext = ref 

3765 ref = "" 

3766 if subtext: 

3767 dt: ExampleData = {"text": subtext} 

3768 if ref: 

3769 dt["ref"] = ref 

3770 if tr: 

3771 dt["english"] = tr # DEPRECATED for "translation" 

3772 dt["translation"] = tr 

3773 if usex_type: 

3774 dt["type"] = usex_type 

3775 if note: 3775 ↛ 3776line 3775 didn't jump to line 3776 because the condition on line 3775 was never true

3776 dt["note"] = note 

3777 if roman: 

3778 dt["roman"] = roman 

3779 if ruby: 

3780 dt["ruby"] = ruby 

3781 examples.append(dt) 

3782 

3783 return examples 

3784 
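    # A minimal sketch of one dict appended by extract_examples(); the field
    # names come from the code above, the values are hypothetical:
    #   {"text": "...", "ref": "Author, Title, 1999", "translation": "...",
    #    "english": "...",  # kept only for backwards compatibility
    #    "type": "quotation", "roman": "...", "ruby": [("漢", "かん")]}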

3785 # Main code of parse_language() 

3786 # Process the section 

3787 stack.append(language) 

3788 process_children(langnode, None) 

3789 stack.pop() 

3790 

3791    # Finalize word entries

3792 push_etym() 

3793 ret = [] 

3794 for data in page_datas: 

3795 merge_base(data, base_data) 

3796 ret.append(data) 

3797 

3798 # Copy all tags to word senses 

3799 for data in ret: 

3800 if "senses" not in data: 3800 ↛ 3801line 3800 didn't jump to line 3801 because the condition on line 3800 was never true

3801 continue 

3802 # WordData should not have a 'tags' field, but if it does, it's 

3803    # deleted and its contents are copied into each sense;

3804    # hence the `type: ignore` comments below.

3805 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

3806 if "tags" in data: 

3807 del data["tags"] # type: ignore[typeddict-item] 

3808 for sense in data["senses"]: 

3809 data_extend(sense, "tags", tags) 

3810 

3811 return ret 

3812 

3813 

3814def parse_wikipedia_template( 

3815 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

3816) -> None: 

3817 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

3818 assert isinstance(wxr, WiktextractContext) 

3819 assert isinstance(data, dict) 

3820 assert isinstance(ht, dict) 

3821 langid = clean_node(wxr, data, ht.get("lang", ())) 

3822 pagename = ( 

3823 clean_node(wxr, data, ht.get(1, ())) 

3824 or wxr.wtp.title 

3825 or "MISSING_PAGE_TITLE" 

3826 ) 

3827 if langid: 

3828 data_append(data, "wikipedia", langid + ":" + pagename) 

3829 else: 

3830 data_append(data, "wikipedia", pagename) 

3831 

3832 
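# For instance, {{wikipedia|lang=fr|Paris}} results in data["wikipedia"]
# receiving "fr:Paris", while a bare {{wikipedia}} falls back to the page
# title (a short illustration of the branches directly above).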

3833def parse_top_template( 

3834 wxr: WiktextractContext, node: WikiNode, data: WordData 

3835) -> None: 

3836 """Parses a template that occurs on the top-level in a page, before any 

3837 language subtitles.""" 

3838 assert isinstance(wxr, WiktextractContext) 

3839 assert isinstance(node, WikiNode) 

3840 assert isinstance(data, dict) 

3841 

3842 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3843 if name in wikipedia_templates: 

3844 parse_wikipedia_template(wxr, data, ht) 

3845 return None 

3846 if is_panel_template(wxr, name): 

3847 return "" 

3848 if name in ("reconstruction",): 3848 ↛ 3849line 3848 didn't jump to line 3849 because the condition on line 3848 was never true

3849 return "" 

3850 if name.lower() == "also" or name.lower().startswith("also/"): 

3851 # XXX shows related words that might really have been the intended 

3852 # word, capture them 

3853 return "" 

3854 if name == "see also": 3854 ↛ 3856line 3854 didn't jump to line 3856 because the condition on line 3854 was never true

3855 # XXX capture 

3856 return "" 

3857 if name == "cardinalbox": 3857 ↛ 3859line 3857 didn't jump to line 3859 because the condition on line 3857 was never true

3858 # XXX capture 

3859 return "" 

3860 if name == "character info": 3860 ↛ 3862line 3860 didn't jump to line 3862 because the condition on line 3860 was never true

3861 # XXX capture 

3862 return "" 

3863 if name == "commonscat": 3863 ↛ 3865line 3863 didn't jump to line 3865 because the condition on line 3863 was never true

3864 # XXX capture link to Wikimedia commons 

3865 return "" 

3866 if name == "wrongtitle": 3866 ↛ 3869line 3866 didn't jump to line 3869 because the condition on line 3866 was never true

3867 # XXX this should be captured to replace page title with the 

3868 # correct title. E.g. ⿰亻革家 

3869 return "" 

3870 if name == "wikidata": 3870 ↛ 3871line 3870 didn't jump to line 3871 because the condition on line 3870 was never true

3871 arg = clean_node(wxr, data, ht.get(1, ())) 

3872 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

3873 data_append(data, "wikidata", arg) 

3874 return "" 

3875 wxr.wtp.debug( 

3876 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

3877 sortid="page/2870", 

3878 ) 

3879 return "" 

3880 

3881 clean_node(wxr, None, [node], template_fn=top_template_fn) 

3882 

3883 
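# Illustration of the top-level template handling above: {{wikidata|Q42}}
# appends "Q42" to data["wikidata"], while panel templates and {{also|...}}
# are simply replaced with the empty string.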

3884def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

3885 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

3886 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

3887 that are next to each other.""" 

3888 

3889 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

3890    # section gets overwritten. In this case, let's just combine the two.

3891 

3892 # In Chinese entries, Pronunciation can be preceded on the 

3893 # same level 3 by its Etymology *and* Glyph Origin sections: 

3894 # ===Glyph Origin=== 

3895 # ===Etymology=== 

3896 # ===Pronunciation=== 

3897 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

3898 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

3899 # are now level 6 

3900 

3901    # Known lowercase PoS names are in POS_TITLES

3902    # Known lowercase linkage section names are in LINKAGE_TITLES

3903 

3904 old = re.split( 

3905 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

3906 ) 

3907 

3908 parts = [] 

3909 npar = 4 # Number of parentheses in above expression 

3910 parts.append(old[0]) 

3911 prev_level = None 

3912 level = None 

3913 skip_level_title = False # When combining etymology sections 

3914 for i in range(1, len(old), npar + 1): 

3915 left = old[i] 

3916 right = old[i + npar - 1] 

3917 # remove Wikilinks in title 

3918 title = re.sub(r"^\[\[", "", old[i + 1]) 

3919 title = re.sub(r"\]\]$", "", title) 

3920 prev_level = level 

3921 level = len(left) 

3922 part = old[i + npar] 

3923 if level != len(right): 3923 ↛ 3924line 3923 didn't jump to line 3924 because the condition on line 3923 was never true

3924 wxr.wtp.debug( 

3925 "subtitle has unbalanced levels: " 

3926 "{!r} has {} on the left and {} on the right".format( 

3927 title, left, right 

3928 ), 

3929 sortid="page/2904", 

3930 ) 

3931 lc = title.lower() 

3932 if name_to_code(title, "en") != "": 

3933 if level > 2: 3933 ↛ 3934line 3933 didn't jump to line 3934 because the condition on line 3933 was never true

3934 wxr.wtp.debug( 

3935 "subtitle has language name {} at level {}".format( 

3936 title, level 

3937 ), 

3938 sortid="page/2911", 

3939 ) 

3940 level = 2 

3941 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

3942 if level > 3: 3942 ↛ 3943line 3942 didn't jump to line 3943 because the condition on line 3942 was never true

3943 wxr.wtp.debug( 

3944 "etymology section {} at level {}".format(title, level), 

3945 sortid="page/2917", 

3946 ) 

3947 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 

3948 # sections cheek-to-cheek 

3949 skip_level_title = True 

3950 # Modify the title of previous ("Glyph Origin") section, in 

3951 # case we have a meaningful title like "Etymology 1" 

3952 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

3953 level = 3 

3954 elif lc.startswith(PRONUNCIATION_TITLE): 

3955 # Pronunciation is now a level between POS and Etymology, so 

3956 # we need to shift everything down by one 

3957 level = 4 

3958 elif lc in POS_TITLES: 

3959 level = 5 

3960 elif lc == TRANSLATIONS_TITLE: 

3961 level = 6 

3962 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 

3963 level = 6 

3964 elif lc in INFLECTION_TITLES: 

3965 level = 6 

3966 elif lc == DESCENDANTS_TITLE: 

3967 level = 6 

3968 elif title in PROTO_ROOT_DERIVED_TITLES: 3968 ↛ 3969line 3968 didn't jump to line 3969 because the condition on line 3968 was never true

3969 level = 6 

3970 elif lc in IGNORED_TITLES: 

3971 level = 6 

3972 else: 

3973 level = 6 

3974 if skip_level_title: 

3975 skip_level_title = False 

3976 parts.append(part) 

3977 else: 

3978 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

3979 parts.append(part) 

3980 # print("=" * level, title) 

3981 # if level != len(left): 

3982 # print(" FIXED LEVEL OF {} {} -> {}" 

3983 # .format(title, len(left), level)) 

3984 

3985 text = "".join(parts) 

3986 # print(text) 

3987 return text 

3988 

3989 
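# Illustration of the normalized hierarchy produced by fix_subtitle_hierarchy()
# for a typical entry:
#   ==English==                 (language, level 2)
#   ===Etymology===             (level 3)
#   ====Pronunciation====       (level 4)
#   =====Noun=====              (part of speech, level 5)
#   ======Translations======    (level 6; linkage, inflection, descendants and
#                                everything else also end up on level 6)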

3990def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

3991 # Skip translation pages 

3992 if word.endswith("/" + TRANSLATIONS_TITLE): 3992 ↛ 3993line 3992 didn't jump to line 3993 because the condition on line 3992 was never true

3993 return [] 

3994 

3995 if wxr.config.verbose: 3995 ↛ 3996line 3995 didn't jump to line 3996 because the condition on line 3995 was never true

3996 logger.info(f"Parsing page: {word}") 

3997 

3998 wxr.config.word = word 

3999 wxr.wtp.start_page(word) 

4000 

4001 # Remove <noinclude> and similar tags from main pages. They 

4002 # should not appear there, but at least net/Elfdala has one and it 

4003 # is probably not the only one. 

4004 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4005 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4006 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4007 

4008 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4009 # pages that have, for example, Translations section under Linkage, or 

4010 # Translations section on the same level as Noun. Enforce a proper 

4011 # hierarchy by manipulating the subtitle levels in certain cases. 

4012 text = fix_subtitle_hierarchy(wxr, text) 

4013 

4014 # Parse the page, pre-expanding those templates that are likely to 

4015 # influence parsing 

4016 tree = wxr.wtp.parse( 

4017 text, 

4018 pre_expand=True, 

4019 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4020 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4021 ) 

4022 # from wikitextprocessor.parser import print_tree 

4023 # print("PAGE PARSE:", print_tree(tree)) 

4024 

4025 top_data: WordData = {} 

4026 

4027 # Iterate over top-level titles, which should be languages for normal 

4028 # pages 

4029 by_lang = defaultdict(list) 

4030 for langnode in tree.children: 

4031 if not isinstance(langnode, WikiNode): 

4032 continue 

4033 if langnode.kind == NodeKind.TEMPLATE: 

4034 parse_top_template(wxr, langnode, top_data) 

4035 continue 

4036 if langnode.kind == NodeKind.LINK: 

4037 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4038 continue 

4039 if langnode.kind != NodeKind.LEVEL2: 4039 ↛ 4040line 4039 didn't jump to line 4040 because the condition on line 4039 was never true

4040 wxr.wtp.debug( 

4041 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4042 ) 

4043 continue 

4044 lang = clean_node( 

4045 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4046 ) 

4047 lang_code = name_to_code(lang, "en") 

4048 if lang_code == "": 4048 ↛ 4049line 4048 didn't jump to line 4049 because the condition on line 4048 was never true

4049 wxr.wtp.debug( 

4050 f"unrecognized language name: {lang}", sortid="page/3019" 

4051 ) 

4052 if ( 

4053 wxr.config.capture_language_codes 

4054 and lang_code not in wxr.config.capture_language_codes 

4055 ): 

4056 continue 

4057 wxr.wtp.start_section(lang) 

4058 

4059 # Collect all words from the page. 

4060 # print(f"{langnode=}") 

4061 datas = parse_language(wxr, langnode, lang, lang_code) 

4062 

4063 # Propagate fields resulting from top-level templates to this 

4064 # part-of-speech. 

4065 for data in datas: 

4066 if "lang" not in data: 4066 ↛ 4067line 4066 didn't jump to line 4067 because the condition on line 4066 was never true

4067 wxr.wtp.debug( 

4068 "internal error -- no lang in data: {}".format(data), 

4069 sortid="page/3034", 

4070 ) 

4071 continue 

4072 for k, v in top_data.items(): 

4073 assert isinstance(v, (list, tuple)) 

4074 data_extend(data, k, v) 

4075 by_lang[data["lang"]].append(data) 

4076 

4077 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4078 # field. FIX OR REMOVE. 

4079 # Do some post-processing on the words. For example, we may distribute 

4080 # conjugation information to all the words. 

4081 ret = [] 

4082 for lang, lang_datas in by_lang.items(): 

4083 ret.extend(lang_datas) 

4084 

4085 for x in ret: 

4086 if x["word"] != word: 

4087 if word.startswith("Unsupported titles/"): 

4088 wxr.wtp.debug( 

4089 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4090 sortid="20231101/3578page.py", 

4091 ) 

4092 else: 

4093 wxr.wtp.debug( 

4094 f"DIFFERENT ORIGINAL TITLE: '{word}' -> '{x['word']}'", 

4095 sortid="20231101/3582page.py", 

4096 ) 

4097 x["original_title"] = word 

4098 # validate tag data 

4099 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4100 return ret 

4101 

4102 
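# A rough usage sketch: parse_page(wxr, "word", wikitext) returns a list of
# WordData dicts, roughly one per part-of-speech section per language, each
# carrying a "lang" field and with unrecognized tags already moved to
# "raw_tags" by recursively_separate_raw_tags() below.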

4103def recursively_separate_raw_tags( 

4104 wxr: WiktextractContext, data: dict[str, Any] 

4105) -> None: 

4106 if not isinstance(data, dict): 4106 ↛ 4107line 4106 didn't jump to line 4107 because the condition on line 4106 was never true

4107 wxr.wtp.error( 

4108 "'data' is not dict; most probably " 

4109 "data has a list that contains at least one dict and " 

4110 "at least one non-dict item", 

4111 sortid="en/page-4016/20240419", 

4112 ) 

4113 return 

4114 new_tags: list[str] = [] 

4115 raw_tags: list[str] = data.get("raw_tags", []) 

4116 for field, val in data.items(): 

4117 if field == "tags": 

4118 for tag in val: 

4119 if tag not in valid_tags: 

4120 raw_tags.append(tag) 

4121 else: 

4122 new_tags.append(tag) 

4123 if isinstance(val, list): 

4124 if len(val) > 0 and isinstance(val[0], dict): 

4125 for d in val: 

4126 recursively_separate_raw_tags(wxr, d) 

4127 if "tags" in data and not new_tags: 

4128 del data["tags"] 

4129 elif new_tags: 

4130 data["tags"] = new_tags 

4131 if raw_tags: 

4132 data["raw_tags"] = raw_tags 

4133 

4134 
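# A small sketch of the separation above, assuming "plural" is a valid tag
# and "Pre-reform" is not:
#   {"tags": ["plural", "Pre-reform"]}
#   -> {"tags": ["plural"], "raw_tags": ["Pre-reform"]}
# The same split is applied recursively to every dict found inside list fields.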

4135def process_soft_redirect_template( 

4136 wxr: WiktextractContext, 

4137 template_node: TemplateNode, 

4138 redirect_pages: list[str], 

4139) -> bool: 

4140    # return `True` if the template is a soft redirect template

4141 if template_node.template_name == "zh-see": 

4142 # https://en.wiktionary.org/wiki/Template:zh-see 

4143 title = clean_node( 

4144 wxr, None, template_node.template_parameters.get(1, "") 

4145 ) 

4146 if title != "": 4146 ↛ 4148line 4146 didn't jump to line 4148 because the condition on line 4146 was always true

4147 redirect_pages.append(title) 

4148 return True 

4149 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4150 # https://en.wiktionary.org/wiki/Template:ja-see 

4151 for key, value in template_node.template_parameters.items(): 

4152 if isinstance(key, int): 4152 ↛ 4151line 4152 didn't jump to line 4151 because the condition on line 4152 was always true

4153 title = clean_node(wxr, None, value) 

4154 if title != "": 4154 ↛ 4151line 4154 didn't jump to line 4151 because the condition on line 4154 was always true

4155 redirect_pages.append(title) 

4156 return True 

4157 return False 

4158 

4159 
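# Example of the soft-redirect handling above: a page containing {{zh-see|你好}}
# causes "你好" to be appended to redirect_pages and the function returns True;
# process_children() later stores the collected titles under "redirects".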

4160ZH_FORMS_TAGS = { 

4161 "trad.": "Traditional-Chinese", 

4162 "simp.": "Simplified-Chinese", 

4163 "alternative forms": "alternative", 

4164 "2nd round simp.": "Second-Round-Simplified-Chinese", 

4165} 

4166 

4167 

4168def extract_zh_forms_template( 

4169 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData 

4170): 

4171 # https://en.wiktionary.org/wiki/Template:zh-forms 

4172 lit_meaning = clean_node( 

4173 wxr, None, t_node.template_parameters.get("lit", "") 

4174 ) 

4175 if lit_meaning != "": 

4176 base_data["literal_meaning"] = lit_meaning 

4177 expanded_node = wxr.wtp.parse( 

4178 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

4179 ) 

4180 for table in expanded_node.find_child(NodeKind.TABLE): 

4181 for row in table.find_child(NodeKind.TABLE_ROW): 

4182 row_header = "" 

4183 row_header_tags: list[str] = [] 

4184 header_has_span = False 

4185 for cell in row.find_child( 

4186 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL 

4187 ): 

4188 if cell.kind == NodeKind.TABLE_HEADER_CELL: 

4189 row_header, row_header_tags, header_has_span = ( 

4190 extract_zh_forms_header_cell(wxr, base_data, cell) 

4191 ) 

4192 elif not header_has_span: 

4193 extract_zh_forms_data_cell( 

4194 wxr, base_data, cell, row_header, row_header_tags 

4195 ) 

4196 

4197 if "forms" in base_data and len(base_data["forms"]) == 0: 4197 ↛ 4198line 4197 didn't jump to line 4198 because the condition on line 4197 was never true

4198 del base_data["forms"] 

4199 
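# A minimal sketch of the result of the zh-forms handling (hypothetical word):
#   base_data["literal_meaning"] = "..."   # from the lit= parameter
#   base_data["forms"] += [{"form": "...", "tags": ["Traditional-Chinese"]}]
# Rows labeled "anagram" are routed to base_data["anagrams"] instead (see the
# data-cell helper below).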

4200 

4201def extract_zh_forms_header_cell( 

4202 wxr: WiktextractContext, base_data: WordData, header_cell: WikiNode 

4203) -> tuple[str, list[str], bool]: 

4204 row_header = "" 

4205 row_header_tags = [] 

4206 header_has_span = False 

4207 first_span_index = len(header_cell.children) 

4208 for index, span_tag in header_cell.find_html("span", with_index=True): 

4209 if index < first_span_index: 4209 ↛ 4211line 4209 didn't jump to line 4211 because the condition on line 4209 was always true

4210 first_span_index = index 

4211 header_has_span = True 

4212 row_header = clean_node(wxr, None, header_cell.children[:first_span_index]) 

4213 for raw_tag in row_header.split(" and "): 

4214 raw_tag = raw_tag.strip() 

4215 if raw_tag != "": 

4216 row_header_tags.append(raw_tag) 

4217 for span_tag in header_cell.find_html_recursively("span"): 

4218 span_lang = span_tag.attrs.get("lang", "") 

4219 form_nodes = [] 

4220 sup_title = "" 

4221 for node in span_tag.children: 

4222 if isinstance(node, HTMLNode) and node.tag == "sup": 4222 ↛ 4223line 4222 didn't jump to line 4223 because the condition on line 4222 was never true

4223 for sup_span in node.find_html("span"): 

4224 sup_title = sup_span.attrs.get("title", "") 

4225 else: 

4226 form_nodes.append(node) 

4227 if span_lang in ["zh-Hant", "zh-Hans"]: 

4228 for word in clean_node(wxr, None, form_nodes).split("/"): 

4229 if word not in [wxr.wtp.title, ""]: 

4230 form = {"form": word} 

4231 for raw_tag in row_header_tags: 

4232 if raw_tag in ZH_FORMS_TAGS: 4232 ↛ 4235line 4232 didn't jump to line 4235 because the condition on line 4232 was always true

4233 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag]) 

4234 else: 

4235 data_append(form, "raw_tags", raw_tag) 

4236 if sup_title != "": 4236 ↛ 4237line 4236 didn't jump to line 4237 because the condition on line 4236 was never true

4237 data_append(form, "raw_tags", sup_title) 

4238 data_append(base_data, "forms", form) 

4239 return row_header, row_header_tags, header_has_span 

4240 

4241 

4242TagLiteral = Literal["tags", "raw_tags"] 

4243TAG_LITERALS_TUPLE: tuple[TagLiteral, ...] = ("tags", "raw_tags") 

4244 

4245 

4246def extract_zh_forms_data_cell( 

4247 wxr: WiktextractContext, 

4248 base_data: WordData, 

4249 cell: WikiNode, 

4250 row_header: str, 

4251 row_header_tags: list[str], 

4252) -> None: 

4253 from .zh_pron_tags import ZH_PRON_TAGS 

4254 

4255 forms: list[FormData] = [] 

4256 for top_span_tag in cell.find_html("span"): 

4257 span_style = top_span_tag.attrs.get("style", "") 

4258 span_lang = top_span_tag.attrs.get("lang", "") 

4259 if span_style == "white-space:nowrap;": 

4260 extract_zh_forms_data_cell( 

4261 wxr, base_data, top_span_tag, row_header, row_header_tags 

4262 ) 

4263 elif "font-size:80%" in span_style: 

4264 raw_tag = clean_node(wxr, None, top_span_tag) 

4265 if raw_tag != "": 4265 ↛ 4256line 4265 didn't jump to line 4256 because the condition on line 4265 was always true

4266 for form in forms: 

4267 if raw_tag in ZH_PRON_TAGS: 4267 ↛ 4273line 4267 didn't jump to line 4273 because the condition on line 4267 was always true

4268 tr_tag = ZH_PRON_TAGS[raw_tag] 

4269 if isinstance(tr_tag, list): 4269 ↛ 4270line 4269 didn't jump to line 4270 because the condition on line 4269 was never true

4270 data_extend(form, "tags", tr_tag) 

4271 elif isinstance(tr_tag, str): 4271 ↛ 4266line 4271 didn't jump to line 4266 because the condition on line 4271 was always true

4272 data_append(form, "tags", tr_tag) 

4273 elif raw_tag in valid_tags: 

4274 data_append(form, "tags", raw_tag) 

4275 else: 

4276 data_append(form, "raw_tags", raw_tag) 

4277 elif span_lang in ["zh-Hant", "zh-Hans", "zh"]: 4277 ↛ 4256line 4277 didn't jump to line 4256 because the condition on line 4277 was always true

4278 word = clean_node(wxr, None, top_span_tag) 

4279 if word not in ["", "/", wxr.wtp.title]: 

4280 form = {"form": word} 

4281 if row_header != "anagram": 4281 ↛ 4287line 4281 didn't jump to line 4287 because the condition on line 4281 was always true

4282 for raw_tag in row_header_tags: 

4283 if raw_tag in ZH_FORMS_TAGS: 4283 ↛ 4286line 4283 didn't jump to line 4286 because the condition on line 4283 was always true

4284 data_append(form, "tags", ZH_FORMS_TAGS[raw_tag]) 

4285 else: 

4286 data_append(form, "raw_tags", raw_tag) 

4287 if span_lang == "zh-Hant": 

4288 data_append(form, "tags", "Traditional-Chinese") 

4289 elif span_lang == "zh-Hans": 

4290 data_append(form, "tags", "Simplified-Chinese") 

4291 forms.append(form) 

4292 

4293 if row_header == "anagram": 4293 ↛ 4294line 4293 didn't jump to line 4294 because the condition on line 4293 was never true

4294 for form in forms: 

4295 l_data: LinkageData = {"word": form["form"]} 

4296 for key in TAG_LITERALS_TUPLE: 

4297 if key in form: 

4298 l_data[key] = form[key] 

4299 data_append(base_data, "anagrams", l_data) 

4300 else: 

4301 data_extend(base_data, "forms", forms) 

4302 

4303 

4304def extract_ja_kanjitab_template( 

4305 wxr: WiktextractContext, t_node: TemplateNode, base_data: WordData 

4306): 

4307 # https://en.wiktionary.org/wiki/Template:ja-kanjitab 

4308 expanded_node = wxr.wtp.parse( 

4309 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

4310 ) 

4311 for table in expanded_node.find_child(NodeKind.TABLE): 

4312 is_alt_form_table = False 

4313 for row in table.find_child(NodeKind.TABLE_ROW): 

4314 for header_node in row.find_child(NodeKind.TABLE_HEADER_CELL): 

4315 header_text = clean_node(wxr, None, header_node) 

4316 if header_text.startswith("Alternative spelling"): 

4317 is_alt_form_table = True 

4318 if not is_alt_form_table: 

4319 continue 

4320 forms = [] 

4321 for row in table.find_child(NodeKind.TABLE_ROW): 

4322 for cell_node in row.find_child(NodeKind.TABLE_CELL): 

4323 for child_node in cell_node.children: 

4324 if isinstance(child_node, HTMLNode): 

4325 if child_node.tag == "span": 

4326 word = clean_node(wxr, None, child_node) 

4327 if word != "": 4327 ↛ 4323line 4327 didn't jump to line 4323 because the condition on line 4327 was always true

4328 forms.append( 

4329 { 

4330 "form": word, 

4331 "tags": ["alternative", "kanji"], 

4332 } 

4333 ) 

4334 elif child_node.tag == "small": 

4335 raw_tag = clean_node(wxr, None, child_node).strip( 

4336 "()" 

4337 ) 

4338 if raw_tag != "" and len(forms) > 0: 4338 ↛ 4323line 4338 didn't jump to line 4323 because the condition on line 4338 was always true

4339 data_append( 

4340 forms[-1], 

4341 "tags" 

4342 if raw_tag in valid_tags 

4343 else "raw_tags", 

4344 raw_tag, 

4345 ) 

4346 data_extend(base_data, "forms", forms) 

4347 for link_node in expanded_node.find_child(NodeKind.LINK): 

4348 clean_node(wxr, base_data, link_node)
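# A minimal sketch of the ja-kanjitab result above: each word found in an
# "Alternative spelling" column becomes
#   {"form": "...", "tags": ["alternative", "kanji"]}
# and a parenthesized <small> label next to it is appended to "tags" when it is
# a recognized tag, otherwise to "raw_tags".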