Coverage for src/wiktextract/extractor/en/page.py: 44%

1905 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1# Code for parsing information from a single Wiktionary page. 

2# 

3# Copyright (c) 2018-2022 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import copy 

6import html 

7import re 

8import sys 

9from collections import defaultdict 

10from functools import partial 

11from typing import ( 

12 TYPE_CHECKING, 

13 Any, 

14 Iterable, 

15 Iterator, 

16 Optional, 

17 Set, 

18 Union, 

19 cast, 

20) 

21 

22from mediawiki_langcodes import get_all_names, name_to_code 

23from wikitextprocessor.core import TemplateArgs, TemplateFnCallable 

24from wikitextprocessor.parser import ( 

25 LEVEL_KIND_FLAGS, 

26 GeneralNode, 

27 NodeKind, 

28 TemplateNode, 

29 WikiNode, 

30) 

31 

32from ...clean import clean_template_args, clean_value 

33from ...datautils import ( 

34 data_append, 

35 data_extend, 

36 ns_title_prefix_tuple, 

37) 

38from ...page import ( 

39 LEVEL_KINDS, 

40 clean_node, 

41 is_panel_template, 

42 recursively_extract, 

43) 

44from ...tags import valid_tags 

45from ...wxr_context import WiktextractContext 

46from ...wxr_logging import logger 

47from ..ruby import extract_ruby, parse_ruby 

48from ..share import strip_nodes 

49from .example import extract_example_list_item, extract_template_zh_x 

50from .form_descriptions import ( 

51 classify_desc, 

52 decode_tags, 

53 distw, 

54 parse_alt_or_inflection_of, 

55 parse_sense_qualifier, 

56 parse_word_head, 

57) 

58from .inflection import TableContext, parse_inflection_section 

59from .info_templates import ( 

60 INFO_TEMPLATE_FUNCS, 

61 parse_info_template_arguments, 

62 parse_info_template_node, 

63) 

64from .linkages import parse_linkage_item_text 

65from .parts_of_speech import PARTS_OF_SPEECH 

66from .section_titles import ( 

67 COMPOUNDS_TITLE, 

68 DESCENDANTS_TITLE, 

69 ETYMOLOGY_TITLES, 

70 IGNORED_TITLES, 

71 INFLECTION_TITLES, 

72 LINKAGE_TITLES, 

73 POS_TITLES, 

74 PRONUNCIATION_TITLE, 

75 PROTO_ROOT_DERIVED_TITLES, 

76 TRANSLATIONS_TITLE, 

77) 

78from .translations import parse_translation_item_text 

79from .type_utils import ( 

80 DescendantData, 

81 ExampleData, 

82 FormData, 

83 LinkageData, 

84 SenseData, 

85 SoundData, 

86 TemplateData, 

87 WordData, 

88) 

89from .unsupported_titles import unsupported_title_map 

90 

91# When determining whether a string is 'english', classify_desc 

92# might return 'taxonomic' which is English text 99% of the time. 

93ENGLISH_TEXTS = ("english", "taxonomic") 

94 

95# Matches head tag 

96HEAD_TAG_RE = re.compile( 

97 r"^(head|Han char|arabic-noun|arabic-noun-form|" 

98 r"hangul-symbol|syllable-hangul)$|" 

99 + r"^(latin|" 

100 + "|".join(lang_code for lang_code, *_ in get_all_names("en")) 

101 + r")-(" 

102 + "|".join( 

103 [ 

104 "abbr", 

105 "adj", 

106 "adjective", 

107 "adjective form", 

108 "adjective-form", 

109 "adv", 

110 "adverb", 

111 "affix", 

112 "animal command", 

113 "art", 

114 "article", 

115 "aux", 

116 "bound pronoun", 

117 "bound-pronoun", 

118 "Buyla", 

119 "card num", 

120 "card-num", 

121 "cardinal", 

122 "chunom", 

123 "classifier", 

124 "clitic", 

125 "cls", 

126 "cmene", 

127 "cmavo", 

128 "colloq-verb", 

129 "colverbform", 

130 "combining form", 

131 "combining-form", 

132 "comparative", 

133 "con", 

134 "concord", 

135 "conj", 

136 "conjunction", 

137 "conjug", 

138 "cont", 

139 "contr", 

140 "converb", 

141 "daybox", 

142 "decl", 

143 "decl noun", 

144 "def", 

145 "dem", 

146 "det", 

147 "determ", 

148 "Deva", 

149 "ending", 

150 "entry", 

151 "form", 

152 "fuhivla", 

153 "gerund", 

154 "gismu", 

155 "hanja", 

156 "hantu", 

157 "hanzi", 

158 "head", 

159 "ideophone", 

160 "idiom", 

161 "inf", 

162 "indef", 

163 "infixed pronoun", 

164 "infixed-pronoun", 

165 "infl", 

166 "inflection", 

167 "initialism", 

168 "int", 

169 "interfix", 

170 "interj", 

171 "interjection", 

172 "jyut", 

173 "latin", 

174 "letter", 

175 "locative", 

176 "lujvo", 

177 "monthbox", 

178 "mutverb", 

179 "name", 

180 "nisba", 

181 "nom", 

182 "noun", 

183 "noun form", 

184 "noun-form", 

185 "noun plural", 

186 "noun-plural", 

187 "nounprefix", 

188 "num", 

189 "number", 

190 "numeral", 

191 "ord", 

192 "ordinal", 

193 "par", 

194 "part", 

195 "part form", 

196 "part-form", 

197 "participle", 

198 "particle", 

199 "past", 

200 "past neg", 

201 "past-neg", 

202 "past participle", 

203 "past-participle", 

204 "perfect participle", 

205 "perfect-participle", 

206 "personal pronoun", 

207 "personal-pronoun", 

208 "pref", 

209 "prefix", 

210 "phrase", 

211 "pinyin", 

212 "plural noun", 

213 "plural-noun", 

214 "pos", 

215 "poss-noun", 

216 "post", 

217 "postp", 

218 "postposition", 

219 "PP", 

220 "pp", 

221 "ppron", 

222 "pred", 

223 "predicative", 

224 "prep", 

225 "prep phrase", 

226 "prep-phrase", 

227 "preposition", 

228 "present participle", 

229 "present-participle", 

230 "pron", 

231 "prondem", 

232 "pronindef", 

233 "pronoun", 

234 "prop", 

235 "proper noun", 

236 "proper-noun", 

237 "proper noun form", 

238 "proper-noun form", 

239 "proper noun-form", 

240 "proper-noun-form", 

241 "prov", 

242 "proverb", 

243 "prpn", 

244 "prpr", 

245 "punctuation mark", 

246 "punctuation-mark", 

247 "regnoun", 

248 "rel", 

249 "rom", 

250 "romanji", 

251 "root", 

252 "sign", 

253 "suff", 

254 "suffix", 

255 "syllable", 

256 "symbol", 

257 "verb", 

258 "verb form", 

259 "verb-form", 

260 "verbal noun", 

261 "verbal-noun", 

262 "verbnec", 

263 "vform", 

264 ] 

265 ) 

266 + r")(-|/|\+|$)" 

267) 
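
# A short illustration (not exhaustive, and assuming "en" is among the codes

# yielded by get_all_names("en")): with .fullmatch(), names like "head",

# "Han char", "en-noun" or "en-verb-form" match HEAD_TAG_RE, while something

# like "en-nounish" does not, because the part-of-speech component must be

# followed by "-", "/", "+" or the end of the name.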

268 

269FLOATING_TABLE_TEMPLATES: set[str] = { 

270 # az-suffix-forms creates a style=floatright div that is otherwise 

271 # deleted; if it is not pre-expanded, we can intercept the template 

272 # so we add this set into do_not_pre_expand, and intercept the 

273 # templates in parse_part_of_speech 

274 "az-suffix-forms", 

275 "az-inf-p", 

276 "kk-suffix-forms", 

277 "ky-suffix-forms", 

278 "tr-inf-p", 

279 "tr-suffix-forms", 

280 "tt-suffix-forms", 

281 "uz-suffix-forms", 

282} 

283# These two should contain template names that should always be 

284# pre-expanded when *first* processing the tree, or not pre-expanded 

285# so that the templates are left in place with their identifying 

286# name intact for later filtering. 

287 

288DO_NOT_PRE_EXPAND_TEMPLATES: set[str] = set() 

289DO_NOT_PRE_EXPAND_TEMPLATES.update(FLOATING_TABLE_TEMPLATES) 

290 

291# Additional templates to be expanded in the pre-expand phase 

292ADDITIONAL_EXPAND_TEMPLATES: set[str] = { 

293 "multitrans", 

294 "multitrans-nowiki", 

295 "trans-top", 

296 "trans-top-also", 

297 "trans-bottom", 

298 "checktrans-top", 

299 "checktrans-bottom", 

300 "col1", 

301 "col2", 

302 "col3", 

303 "col4", 

304 "col5", 

305 "col1-u", 

306 "col2-u", 

307 "col3-u", 

308 "col4-u", 

309 "col5-u", 

310 "check deprecated lang param usage", 

311 "deprecated code", 

312 "ru-verb-alt-ё", 

313 "ru-noun-alt-ё", 

314 "ru-adj-alt-ё", 

315 "ru-proper noun-alt-ё", 

316 "ru-pos-alt-ё", 

317 "ru-alt-ё", 

318 "inflection of", 

319 "no deprecated lang param usage", 

320} 

321 

322# Inverse linkage for those that have them 

323linkage_inverses: dict[str, str] = { 

324 # XXX this is not currently used, move to post-processing 

325 "synonyms": "synonyms", 

326 "hypernyms": "hyponyms", 

327 "hyponyms": "hypernyms", 

328 "holonyms": "meronyms", 

329 "meronyms": "holonyms", 

330 "derived": "derived_from", 

331 "coordinate_terms": "coordinate_terms", 

332 "troponyms": "hypernyms", 

333 "antonyms": "antonyms", 

334 "instances": "instance_of", 

335 "related": "related", 

336} 

337 

338# Templates that are used to form panels on pages and that 

339# should be ignored in various positions 

340PANEL_TEMPLATES: set[str] = { 

341 "Character info", 

342 "CJKV", 

343 "French personal pronouns", 

344 "French possessive adjectives", 

345 "French possessive pronouns", 

346 "Han etym", 

347 "Japanese demonstratives", 

348 "Latn-script", 

349 "LDL", 

350 "MW1913Abbr", 

351 "Number-encoding", 

352 "Nuttall", 

353 "Spanish possessive adjectives", 

354 "Spanish possessive pronouns", 

355 "USRegionDisputed", 

356 "Webster 1913", 

357 "ase-rfr", 

358 "attention", 

359 "attn", 

360 "beer", 

361 "broken ref", 

362 "ca-compass", 

363 "character info", 

364 "character info/var", 

365 "checksense", 

366 "compass-fi", 

367 "copyvio suspected", 

368 "delete", 

369 "dial syn", # Currently ignore these, but could be useful in Chinese/Korean 

370 "etystub", 

371 "examples", 

372 "hu-corr", 

373 "hu-suff-pron", 

374 "interwiktionary", 

375 "ja-kanjitab", 

376 "ko-hanja-search", 

377 "look", 

378 "maintenance box", 

379 "maintenance line", 

380 "mediagenic terms", 

381 "merge", 

382 "missing template", 

383 "morse links", 

384 "move", 

385 "multiple images", 

386 "no inline", 

387 "picdic", 

388 "picdicimg", 

389 "picdiclabel", 

390 "polyominoes", 

391 "predidential nomics", 

392 "punctuation", # This actually gets pre-expanded 

393 "reconstructed", 

394 "request box", 

395 "rf-sound example", 

396 "rfaccents", 

397 "rfap", 

398 "rfaspect", 

399 "rfc", 

400 "rfc-auto", 

401 "rfc-header", 

402 "rfc-level", 

403 "rfc-pron-n", 

404 "rfc-sense", 

405 "rfclarify", 

406 "rfd", 

407 "rfd-redundant", 

408 "rfd-sense", 

409 "rfdate", 

410 "rfdatek", 

411 "rfdef", 

412 "rfe", 

413 "rfe/dowork", 

414 "rfex", 

415 "rfexp", 

416 "rfform", 

417 "rfgender", 

418 "rfi", 

419 "rfinfl", 

420 "rfm", 

421 "rfm-sense", 

422 "rfp", 

423 "rfp-old", 

424 "rfquote", 

425 "rfquote-sense", 

426 "rfquotek", 

427 "rfref", 

428 "rfscript", 

429 "rft2", 

430 "rftaxon", 

431 "rftone", 

432 "rftranslit", 

433 "rfv", 

434 "rfv-etym", 

435 "rfv-pron", 

436 "rfv-quote", 

437 "rfv-sense", 

438 "selfref", 

439 "split", 

440 "stroke order", # XXX consider capturing this? 

441 "stub entry", 

442 "t-needed", 

443 "tbot entry", 

444 "tea room", 

445 "tea room sense", 

446 # "ttbc", - XXX needed in at least on/Preposition/Translation page 

447 "unblock", 

448 "unsupportedpage", 

449 "video frames", 

450 "was wotd", 

451 "wrongtitle", 

452 "zh-forms", 

453 "zh-hanzi-box", 

454} 

455 

456# lookup table for the tags of Chinese dialectal synonyms 

457zh_tag_lookup: dict[str, list[str]] = { 

458 "Formal": ["formal"], 

459 "Written-Standard-Chinese": ["Standard-Chinese"], 

460 "historical or Internet slang": ["historical", "internet-slang"], 

461 "now usually derogatory or offensive": ["offensive", "derogatory"], 

462 "lofty": [], 

463} 

464 

465# Template name prefixes used for language-specific panel templates (i.e., 

466# templates that create side boxes or notice boxes or that should generally 

467# be ignored). 

468PANEL_PREFIXES: set[str] = { 

469 "list:compass points/", 

470 "list:Gregorian calendar months/", 

471 "RQ:", 

472} 

473 

474# Templates used for wikipedia links. 

475wikipedia_templates: set[str] = { 

476 "wikipedia", 

477 "slim-wikipedia", 

478 "w", 

479 "W", 

480 "swp", 

481 "wiki", 

482 "Wikipedia", 

483 "wtorw", 

484} 

485for x in PANEL_PREFIXES & wikipedia_templates:  # 485 ↛ 486: loop never started

486 print( 

487 "WARNING: {!r} in both panel_templates and wikipedia_templates".format( 

488 x 

489 ) 

490 ) 

491 

492# Mapping from a template name (without language prefix) for the main word 

493# (e.g., fi-noun, fi-adj, en-verb) to permitted parts-of-speech in which 

494# it could validly occur. This is used as just a sanity check to give 

495# warnings about probably incorrect coding in Wiktionary. 

496template_allowed_pos_map: dict[str, list[str]] = { 

497 "abbr": ["abbrev"], 

498 "noun": ["noun", "abbrev", "pron", "name", "num", "adj_noun"], 

499 "plural noun": ["noun", "name"], 

500 "plural-noun": ["noun", "name"], 

501 "proper noun": ["noun", "name"], 

502 "proper-noun": ["name", "noun"], 

503 "prop": ["name", "noun"], 

504 "verb": ["verb", "phrase"], 

505 "gerund": ["verb"], 

506 "particle": ["adv", "particle"], 

507 "adj": ["adj", "adj_noun"], 

508 "pron": ["pron", "noun"], 

509 "name": ["name", "noun"], 

510 "adv": ["adv", "intj", "conj", "particle"], 

511 "phrase": ["phrase", "prep_phrase"], 

512 "noun phrase": ["phrase"], 

513 "ordinal": ["num"], 

514 "number": ["num"], 

515 "pos": ["affix", "name", "num"], 

516 "suffix": ["suffix", "affix"], 

517 "character": ["character"], 

518 "letter": ["character"], 

519 "kanji": ["character"], 

520 "cont": ["abbrev"], 

521 "interj": ["intj"], 

522 "con": ["conj"], 

523 "part": ["particle"], 

524 "prep": ["prep", "postp"], 

525 "postp": ["postp"], 

526 "misspelling": ["noun", "adj", "verb", "adv"], 

527 "part-form": ["verb"], 

528} 

529for k, v in template_allowed_pos_map.items(): 

530 for x in v: 

531 if x not in PARTS_OF_SPEECH:  # 531 ↛ 532: condition never true

532 print( 

533 "BAD PART OF SPEECH {!r} IN template_allowed_pos_map: {}={}" 

534 "".format(x, k, v) 

535 ) 

536 assert False 

537 

538 

539# Templates ignored during etymology extraction, i.e., these will not be listed 

540# in the extracted etymology templates. 

541ignored_etymology_templates: list[str] = [ 

542 "...", 

543 "IPAchar", 

544 "ipachar", 

545 "ISBN", 

546 "isValidPageName", 

547 "redlink category", 

548 "deprecated code", 

549 "check deprecated lang param usage", 

550 "para", 

551 "p", 

552 "cite", 

553 "Cite news", 

554 "Cite newsgroup", 

555 "cite paper", 

556 "cite MLLM 1976", 

557 "cite journal", 

558 "cite news/documentation", 

559 "cite paper/documentation", 

560 "cite video game", 

561 "cite video game/documentation", 

562 "cite newsgroup", 

563 "cite newsgroup/documentation", 

564 "cite web/documentation", 

565 "cite news", 

566 "Cite book", 

567 "Cite-book", 

568 "cite book", 

569 "cite web", 

570 "cite-usenet", 

571 "cite-video/documentation", 

572 "Cite-journal", 

573 "rfe", 

574 "catlangname", 

575 "cln", 

576 "langname-lite", 

577 "no deprecated lang param usage", 

578 "mention", 

579 "m", 

580 "m-self", 

581 "link", 

582 "l", 

583 "ll", 

584 "l-self", 

585] 

586# Regexp for matching ignored etymology template names. This adds certain 

587# prefixes to the names listed above. 

588ignored_etymology_templates_re = re.compile( 

589 r"^((cite-|R:|RQ:).*|" 

590 + r"|".join(re.escape(x) for x in ignored_etymology_templates) 

591 + r")$" 

592) 
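
# For example, this matches every name listed above verbatim, plus anything

# starting with "cite-", "R:" or "RQ:" (such as "cite-book" or

# "R:Webster 1913"); a name like "quote-book" would not be ignored by it.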

593 

594# Regexp for matching ignored descendants template names. Right now we just 

595# copy the ignored etymology templates 

596ignored_descendants_templates_re = ignored_etymology_templates_re 

597 

598# Set of template names that are used to define usage examples. If the usage 

599# example contains one of these templates, then its type is set to 

600# "example" 

601usex_templates: set[str] = { 

602 "afex", 

603 "affixusex", 

604 "co", # {{collocation}} acts like an example template, specifically for 

605 # pairs of combinations of words that are more common than you'd 

606 # expect by chance; hlavní#Czech 

607 "coi", 

608 "collocation", 

609 "el-example", 

610 "el-x", 

611 "example", 

612 "examples", 

613 "he-usex", 

614 "he-x", 

615 "hi-usex", 

616 "hi-x", 

617 "ja-usex-inline", 

618 "ja-usex", 

619 "ja-x", 

620 "jbo-example", 

621 "jbo-x", 

622 "km-usex", 

623 "km-x", 

624 "ko-usex", 

625 "ko-x", 

626 "lo-usex", 

627 "lo-x", 

628 "ne-x", 

629 "ne-usex", 

630 "prefixusex", 

631 "ryu-usex", 

632 "ryu-x", 

633 "shn-usex", 

634 "shn-x", 

635 "suffixusex", 

636 "th-usex", 

637 "th-x", 

638 "ur-usex", 

639 "ur-x", 

640 "usex", 

641 "usex-suffix", 

642 "ux", 

643 "uxi", 

644} 

645 

646stop_head_at_these_templates: set[str] = { 

647 "category", 

648 "cat", 

649 "topics", 

650 "catlangname", 

651 "c", 

652 "C", 

653 "top", 

654 "cln", 

655} 

656 

657# Set of template names that are used to define quotation examples. If the 

658# usage example contains one of these templates, then its type is set to 

659# "quotation". 

660quotation_templates: set[str] = { 

661 "collapse-quote", 

662 "quote-av", 

663 "quote-book", 

664 "quote-GYLD", 

665 "quote-hansard", 

666 "quotei", 

667 "quote-journal", 

668 "quotelite", 

669 "quote-mailing list", 

670 "quote-meta", 

671 "quote-newsgroup", 

672 "quote-song", 

673 "quote-text", 

674 "quote", 

675 "quote-us-patent", 

676 "quote-video game", 

677 "quote-web", 

678 "quote-wikipedia", 

679 "wikiquote", 

680 "Wikiquote", 

681} 

682 

683taxonomy_templates = { 

684 # argument 1 should be the taxonomic name, frex. "Lupus lupus" 

685 "taxfmt", 

686 "taxlink", 

687 "taxlink2", 

688 "taxlinknew", 

689 "taxlook", 

690} 

691 

692# Template name component to linkage section listing. Integer section means 

693# default section, starting at that argument. 

694# XXX not used anymore, except for the first elements: moved to 

695# template_linkages 

696# template_linkage_mappings: list[list[Union[str, int]]] = [ 

697# ["syn", "synonyms"], 

698# ["synonyms", "synonyms"], 

699# ["ant", "antonyms"], 

700# ["antonyms", "antonyms"], 

701# ["hyp", "hyponyms"], 

702# ["hyponyms", "hyponyms"], 

703# ["der", "derived"], 

704# ["derived terms", "derived"], 

705# ["coordinate terms", "coordinate_terms"], 

706# ["rel", "related"], 

707# ["col", 2], 

708# ] 

709 

710# Template names; this was extracted from template_linkage_mappings, 

711# because the code using template_linkage_mappings was actually not used 

712# (but not removed). 

713template_linkages: set[str] = { 

714 "syn", 

715 "synonyms", 

716 "ant", 

717 "antonyms", 

718 "hyp", 

719 "hyponyms", 

720 "der", 

721 "derived terms", 

722 "coordinate terms", 

723 "rel", 

724 "col", 

725} 

726 

727# Maps template name used in a word sense to a linkage field that it adds. 

728sense_linkage_templates: dict[str, str] = { 

729 "syn": "synonyms", 

730 "synonyms": "synonyms", 

731 "hyp": "hyponyms", 

732 "hyponyms": "hyponyms", 

733 "ant": "antonyms", 

734 "antonyms": "antonyms", 

735} 

736 

737 

738def decode_html_entities(v: Union[str, int]) -> str: 

739 """Decodes HTML entities from a value, converting them to the respective 

740 Unicode characters/strings.""" 

741 if isinstance(v, int): 

742 # I changed this to return str(v) instead of v = str(v), 

743 # but there might have been the intention to have more logic 

744 # here. html.unescape would not do anything special with an integer, 

745 # it needs html escape symbols (&xx;). 

746 return str(v) 

747 return html.unescape(v) 
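
# Illustrative behaviour: decode_html_entities("R&amp;D") -> "R&D", while an

# integer argument is simply stringified, e.g. decode_html_entities(5) -> "5".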

748 

749 

750def parse_sense_linkage( 

751 wxr: WiktextractContext, 

752 data: SenseData, 

753 name: str, 

754 ht: TemplateArgs, 

755) -> None: 

756 """Parses a linkage (synonym, etc) specified in a word sense.""" 
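
    # Rough illustration (hypothetical call): for {{syn|en|hound|q1=archaic}}

    # this receives name="syn" and ht={1: "en", 2: "hound", "q1": "archaic"},

    # and appends {"word": "hound", "tags": ["archaic"]} to data["synonyms"],

    # assuming classify_desc/decode_tags recognize "archaic" as a tag.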

757 assert isinstance(wxr, WiktextractContext) 

758 assert isinstance(data, dict) 

759 assert isinstance(name, str) 

760 assert isinstance(ht, dict) 

761 field = sense_linkage_templates[name] 

762 for i in range(2, 20): 

763 w = ht.get(i) or "" 

764 w = clean_node(wxr, data, w) 

765 for alias in ns_title_prefix_tuple(wxr, "Thesaurus"): 

766 if w.startswith(alias): 

767 w = w[len(alias) :] 

768 break 

769 if not w: 

770 break 

771 tags: list[str] = [] 

772 topics: list[str] = [] 

773 english: Optional[str] = None 

774 # Try to find qualifiers for this synonym 

775 q = ht.get("q{}".format(i - 1)) 

776 if q: 

777 cls = classify_desc(q) 

778 if cls == "tags": 

779 tagsets1, topics1 = decode_tags(q) 

780 for ts in tagsets1: 

781 tags.extend(ts) 

782 topics.extend(topics1) 

783 elif cls == "english": 

784 if english: 

785 english += "; " + q 

786 else: 

787 english = q 

788 # Try to find English translation for this synonym 

789 t = ht.get("t{}".format(i - 1)) 

790 if t: 

791 if english: 

792 english += "; " + t 

793 else: 

794 english = t 

795 

796 # See if the linkage contains a parenthesized alt 

797 alt = None 

798 m = re.search(r"\(([^)]+)\)$", w) 

799 if m: 

800 w = w[: m.start()].strip() 

801 alt = m.group(1) 

802 

803 dt = {"word": w} 

804 if tags: 

805 data_extend(dt, "tags", tags) 

806 if topics: 

807 data_extend(dt, "topics", topics) 

808 if english: 

809 dt["english"] = english 

810 if alt: 

811 dt["alt"] = alt 

812 data_append(data, field, dt) 

813 

814 

815EXAMPLE_SPLITTERS = r"\s*[―—]+\s*" 

816example_splitter_re = re.compile(EXAMPLE_SPLITTERS) 

817captured_splitters_re = re.compile(r"(" + EXAMPLE_SPLITTERS + r")") 
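
# Quick illustration: example_splitter_re.split("koira ― dog") gives

# ["koira", "dog"], while captured_splitters_re.split("koira ― dog") gives

# ["koira", " ― ", "dog"], i.e. the separators are kept as list items.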

818 

819 

820def synch_splits_with_args( 

821 line: str, targs: TemplateArgs 

822) -> Optional[list[str]]: 

823 """If it looks like there's something weird with how a line of example 

824 text has been split, this function will do the splitting after counting 

825 occurrences of the splitting regex inside the two main template arguments 

826 containing the string data for the original language example and the 

827 English translations. 

828 """ 

829 # Previously, we split without capturing groups, but here we want to 

830 # keep the original splitting hyphen regex intact. 

831 fparts = captured_splitters_re.split(line) 

832 new_parts = [] 

833 # ["First", " – ", "second", " – ", "third..."] from OL argument 

834 first = 1 + (2 * len(example_splitter_re.findall(targs.get(2, "")))) 

835 new_parts.append("".join(fparts[:first])) 

836 # Translation argument 

837 tr_arg = targs.get(3) or targs.get("translation") or targs.get("t", "") 

838 # +2 = + 1 to skip the "expected" hyphen, + 1 as the `1 +` above. 

839 second = first + 2 + (2 * len(example_splitter_re.findall(tr_arg))) 

840 new_parts.append("".join(fparts[first + 1 : second])) 

841 

842 if all(new_parts): # no empty strings from the above spaghetti 

843 new_parts.extend(fparts[second + 1 :: 2]) # skip rest of hyphens 

844 return new_parts 

845 else: 

846 return None 
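
# A rough worked example of the function above (hypothetical values, chosen

# only to show the re-joining): for line = "a ― b ― a translation" with

# targs = {2: "a ― b", 3: "a translation"}, the original-language argument

# contains one splitter, so the first two fragments are rejoined and the

# result is ["a ― b", "a translation"].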

847 

848 

849QUALIFIERS = r"^\((([^()]|\([^()]*\))*)\):?\s*" 

850QUALIFIERS_RE = re.compile(QUALIFIERS) 

851# (...): ... or (...(...)...): ... 
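
# For instance, QUALIFIERS_RE.match("(transitive, slang): to bolt") captures

# "transitive, slang" in group 1 and also consumes the trailing ": ", so the

# qualifier prefix can be stripped from the remaining gloss text.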

852 

853 

854def parse_language( 

855 wxr: WiktextractContext, langnode: WikiNode, language: str, lang_code: str 

856) -> list[WordData]: 

857 """Iterates over the text of the page, returning words (parts-of-speech) 

858 defined on the page one at a time. (Individual word senses for the 

859 same part-of-speech are typically encoded in the same entry.)""" 

860 # imported here to avoid circular import 

861 from .pronunciation import parse_pronunciation 

862 

863 assert isinstance(wxr, WiktextractContext) 

864 assert isinstance(langnode, WikiNode) 

865 assert isinstance(language, str) 

866 assert isinstance(lang_code, str) 

867 # print("parse_language", language) 

868 

869 is_reconstruction = False 

870 word: str = wxr.wtp.title # type: ignore[assignment] 

871 unsupported_prefix = "Unsupported titles/" 

872 if word.startswith(unsupported_prefix): 

873 w = word[len(unsupported_prefix) :] 

874 if w in unsupported_title_map:  # 874 ↛ 877: condition always true

875 word = unsupported_title_map[w] 

876 else: 

877 wxr.wtp.error( 

878 "Unimplemented unsupported title: {}".format(word), 

879 sortid="page/870", 

880 ) 

881 word = w 

882 elif word.startswith("Reconstruction:"):  # 882 ↛ 883: condition never true

883 word = word[word.find("/") + 1 :] 

884 is_reconstruction = True 

885 

886 base_data: WordData = { 

887 "word": word, 

888 "lang": language, 

889 "lang_code": lang_code, 

890 } 

891 if is_reconstruction:  # 891 ↛ 892: condition never true

892 data_append(base_data, "tags", "reconstruction") 

893 sense_data: SenseData = {} 

894 pos_data: WordData = {} # For a current part-of-speech 

895 level_four_data: WordData = {} # Chinese Pronunciation-sections in-between 

896 etym_data: WordData = {} # For one etymology 

897 pos_datas: list[SenseData] = [] 

898 level_four_datas: list[WordData] = [] 

899 etym_datas: list[WordData] = [] 

900 page_datas: list[WordData] = [] 

901 have_etym = False 

902 inside_level_four = False # This is for checking if the etymology section 

903 # or article has a Pronunciation section, for Chinese mostly; because 

904 # Chinese articles can have three level three sections (two etymology 

905 # sections and pronunciation sections) one after another, we need a kludge 

906 # to better keep track of whether we're in a normal "etym" or inside a 

907 # "level four" (which is what we've turned the level three Pron sections 

908 # into in the fix_subtitle_hierarchy(); all other sections are demoted by 

909 # a step.) 

910 stack: list[str] = [] # names of items on the "stack" 

911 

912 def merge_base(data: WordData, base: WordData) -> None: 

913 for k, v in base.items(): 

914 # Copy the value to ensure that we don't share lists or 

915 # dicts between structures (even nested ones). 

916 v = copy.deepcopy(v) 

917 if k not in data: 

918 # The list was copied above, so this will not create shared ref 

919 data[k] = v # type: ignore[literal-required] 

920 continue 

921 if data[k] == v: # type: ignore[literal-required]  # 921 ↛ 923: condition always true

922 continue 

923 if ( 

924 isinstance(data[k], (list, tuple)) # type: ignore[literal-required] 

925 or isinstance( 

926 v, 

927 (list, tuple), # Should this be "and"? 

928 ) 

929 ): 

930 data[k] = list(data[k]) + list(v) # type: ignore 

931 elif data[k] != v: # type: ignore[literal-required] 

932 wxr.wtp.warning( 

933 "conflicting values for {} in merge_base: " 

934 "{!r} vs {!r}".format(k, data[k], v), # type: ignore[literal-required] 

935 sortid="page/904", 

936 ) 

937 

938 def complementary_pop(pron: SoundData, key: str) -> SoundData: 

939 """Remove unnecessary keys from dict values 

940 in a list comprehension...""" 

941 if key in pron: 

942 pron.pop(key) # type: ignore 

943 return pron 

944 

945 # If the result has sounds, eliminate sounds that have a prefix that 

946 # does not match "word" or one of "forms" 

947 if "sounds" in data and "word" in data:  # 947 ↛ 948: condition never true

948 accepted = [data["word"]] 

949 accepted.extend(f["form"] for f in data.get("forms", dict())) 

950 data["sounds"] = list( 

951 s 

952 for s in data["sounds"] 

953 if "form" not in s or s["form"] in accepted 

954 ) 

955 # If the result has sounds, eliminate sounds that have a pos that 

956 # does not match "pos" 

957 if "sounds" in data and "pos" in data:  # 957 ↛ 958: condition never true

958 data["sounds"] = list( 

959 complementary_pop(s, "pos") 

960 for s in data["sounds"] 

961 # "pos" is not a field of SoundData, correctly, so we're 

962 # removing it here. It's a kludge on a kludge on a kludge. 

963 if "pos" not in s or s["pos"] == data["pos"] # type: ignore[typeddict-item] 

964 ) 
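
    # Small illustration of merge_base: with data = {"tags": ["x"]} and

    # base = {"tags": ["y"]}, the call leaves data["tags"] == ["x", "y"];

    # conflicting non-list values are only reported via wxr.wtp.warning.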

965 

966 def push_sense() -> bool: 

967 """Starts collecting data for a new word sense. This returns True 

968 if a sense was added.""" 

969 nonlocal sense_data 

970 tags = sense_data.get("tags", ()) 

971 if ( 

972 not sense_data.get("glosses") 

973 and "translation-hub" not in tags 

974 and "no-gloss" not in tags 

975 ): 

976 return False 

977 

978 if (  # 978 ↛ 988: never jumped to line 988

979 ( 

980 "participle" in sense_data.get("tags", ()) 

981 or "infinitive" in sense_data.get("tags", ()) 

982 ) 

983 and "alt_of" not in sense_data 

984 and "form_of" not in sense_data 

985 and "etymology_text" in etym_data 

986 and etym_data["etymology_text"] != "" 

987 ): 

988 etym = etym_data["etymology_text"] 

989 etym = etym.split(". ")[0] 

990 ret = parse_alt_or_inflection_of(wxr, etym, set()) 

991 if ret is not None: 

992 tags, lst = ret 

993 assert isinstance(lst, (list, tuple)) 

994 if "form-of" in tags: 

995 data_extend(sense_data, "form_of", lst) 

996 data_extend(sense_data, "tags", tags) 

997 elif "alt-of" in tags: 

998 data_extend(sense_data, "alt_of", lst) 

999 data_extend(sense_data, "tags", tags) 

1000 

1001 if not sense_data.get("glosses") and "no-gloss" not in sense_data.get(  # 1001 ↛ 1004: condition never true

1002 "tags", () 

1003 ): 

1004 data_append(sense_data, "tags", "no-gloss") 

1005 

1006 pos_datas.append(sense_data) 

1007 sense_data = {} 

1008 return True 

1009 

1010 def push_pos() -> None: 

1011 """Starts collecting data for a new part-of-speech.""" 

1012 nonlocal pos_data 

1013 nonlocal pos_datas 

1014 push_sense() 

1015 if wxr.wtp.subsection: 

1016 data: WordData = {"senses": pos_datas} 

1017 merge_base(data, pos_data) 

1018 level_four_datas.append(data) 

1019 pos_data = {} 

1020 pos_datas = [] 

1021 wxr.wtp.start_subsection(None) 

1022 

1023 def push_level_four_section() -> None: 

1024 """Starts collecting data for a new level-four section, which 

1025 is usually virtual and empty, unless the article has Chinese 

1026 'Pronunciation' sections that are etymology-section-like but 

1027 under etymology, and at the same level in the source. We modify 

1028 the source to demote Pronunciation sections like that to level 

1029 4, and other sections one step lower.""" 

1030 nonlocal level_four_data 

1031 nonlocal level_four_datas 

1032 nonlocal etym_datas 

1033 push_pos() 

1034 # print(f"======\n{etym_data=}") 

1035 # print(f"======\n{etym_datas=}") 

1036 # print(f"======\n{level_four_data=}") 

1037 # print(f"======\n{level_four_datas=}") 

1038 for data in level_four_datas: 

1039 merge_base(data, level_four_data) 

1040 etym_datas.append(data) 

1041 for data in etym_datas: 

1042 merge_base(data, etym_data) 

1043 page_datas.append(data) 

1044 level_four_data = {} 

1045 level_four_datas = [] 

1046 etym_datas = [] 

1047 

1048 def push_etym() -> None: 

1049 """Starts collecting data for a new etymology.""" 

1050 nonlocal etym_data 

1051 nonlocal etym_datas 

1052 nonlocal have_etym 

1053 nonlocal inside_level_four 

1054 have_etym = True 

1055 push_level_four_section() 

1056 inside_level_four = False 

1057 etym_data = {} 

1058 

1059 def select_data() -> WordData: 

1060 """Selects where to store data (pos or etym) based on whether we 

1061 are inside a pos (part-of-speech).""" 

1062 # print(f"{wxr.wtp.subsection=}") 

1063 # print(f"{stack=}") 

1064 if wxr.wtp.subsection is not None:  # 1064 ↛ 1066: condition always true

1065 return pos_data 

1066 if stack[-1] == language: 

1067 return base_data 

1068 if inside_level_four is False: 

1069 return etym_data 

1070 return level_four_data 

1071 

1072 def head_post_template_fn( 

1073 name: str, ht: TemplateArgs, expansion: str 

1074 ) -> Optional[str]: 

1075 """Handles special templates in the head section of a word. Head 

1076 section is the text after part-of-speech subtitle and before word 

1077 sense list. Typically it generates the bold line for the word, but 

1078 may also contain other useful information that often ends up in 

1079 side boxes. We want to capture some of that additional information.""" 

1080 # print("HEAD_POST_TEMPLATE_FN", name, ht) 

1081 if is_panel_template(wxr, name):  # 1081 ↛ 1084: condition never true

1082 # Completely ignore these templates (not even recorded in 

1083 # head_templates) 

1084 return "" 

1085 if name == "head": 

1086 # XXX are these also captured in forms? Should this special case 

1087 # be removed? 

1088 t = ht.get(2, "") 

1089 if t == "pinyin":  # 1089 ↛ 1090: condition never true

1090 data_append(pos_data, "tags", "Pinyin") 

1091 elif t == "romanization":  # 1091 ↛ 1092: condition never true

1092 data_append(pos_data, "tags", "romanization") 

1093 if HEAD_TAG_RE.fullmatch(name) is not None:  # 1093 ↛ 1102: condition always true

1094 args_ht = clean_template_args(wxr, ht) 

1095 cleaned_expansion = clean_node(wxr, None, expansion) 

1096 dt = {"name": name, "args": args_ht, "expansion": cleaned_expansion} 

1097 data_append(pos_data, "head_templates", dt) 

1098 

1099 # The following are both captured in head_templates and parsed 

1100 # separately 

1101 

1102 if name in wikipedia_templates:  # 1102 ↛ 1105: condition never true

1103 # Note: various places expect to have content from wikipedia 

1104 # templates, so cannot convert this to empty 

1105 parse_wikipedia_template(wxr, pos_data, ht) 

1106 return None 

1107 

1108 if name == "number box":  # 1108 ↛ 1110: condition never true

1109 # XXX extract numeric value? 

1110 return "" 

1111 if name == "enum":  # 1111 ↛ 1113: condition never true

1112 # XXX extract? 

1113 return "" 

1114 if name == "cardinalbox":  # 1114 ↛ 1117: condition never true

1115 # XXX extract similar to enum? 

1116 # XXX this can also occur in top-level under language 

1117 return "" 

1118 if name == "Han simplified forms":  # 1118 ↛ 1120: condition never true

1119 # XXX extract? 

1120 return "" 

1121 # if name == "ja-kanji forms": 

1122 # # XXX extract? 

1123 # return "" 

1124 # if name == "vi-readings": 

1125 # # XXX extract? 

1126 # return "" 

1127 # if name == "ja-kanji": 

1128 # # XXX extract? 

1129 # return "" 

1130 if name == "picdic" or name == "picdicimg" or name == "picdiclabel":  # 1130 ↛ 1132: condition never true

1131 # XXX extract? 

1132 return "" 

1133 

1134 return None 

1135 

1136 def parse_part_of_speech(posnode: WikiNode, pos: str) -> None: 

1137 """Parses the subsection for a part-of-speech under a language on 

1138 a page.""" 

1139 assert isinstance(posnode, WikiNode) 

1140 assert isinstance(pos, str) 

1141 # print("parse_part_of_speech", pos) 

1142 pos_data["pos"] = pos 

1143 pre: list[list[Union[str, WikiNode]]] = [[]] # list of lists 

1144 lists: list[list[WikiNode]] = [[]] # list of lists 

1145 first_para = True 

1146 first_head_tmplt = True 

1147 collecting_head = True 

1148 start_of_paragraph = True 

1149 

1150 # XXX extract templates from posnode with recursively_extract 

1151 # that break stuff, like ja-kanji or az-suffix-form. 

1152 # Do the extraction with a list of template names, combined from 

1153 # different lists, then separate out them into different lists 

1154 # that are handled at different points of the POS section. 

1155 # First, extract az-suffix-form, put it in `inflection`, 

1156 # and parse `inflection`'s content when appropriate later. 

1157 # The contents of az-suffix-form (and ja-kanji) that generate 

1158 # divs with "floatright" in their style get deleted by 

1159 # clean_value, so templates that slip through from here won't 

1160 # break anything. 

1161 # XXX bookmark 

1162 # print(posnode.children) 

1163 

1164 floaters, poschildren = recursively_extract( 

1165 posnode.children, 

1166 lambda x: ( 

1167 isinstance(x, WikiNode) 

1168 and x.kind == NodeKind.TEMPLATE 

1169 and x.largs[0][0] in FLOATING_TABLE_TEMPLATES 

1170 ), 

1171 ) 

1172 tempnode = WikiNode(NodeKind.LEVEL6, 0) 

1173 tempnode.largs = [["Inflection"]] 

1174 tempnode.children = floaters 

1175 parse_inflection(tempnode, "Floating Div", pos) 

1176 # print(poschildren) 

1177 # XXX new above 

1178 

1179 if not poschildren:  # 1179 ↛ 1180: condition never true

1180 if not floaters: 

1181 wxr.wtp.debug( 

1182 "PoS section without contents", 

1183 sortid="en/page/1051/20230612", 

1184 ) 

1185 else: 

1186 wxr.wtp.debug( 

1187 "PoS section without contents except for a floating table", 

1188 sortid="en/page/1056/20230612", 

1189 ) 

1190 return 

1191 

1192 for node in poschildren: 

1193 if isinstance(node, str): 

1194 for m in re.finditer(r"\n+|[^\n]+", node): 

1195 p = m.group(0) 

1196 if p.startswith("\n\n") and pre: 

1197 first_para = False 

1198 start_of_paragraph = True 

1199 break 

1200 if p and collecting_head: 

1201 pre[-1].append(p) 

1202 continue 

1203 assert isinstance(node, WikiNode) 

1204 kind = node.kind 

1205 if kind == NodeKind.LIST: 

1206 lists[-1].append(node) 

1207 collecting_head = False 

1208 start_of_paragraph = True 

1209 continue 

1210 elif kind in LEVEL_KINDS: 

1211 # Stop parsing section if encountering any kind of 

1212 # level header (like ===Noun=== or ====Further Reading====). 

1213 # At a quick glance, this should be the default behavior, 

1214 # but if some kinds of source articles have sub-sub-sections 

1215 # that should be parsed XXX it should be handled by changing 

1216 # this break. 

1217 break 

1218 elif collecting_head and kind == NodeKind.LINK:  # 1218 ↛ 1221: condition never true

1219 # We might collect relevant links as they are often pictures 

1220 # relating to the word 

1221 if len(node.largs[0]) >= 1 and isinstance( 

1222 node.largs[0][0], str 

1223 ): 

1224 if node.largs[0][0].startswith( 

1225 ns_title_prefix_tuple(wxr, "Category") 

1226 ): 

1227 # [[Category:...]] 

1228 # We're at the end of the file, probably, so stop 

1229 # here. Otherwise the head will get garbage. 

1230 break 

1231 if node.largs[0][0].startswith( 

1232 ns_title_prefix_tuple(wxr, "File") 

1233 ): 

1234 # Skips file links 

1235 continue 

1236 start_of_paragraph = False 

1237 pre[-1].extend(node.largs[-1]) 

1238 elif kind == NodeKind.HTML: 

1239 if node.sarg == "br":  # 1239 ↛ 1245: condition always true

1240 if pre[-1]:  # 1240 ↛ 1192: condition always true

1241 pre.append([]) # Switch to next head 

1242 lists.append([]) # Lists parallels pre 

1243 collecting_head = True 

1244 start_of_paragraph = True 

1245 elif collecting_head and node.sarg not in ( 

1246 "gallery", 

1247 "ref", 

1248 "cite", 

1249 "caption", 

1250 ): 

1251 start_of_paragraph = False 

1252 pre[-1].append(node) 

1253 else: 

1254 start_of_paragraph = False 

1255 elif isinstance(node, TemplateNode): 

1256 # XXX Insert code here that disambiguates between 

1257 # templates that generate word heads and templates 

1258 # that don't. 

1259 # There's head_tag_re that seems like a regex meant 

1260 # to identify head templates. Too bad it's None. 

1261 

1262 # ignore {{category}}, {{cat}}... etc. 

1263 if node.template_name in stop_head_at_these_templates: 

1264 # we've reached a template that should be at the end, 

1265 continue 

1266 

1267 # skip these templates; panel_templates is already used 

1268 # to skip certain templates else, but it also applies to 

1269 # head parsing quite well. 

1270 # node.largs[0][0] should always be str, but can't type-check 

1271 # that. 

1272 if is_panel_template(wxr, node.template_name):  # 1272 ↛ 1273: condition never true

1273 continue 

1274 # skip these templates 

1275 # if node.largs[0][0] in skip_these_templates_in_head: 

1276 # first_head_tmplt = False # no first_head_tmplt at all 

1277 # start_of_paragraph = False 

1278 # continue 

1279 

1280 if first_head_tmplt and pre[-1]: 

1281 first_head_tmplt = False 

1282 start_of_paragraph = False 

1283 pre[-1].append(node) 

1284 elif pre[-1] and start_of_paragraph: 

1285 pre.append([]) # Switch to the next head 

1286 lists.append([]) # lists parallel pre 

1287 collecting_head = True 

1288 start_of_paragraph = False 

1289 pre[-1].append(node) 

1290 else: 

1291 pre[-1].append(node) 

1292 elif first_para: 

1293 start_of_paragraph = False 

1294 if collecting_head:  # 1294 ↛ 1192: condition always true

1295 pre[-1].append(node) 

1296 # XXX use template_fn in clean_node to check that the head macro 

1297 # is compatible with the current part-of-speech and generate warning 

1298 # if not. Use template_allowed_pos_map. 

1299 

1300 # Clean up empty pairs, and fix messes with extra newlines that 

1301 # separate templates that are followed by lists (wiktextract issue #314) 

1302 

1303 cleaned_pre: list[list[Union[str, WikiNode]]] = [] 

1304 cleaned_lists: list[list[WikiNode]] = [] 

1305 pairless_pre_index = None 

1306 

1307 for pre1, ls in zip(pre, lists): 

1308 if pre1 and not ls: 

1309 pairless_pre_index = len(cleaned_pre) 

1310 if not pre1 and not ls:  # 1310 ↛ 1312: condition never true

1311 # skip [] + [] 

1312 continue 

1313 if not ls and all( 

1314 (isinstance(x, str) and not x.strip()) for x in pre1 

1315 ): 

1316 # skip ["\n", " "] + [] 

1317 continue 

1318 if ls and not pre1: 

1319 if pairless_pre_index is not None:  # 1319 ↛ 1320: condition never true

1320 cleaned_lists[pairless_pre_index] = ls 

1321 pairless_pre_index = None 

1322 continue 

1323 cleaned_pre.append(pre1) 

1324 cleaned_lists.append(ls) 

1325 

1326 pre = cleaned_pre 

1327 lists = cleaned_lists 

1328 

1329 there_are_many_heads = len(pre) > 1 

1330 header_tags: list[str] = [] 

1331 

1332 if not any(g for g in lists): 

1333 process_gloss_without_list(poschildren, pos, pos_data, header_tags) 

1334 else: 

1335 for i, (pre1, ls) in enumerate(zip(pre, lists)): 

1336 # if len(ls) == 0: 

1337 # # don't have gloss list 

1338 # # XXX add code here to filter out 'garbage', like text 

1339 # # that isn't a head template or head. 

1340 # continue 

1341 

1342 if all(not sl for sl in lists[i:]):  # 1342 ↛ 1343: condition never true

1343 if i == 0: 

1344 if isinstance(node, str): 

1345 wxr.wtp.debug( 

1346 "first head without list of senses," 

1347 "string: '{}[...]', {}/{}".format( 

1348 node[:20], word, language 

1349 ), 

1350 sortid="page/1689/20221215", 

1351 ) 

1352 if isinstance(node, WikiNode): 

1353 if node.largs and node.largs[0][0] in [ 

1354 "Han char", 

1355 ]: 

1356 # just ignore these templates 

1357 pass 

1358 else: 

1359 wxr.wtp.debug( 

1360 "first head without " 

1361 "list of senses, " 

1362 "template node " 

1363 "{}, {}/{}".format( 

1364 node.largs, word, language 

1365 ), 

1366 sortid="page/1694/20221215", 

1367 ) 

1368 else: 

1369 wxr.wtp.debug( 

1370 "first head without list of senses, " 

1371 "{}/{}".format(word, language), 

1372 sortid="page/1700/20221215", 

1373 ) 

1374 # no break here so that the first head always 

1375 # gets processed. 

1376 else: 

1377 if isinstance(node, str): 

1378 wxr.wtp.debug( 

1379 "later head without list of senses," 

1380 "string: '{}[...]', {}/{}".format( 

1381 node[:20], word, language 

1382 ), 

1383 sortid="page/1708/20221215", 

1384 ) 

1385 if isinstance(node, WikiNode): 

1386 wxr.wtp.debug( 

1387 "later head without list of senses," 

1388 "template node " 

1389 "{}, {}/{}".format( 

1390 node.sarg if node.sarg else node.largs, 

1391 word, 

1392 language, 

1393 ), 

1394 sortid="page/1713/20221215", 

1395 ) 

1396 else: 

1397 wxr.wtp.debug( 

1398 "later head without list of senses, " 

1399 "{}/{}".format(word, language), 

1400 sortid="page/1719/20221215", 

1401 ) 

1402 break 

1403 head_group = i + 1 if there_are_many_heads else None 

1404 # print("parse_part_of_speech: {}: {}: pre={}" 

1405 # .format(wxr.wtp.section, wxr.wtp.subsection, pre1)) 

1406 process_gloss_header( 

1407 pre1, pos, head_group, pos_data, header_tags 

1408 ) 

1409 for ln in ls: 

1410 # Parse each list associated with this head. 

1411 for node in ln.children: 

1412 # Parse nodes in l.children recursively. 

1413 # The recursion function uses push_sense() to 

1414 # add stuff into pos_data, and returns True or 

1415 # False if something is added, which bubbles upward. 

1416 # If the bubble is "True", then higher levels of 

1417 # the recursion will not push_sense(), because 

1418 # the data is already pushed into a sub-gloss 

1419 # downstream, unless the higher level has examples 

1420 # that need to be put somewhere. 

1421 common_data: SenseData = {"tags": list(header_tags)} 

1422 if head_group: 

1423 common_data["head_nr"] = head_group 

1424 parse_sense_node(node, common_data, pos) # type: ignore[arg-type] 

1425 

1426 # If there are no senses extracted, add a dummy sense. We want to 

1427 # keep tags extracted from the head for the dummy sense. 

1428 push_sense() # Make sure unfinished data pushed, and start clean sense 

1429 if len(pos_datas) == 0: 

1430 data_extend(sense_data, "tags", header_tags) 

1431 data_append(sense_data, "tags", "no-gloss") 

1432 push_sense() 

1433 

1434 def process_gloss_header( 

1435 header_nodes: list[Union[WikiNode, str]], 

1436 pos_type: str, 

1437 header_group: Optional[int], 

1438 pos_data: WordData, 

1439 header_tags: list[str], 

1440 ) -> None: 

1441 ruby = [] 

1442 links: list[str] = [] 

1443 

1444 # process template parse nodes here 

1445 new_nodes = [] 

1446 info_template_data = [] 

1447 for node in header_nodes: 

1448 info_data, info_out = parse_info_template_node(wxr, node, "head") 

1449 if info_data or info_out: 

1450 if info_data:  # 1450 ↛ 1452: condition always true

1451 info_template_data.append(info_data) 

1452 if info_out: # including just the original node  # 1452 ↛ 1453: condition never true

1453 new_nodes.append(info_out) 

1454 else: 

1455 new_nodes.append(node) 

1456 header_nodes = new_nodes 

1457 

1458 if info_template_data: 

1459 if "info_templates" not in pos_data:  # 1459 ↛ 1462: condition always true

1460 pos_data["info_templates"] = info_template_data 

1461 else: 

1462 pos_data["info_templates"].extend(info_template_data) 

1463 

1464 if not word.isalnum(): 

1465 # if the word contains non-letter or -number characters, it might 

1466 # have something that messes with split-at-semi-comma; we collect 

1467 # links so that we can skip splitting them. 

1468 exp = wxr.wtp.parse( 

1469 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1470 ) 

1471 link_nodes, _ = recursively_extract( 

1472 exp.children, 

1473 lambda x: isinstance(x, WikiNode) and x.kind == NodeKind.LINK, 

1474 ) 

1475 for ln in link_nodes: 

1476 ltext = clean_node(wxr, None, ln.largs[-1]) # type: ignore[union-attr] 

1477 if not ltext.isalnum(): 

1478 links.append(ltext) 

1479 if word not in links:  # 1479 ↛ 1481: condition always true

1480 links.append(word) 

1481 if lang_code == "ja": 

1482 exp = wxr.wtp.parse( 

1483 wxr.wtp.node_to_wikitext(header_nodes), expand_all=True 

1484 ) 

1485 rub, _ = recursively_extract( 

1486 exp.children, 

1487 lambda x: isinstance(x, WikiNode) 

1488 and x.kind == NodeKind.HTML 

1489 and x.sarg == "ruby", 

1490 ) 

1491 if rub is not None:  # 1491 ↛ 1500: condition always true

1492 for r in rub:  # 1492 ↛ 1493: loop never started

1493 if TYPE_CHECKING: 

1494 # we know the lambda above in recursively_extract 

1495 # returns only WikiNodes in rub 

1496 assert isinstance(r, WikiNode) 

1497 rt = parse_ruby(wxr, r) 

1498 if rt is not None: 

1499 ruby.append(rt) 

1500 header_text = clean_node( 

1501 wxr, pos_data, header_nodes, post_template_fn=head_post_template_fn 

1502 ) 

1503 header_text = re.sub(r"\s+", " ", header_text) 

1504 # print(f"{header_text=}") 

1505 parse_word_head( 

1506 wxr, 

1507 pos_type, 

1508 header_text, 

1509 pos_data, 

1510 is_reconstruction, 

1511 header_group, 

1512 ruby=ruby, 

1513 links=links, 

1514 ) 

1515 if "tags" in pos_data: 

1516 # pos_data can get "tags" data from some source; type-checkers 

1517 # don't like it, so let's ignore it. 

1518 header_tags.extend(pos_data["tags"]) # type: ignore[typeddict-item] 

1519 del pos_data["tags"] # type: ignore[typeddict-item] 

1520 else: 

1521 header_tags.clear() 

1522 

1523 def process_gloss_without_list( 

1524 nodes: list[Union[WikiNode, str]], 

1525 pos_type: str, 

1526 pos_data: WordData, 

1527 header_tags: list[str], 

1528 ) -> None: 

1529 # gloss text might not be inside a list 

1530 header_nodes: list[Union[str, WikiNode]] = [] 

1531 gloss_nodes: list[Union[str, WikiNode]] = [] 

1532 for node in strip_nodes(nodes): 

1533 if isinstance(node, WikiNode): 

1534 if isinstance(node, TemplateNode): 

1535 if node.template_name in ( 

1536 "zh-see", 

1537 "ja-see", 

1538 "ja-see-kango", 

1539 ): 

1540 continue # soft redirect 

1541 elif (  # 1541 ↛ 1549: never jumped to line 1549

1542 node.template_name == "head" 

1543 or node.template_name.startswith(f"{lang_code}-") 

1544 ): 

1545 header_nodes.append(node) 

1546 continue 

1547 elif node.kind in LEVEL_KINDS: # following nodes are not gloss 

1548 break 

1549 gloss_nodes.append(node) 

1550 

1551 if len(header_nodes) > 0: 

1552 process_gloss_header( 

1553 header_nodes, pos_type, None, pos_data, header_tags 

1554 ) 

1555 if len(gloss_nodes) > 0: 

1556 process_gloss_contents( 

1557 gloss_nodes, pos_type, {"tags": list(header_tags)} 

1558 ) 

1559 

1560 def parse_sense_node( 

1561 node: Union[str, WikiNode], # never receives str 

1562 sense_base: SenseData, 

1563 pos: str, 

1564 ) -> bool: 

1565 """Recursively (depth first) parse LIST_ITEM nodes for sense data. 

1566 Uses push_sense() to attempt adding data to pos_data in the scope 

1567 of parse_language() when it reaches deep in the recursion. push_sense() 

1568 returns True if it succeeds, and that is bubbled up the stack; if 

1569 a sense was added downstream, the higher levels (whose shared data 

1570 was already added by a subsense) do not push_sense(), unless it 

1571 has examples that need to be put somewhere. 

1572 """ 

1573 assert isinstance(sense_base, dict) # Added to every sense deeper in 

1574 if not isinstance(node, WikiNode):  # 1574 ↛ 1576: condition never true

1575 # This doesn't seem to ever happen in practice. 

1576 wxr.wtp.debug( 

1577 "{}: parse_sense_node called with" 

1578 "something that isn't a WikiNode".format(pos), 

1579 sortid="page/1287/20230119", 

1580 ) 

1581 return False 

1582 

1583 if node.kind != NodeKind.LIST_ITEM:  # 1583 ↛ 1584: condition never true

1584 wxr.wtp.debug( 

1585 "{}: non-list-item inside list".format(pos), sortid="page/1678" 

1586 ) 

1587 return False 

1588 

1589 if node.sarg == ":":  # 1589 ↛ 1595: condition never true

1590 # Skip example entries at the highest level, ones without 

1591 # a sense ("...#") above them. 

1592 # If node.sarg is exactly and only ":", then it's at 

1593 # the highest level; lower levels would have more 

1594 # "indentation", like "#:" or "##:" 

1595 return False 

1596 

1597 # If a recursion call succeeds in push_sense(), bubble it up with 

1598 # `added`. 

1599 # added |= push_sense() or added |= parse_sense_node(...) to OR. 

1600 added = False 

1601 

1602 gloss_template_args: set[str] = set() 

1603 

1604 # For LISTs and LIST_ITEMS, their argument is something like 

1605 # "##" or "##:", and using that we can rudimentally determine 

1606 # "##" or "##:", and using that we can rudimentarily determine 

1607 # entry it is; # is for normal glosses, : for examples (indent) 

1608 # and * is used for quotations on wiktionary. 

1609 current_depth = node.sarg 

1610 

1611 children = node.children 

1612 

1613 # subentries, (presumably) a list 

1614 # of subglosses below this. The list's 

1615 # argument ends with #, and its depth should 

1616 # be bigger than parent node. 

1617 subentries = [ 

1618 x 

1619 for x in children 

1620 if isinstance(x, WikiNode) 

1621 and x.kind == NodeKind.LIST 

1622 and x.sarg == current_depth + "#" 

1623 ] 

1624 

1625 # sublists of examples and quotations. .sarg 

1626 # does not end with "#". 

1627 others = [ 

1628 x 

1629 for x in children 

1630 if isinstance(x, WikiNode) 

1631 and x.kind == NodeKind.LIST 

1632 and x.sarg != current_depth + "#" 

1633 ] 

1634 

1635 # the actual contents of this particular node. 

1636 # can be a gloss (or a template that expands into 

1637 # many glosses which we can't easily pre-expand) 

1638 # or could be an "outer gloss" with more specific 

1639 # subglosses, or could be a qualifier for the subglosses. 

1640 contents = [ 

1641 x 

1642 for x in children 

1643 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1644 ] 

1645 # If this entry has sublists of entries, we should combine 

1646 # gloss information from both the "outer" and sublist content. 

1647 # Sometimes the outer gloss 

1648 # is more of a non-gloss note or tags; sometimes it is a coarse sense 

1649 # and the inner glosses are more specific. The outer one 

1650 # does not seem to have qualifiers. 

1651 

1652 # If we have one sublist with one element, treat it 

1653 # specially as it may be a Wiktionary error; raise 

1654 # that nested element to the same level. 

1655 # XXX If need be, this block can be easily removed in 

1656 # the current recursive logic, and the result is one sense entry 

1657 # with both glosses in the glosses list, as you would 

1658 # expect. If the higher entry has examples, there will 

1659 # be a higher entry with some duplicated data. 

1660 if len(subentries) == 1: 

1661 slc = subentries[0].children 

1662 if len(slc) == 1:  # 1662 ↛ 1665: condition never true

1663 # copy current node and modify it so it doesn't 

1664 # loop infinitely. 

1665 cropped_node = copy.copy(node) 

1666 cropped_node.children = [ 

1667 x 

1668 for x in children 

1669 if not ( 

1670 isinstance(x, WikiNode) 

1671 and x.kind == NodeKind.LIST 

1672 and x.sarg == current_depth + "#" 

1673 ) 

1674 ] 

1675 added |= parse_sense_node(cropped_node, sense_base, pos) 

1676 nonlocal sense_data # omitting this kludge causes duplicated raw_ 

1677 # glosses data; 

1678 # if the top-level (cropped_node) 

1679 # does not push_sense() properly or 

1680 # parse_sense_node() returns early, 

1681 # sense_data is not reset. This happens 

1682 # for example when you have a no-gloss 

1683 # string like "(intransitive)": 

1684 # no gloss, push_sense() returns early 

1685 # and sense_data has duplicate data with 

1686 # sense_base 

1687 sense_data = {} 

1688 added |= parse_sense_node(slc[0], sense_base, pos) 

1689 return added 

1690 

1691 return process_gloss_contents( 

1692 contents, 

1693 pos, 

1694 sense_base, 

1695 subentries, 

1696 others, 

1697 gloss_template_args, 

1698 added, 

1699 ) 

1700 

1701 def process_gloss_contents( 

1702 contents: list[Union[str, WikiNode]], 

1703 pos: str, 

1704 sense_base: SenseData, 

1705 subentries: list[WikiNode] = [], 

1706 others: list[WikiNode] = [], 

1707 gloss_template_args: Set[str] = set(), 

1708 added: bool = False, 

1709 ) -> bool: 

1710 def sense_template_fn( 

1711 name: str, ht: TemplateArgs, is_gloss: bool = False 

1712 ) -> Optional[str]: 

1713 # print(f"sense_template_fn: {name}, {ht}") 

1714 if name in wikipedia_templates:  # 1714 ↛ 1716: condition never true

1715 # parse_wikipedia_template(wxr, pos_data, ht) 

1716 return None 

1717 if is_panel_template(wxr, name): 1717 ↛ 1718line 1717 didn't jump to line 1718 because the condition on line 1717 was never true

1718 return "" 

1719 if name in INFO_TEMPLATE_FUNCS: 

1720 info_data, info_exp = parse_info_template_arguments( 

1721 wxr, name, ht, "sense" 

1722 ) 

1723 if info_data or info_exp: 1723 ↛ 1729line 1723 didn't jump to line 1729 because the condition on line 1723 was always true

1724 if info_data: 1724 ↛ 1726line 1724 didn't jump to line 1726 because the condition on line 1724 was always true

1725 data_append(sense_base, "info_templates", info_data) 

1726 if info_exp and isinstance(info_exp, str): 1726 ↛ 1728line 1726 didn't jump to line 1728 because the condition on line 1726 was always true

1727 return info_exp 

1728 return "" 

1729 if name in ("defdate",): 1729 ↛ 1730line 1729 didn't jump to line 1730 because the condition on line 1729 was never true

1730 return "" 

1731 if name == "senseid": 1731 ↛ 1732line 1731 didn't jump to line 1732 because the condition on line 1731 was never true

1732 langid = clean_node(wxr, None, ht.get(1, ())) 

1733 arg = clean_node(wxr, sense_base, ht.get(2, ())) 

1734 if re.match(r"Q\d+$", arg): 

1735 data_append(sense_base, "wikidata", arg) 

1736 data_append(sense_base, "senseid", langid + ":" + arg) 

1737 if name in sense_linkage_templates: 1737 ↛ 1739line 1737 didn't jump to line 1739 because the condition on line 1737 was never true

1738 # print(f"SENSE_TEMPLATE_FN: {name}") 

1739 parse_sense_linkage(wxr, sense_base, name, ht) 

1740 return "" 

1741 if name == "†" or name == "zh-obsolete": 1741 ↛ 1742line 1741 didn't jump to line 1742 because the condition on line 1741 was never true

1742 data_append(sense_base, "tags", "obsolete") 

1743 return "" 

1744 if name in { 

1745 "ux", 

1746 "uxi", 

1747 "usex", 

1748 "afex", 

1749 "prefixusex", 

1750 "ko-usex", 

1751 "ko-x", 

1752 "hi-x", 

1753 "ja-usex-inline", 

1754 "ja-x", 

1755 "quotei", 

1756 "he-x", 

1757 "hi-x", 

1758 "km-x", 

1759 "ne-x", 

1760 "shn-x", 

1761 "th-x", 

1762 "ur-x", 

1763 }: 

1764 # Usage examples are captured separately below. We don't 

1765 # want to expand them into glosses even when unusual coding 

1766 # is used in the entry. 

1767 # These templates may slip through inside another item, but 

1768 # currently we're separating out example entries (..#:) 

1769 # well enough that there seems to be very little contamination. 

1770 if is_gloss: 1770 ↛ 1776line 1770 didn't jump to line 1776 because the condition on line 1770 was always true

1771 wxr.wtp.warning( 

1772 "Example template is used for gloss text", 

1773 sortid="extractor.en.page.sense_template_fn/1415", 

1774 ) 

1775 else: 

1776 return "" 

1777 if name == "w": 1777 ↛ 1778line 1777 didn't jump to line 1778 because the condition on line 1777 was never true

1778 if ht.get(2) == "Wp": 

1779 return "" 

1780 for k, v in ht.items(): 

1781 v = v.strip() 

1782 if v and "<" not in v: 1782 ↛ 1780line 1782 didn't jump to line 1780 because the condition on line 1782 was always true

1783 gloss_template_args.add(v) 

1784 return None 

1785 
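# Editor's note: a minimal standalone sketch (not part of page.py) of the
# template_fn hook convention used by sense_template_fn() above. Returning
# "" suppresses a template, returning a string substitutes that text, and
# returning None lets the template expand in the default way.
from typing import Optional

def example_template_fn(name: str, ht: dict) -> Optional[str]:
    if name in ("ux", "uxi", "usex"):  # usage-example templates: keep out of glosses
        return ""
    if name == "defdate":              # dropped, as in sense_template_fn above
        return ""
    return None                        # anything else: expand normally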

1786 def extract_link_texts(item: GeneralNode) -> None: 

1787 """Recursively extracts link texts from the gloss source. This 

1788 information is used to select whether to remove final "." from 

1789 form_of/alt_of (e.g., ihm/Hunsrik).""" 

1790 if isinstance(item, (list, tuple)): 

1791 for x in item: 

1792 extract_link_texts(x) 

1793 return 

1794 if isinstance(item, str): 

1795 # There seem to be HTML sections that may further contain 

1796 # unparsed links. 

1797 for m in re.finditer(r"\[\[([^]]*)\]\]", item): 1797 ↛ 1798line 1797 didn't jump to line 1798 because the loop on line 1797 never started

1798 print("ITER:", m.group(0)) 

1799 v = m.group(1).split("|")[-1].strip() 

1800 if v: 

1801 gloss_template_args.add(v) 

1802 return 

1803 if not isinstance(item, WikiNode): 1803 ↛ 1804line 1803 didn't jump to line 1804 because the condition on line 1803 was never true

1804 return 

1805 if item.kind == NodeKind.LINK: 

1806 v = item.largs[-1] 

1807 if ( 1807 ↛ 1813line 1807 didn't jump to line 1813

1808 isinstance(v, list) 

1809 and len(v) == 1 

1810 and isinstance(v[0], str) 

1811 ): 

1812 gloss_template_args.add(v[0].strip()) 

1813 for x in item.children: 

1814 extract_link_texts(x) 

1815 

1816 extract_link_texts(contents) 

1817 
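# Editor's note: a standalone sketch (not part of page.py) of the
# [[link|text]] extraction performed by extract_link_texts() above,
# assuming plain wikitext strings rather than a parsed node tree.
import re

def link_texts(text: str) -> list[str]:
    out: list[str] = []
    for m in re.finditer(r"\[\[([^]]*)\]\]", text):
        v = m.group(1).split("|")[-1].strip()  # keep the displayed text
        if v:
            out.append(v)
    return out

# link_texts("see [[cat|cats]] and [[dog]]")  ->  ["cats", "dog"]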

1818 # get the raw text of non-list contents of this node, and other stuff 

1819 # like tag and category data added to sense_base 

1820 # cast = no-op type-setter for the type-checker 

1821 partial_template_fn = cast( 

1822 TemplateFnCallable, 

1823 partial(sense_template_fn, is_gloss=True), 

1824 ) 

1825 rawgloss = clean_node( 

1826 wxr, 

1827 sense_base, 

1828 contents, 

1829 template_fn=partial_template_fn, 

1830 collect_links=True, 

1831 ) 

1832 

1833 if not rawgloss: 

1834 return False 

1835 

1836 # remove manually typed ordered list text at the start ("1. ") 

1837 rawgloss = re.sub(r"^\d+\.\s+", "", rawgloss).strip() 

1838 

1839 # get stuff like synonyms and categories from "others", 

1840 # maybe examples and quotations 

1841 clean_node(wxr, sense_base, others, template_fn=sense_template_fn) 

1842 

1843 # The gloss could contain templates that produce more list items. 

1844 # This happens commonly with, e.g., {{inflection of|...}}. Split 

1845 # to parts. However, e.g. Interlingua generates multiple glosses 

1846 # in HTML directly without Wikitext markup, so we must also split 

1847 # by just newlines. 

1848 subglosses = rawgloss.splitlines() 

1849 

1850 if len(subglosses) == 0: 1850 ↛ 1851line 1850 didn't jump to line 1851 because the condition on line 1850 was never true

1851 return False 

1852 

1853 if any(s.startswith("#") for s in subglosses): 

1854 subtree = wxr.wtp.parse(rawgloss) 

1855 # from wikitextprocessor.parser import print_tree 

1856 # print("SUBTREE GENERATED BY TEMPLATE:") 

1857 # print_tree(subtree) 

1858 new_subentries = [ 

1859 x 

1860 for x in subtree.children 

1861 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

1862 ] 

1863 

1864 new_others = [ 

1865 x 

1866 for x in subtree.children 

1867 if isinstance(x, WikiNode) 

1868 and x.kind == NodeKind.LIST 

1869 and not x.sarg.endswith("#") 

1870 ] 

1871 

1872 new_contents = [ 

1873 clean_node(wxr, [], x) 

1874 for x in subtree.children 

1875 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

1876 ] 

1877 

1878 subentries = subentries or new_subentries 

1879 others = others or new_others 

1880 subglosses = new_contents 

1881 rawgloss = "".join(subglosses) 

1882 # Generate no gloss for translation hub pages, but add the 

1883 # "translation-hub" tag for them 

1884 if rawgloss == "(This entry is a translation hub.)": 1884 ↛ 1885line 1884 didn't jump to line 1885 because the condition on line 1884 was never true

1885 data_append(sense_data, "tags", "translation-hub") 

1886 return push_sense() 

1887 

1888 # Remove certain substrings specific to outer glosses 

1889 strip_ends = [", particularly:"] 

1890 for x in strip_ends: 

1891 if rawgloss.endswith(x): 1891 ↛ 1892line 1891 didn't jump to line 1892 because the condition on line 1891 was never true

1892 rawgloss = rawgloss[: -len(x)].strip() 

1893 break 

1894 

1895 # A single gloss, or possibly an outer gloss. 

1896 # Check if the possible outer gloss starts with 

1897 # parenthesized tags/topics 

1898 

1899 if rawgloss and rawgloss not in sense_base.get("raw_glosses", ()): 1899 ↛ 1901line 1899 didn't jump to line 1901 because the condition on line 1899 was always true

1900 data_append(sense_base, "raw_glosses", subglosses[0].strip()) 

1901 m = QUALIFIERS_RE.match(rawgloss) 

1902 # (...): ... or (...(...)...): ... 

1903 if m: 

1904 q = m.group(1) 

1905 rawgloss = rawgloss[m.end() :].strip() 

1906 parse_sense_qualifier(wxr, q, sense_base) 

1907 if rawgloss == "A pejorative:": 1907 ↛ 1908line 1907 didn't jump to line 1908 because the condition on line 1907 was never true

1908 data_append(sense_base, "tags", "pejorative") 

1909 rawgloss = "" 

1910 elif rawgloss == "Short forms.": 1910 ↛ 1911line 1910 didn't jump to line 1911 because the condition on line 1910 was never true

1911 data_append(sense_base, "tags", "abbreviation") 

1912 rawgloss = "" 

1913 elif rawgloss == "Technical or specialized senses.": 1913 ↛ 1914line 1913 didn't jump to line 1914 because the condition on line 1913 was never true

1914 rawgloss = "" 

1915 elif rawgloss.startswith("inflection of "): 

1916 parsed = parse_alt_or_inflection_of(wxr, rawgloss, set()) 

1917 if parsed is not None: 1917 ↛ 1926line 1917 didn't jump to line 1926 because the condition on line 1917 was always true

1918 tags, origins = parsed 

1919 if origins is not None: 1919 ↛ 1921line 1919 didn't jump to line 1921 because the condition on line 1919 was always true

1920 data_extend(sense_base, "form_of", origins) 

1921 if tags is not None: 1921 ↛ 1924line 1921 didn't jump to line 1924 because the condition on line 1921 was always true

1922 data_extend(sense_base, "tags", tags) 

1923 else: 

1924 data_append(sense_base, "tags", "form-of") 

1925 else: 

1926 data_append(sense_base, "tags", "form-of") 

1927 if rawgloss: 1927 ↛ 1958line 1927 didn't jump to line 1958 because the condition on line 1927 was always true

1928 # Code duplicating a lot of clean-up operations from later in 

1929 # this block. We want to clean up the "supergloss" as much as 

1930 # possible, in almost the same way as a normal gloss. 

1931 supergloss = rawgloss 

1932 

1933 if supergloss.startswith("; "): 1933 ↛ 1934line 1933 didn't jump to line 1934 because the condition on line 1933 was never true

1934 supergloss = supergloss[1:].strip() 

1935 

1936 if supergloss.startswith(("^†", "†")): 

1937 data_append(sense_base, "tags", "obsolete") 

1938 supergloss = supergloss[2:].strip() 

1939 elif supergloss.startswith("^‡"): 1939 ↛ 1940line 1939 didn't jump to line 1940 because the condition on line 1939 was never true

1940 data_extend(sense_base, "tags", ["obsolete", "historical"]) 

1941 supergloss = supergloss[2:].strip() 

1942 

1943 # remove [14th century...] style brackets at the end 

1944 supergloss = re.sub(r"\s\[[^]]*\]\s*$", "", supergloss) 

1945 

1946 if supergloss.startswith((",", ":")): 1946 ↛ 1947line 1946 didn't jump to line 1947 because the condition on line 1946 was never true

1947 supergloss = supergloss[1:] 

1948 supergloss = supergloss.strip() 

1949 if supergloss.startswith("N. of "): 1949 ↛ 1950line 1949 didn't jump to line 1950 because the condition on line 1949 was never true

1950 supergloss = "Name of " + supergloss[6:] 

1951 supergloss = supergloss[2:] 

1952 data_append(sense_base, "glosses", supergloss) 

1953 if supergloss in ("A person:",): 1953 ↛ 1954line 1953 didn't jump to line 1954 because the condition on line 1953 was never true

1954 data_append(sense_base, "tags", "g-person") 

1955 
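# Editor's note: an illustrative sketch (not part of page.py) of the
# "(qualifiers): gloss" split applied via QUALIFIERS_RE above. QUALIFIERS_RE
# itself is defined earlier in this module; the simplified pattern below is
# only a stand-in with a similar effect.
import re

_SIMPLE_QUALIFIERS_RE = re.compile(r"^\(([^()]+)\):?\s*")

def split_qualifier(gloss: str) -> tuple[str, str]:
    m = _SIMPLE_QUALIFIERS_RE.match(gloss)
    if m:
        return m.group(1), gloss[m.end():].strip()
    return "", gloss

# split_qualifier("(transitive, informal) to grab quickly")
#   -> ("transitive, informal", "to grab quickly")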

1956 # The main recursive call (except for the exceptions at the 

1957 # start of this function). 

1958 for sublist in subentries: 

1959 if not ( 1959 ↛ 1962line 1959 didn't jump to line 1962 because the condition on line 1959 was never true

1960 isinstance(sublist, WikiNode) and sublist.kind == NodeKind.LIST 

1961 ): 

1962 wxr.wtp.debug( 

1963 f"'{repr(rawgloss[:20])}.' gloss has `subentries`" 

1964 f" with items that are not LISTs", 

1965 sortid="page/1511/20230119", 

1966 ) 

1967 continue 

1968 for item in sublist.children: 

1969 if not ( 1969 ↛ 1973line 1969 didn't jump to line 1973 because the condition on line 1969 was never true

1970 isinstance(item, WikiNode) 

1971 and item.kind == NodeKind.LIST_ITEM 

1972 ): 

1973 continue 

1974 # copy sense_base to prevent cross-contamination between 

1975 # a subgloss, its sibling subglosses, and the supergloss 

1976 sense_base2 = copy.deepcopy(sense_base) 

1977 if parse_sense_node(item, sense_base2, pos): 1977 ↛ 1968line 1977 didn't jump to line 1968 because the condition on line 1977 was always true

1978 added = True 

1979 

1980 # Capture examples. 

1981 # This is called after the recursive calls above so that 

1982 # sense_base is not contaminated with meta-data from 

1983 # example entries for *this* gloss. 

1984 examples = [] 

1985 if wxr.config.capture_examples: 1985 ↛ 1989line 1985 didn't jump to line 1989 because the condition on line 1985 was always true

1986 examples = extract_examples(others, sense_base) 

1987 

1988 # push_sense() succeeded somewhere down-river, so skip this level 

1989 if added: 

1990 if examples: 

1991 # this higher-up gloss has examples that we do not want to skip 

1992 wxr.wtp.debug( 

1993 "'{}[...]' gloss has examples we want to keep, " 

1994 "but there are subglosses.".format(repr(rawgloss[:30])), 

1995 sortid="page/1498/20230118", 

1996 ) 

1997 else: 

1998 return True 

1999 

2000 # Some entries, e.g., "iacebam", have weird sentences in quotes 

2001 # after the gloss, but these sentences don't seem to be intended 

2002 # as glosses. Skip them. 

2003 indexed_subglosses = list( 

2004 (i, gl) 

2005 for i, gl in enumerate(subglosses) 

2006 if gl.strip() and not re.match(r'\s*(\([^)]*\)\s*)?"[^"]*"\s*$', gl) 

2007 ) 

2008 

2009 if len(indexed_subglosses) > 1 and "form_of" not in sense_base: 2009 ↛ 2010line 2009 didn't jump to line 2010 because the condition on line 2009 was never true

2010 gl = indexed_subglosses[0][1].strip() 

2011 if gl.endswith(":"): 

2012 gl = gl[:-1].strip() 

2013 parsed = parse_alt_or_inflection_of(wxr, gl, gloss_template_args) 

2014 if parsed is not None: 

2015 infl_tags, infl_dts = parsed 

2016 if infl_dts and "form-of" in infl_tags and len(infl_tags) == 1: 

2017 # Interpret others as a particular form under 

2018 # "inflection of" 

2019 data_extend(sense_base, "tags", infl_tags) 

2020 data_extend(sense_base, "form_of", infl_dts) 

2021 indexed_subglosses = indexed_subglosses[1:] 

2022 elif not infl_dts: 

2023 data_extend(sense_base, "tags", infl_tags) 

2024 indexed_subglosses = indexed_subglosses[1:] 

2025 

2026 # Create senses for remaining subglosses 

2027 for i, (gloss_i, gloss) in enumerate(indexed_subglosses): 

2028 gloss = gloss.strip() 

2029 if not gloss and len(indexed_subglosses) > 1: 2029 ↛ 2030line 2029 didn't jump to line 2030 because the condition on line 2029 was never true

2030 continue 

2031 # Push a new sense (if the last one is not empty) 

2032 if push_sense(): 2032 ↛ 2033line 2032 didn't jump to line 2033 because the condition on line 2032 was never true

2033 added = True 

2034 # if gloss not in sense_data.get("raw_glosses", ()): 

2035 # data_append(sense_data, "raw_glosses", gloss) 

2036 if i == 0 and examples: 

2037 # In a multi-line gloss, associate examples 

2038 # with only one of them. 

2039 # XXX or you could use gloss_i == len(indexed_subglosses) 

2040 # to associate examples with the *last* one. 

2041 data_extend(sense_data, "examples", examples) 

2042 if gloss.startswith("; ") and gloss_i > 0: 2042 ↛ 2043line 2042 didn't jump to line 2043 because the condition on line 2042 was never true

2043 gloss = gloss[1:].strip() 

2044 # If the gloss starts with †, mark as obsolete 

2045 if gloss.startswith("^†"): 2045 ↛ 2046line 2045 didn't jump to line 2046 because the condition on line 2045 was never true

2046 data_append(sense_data, "tags", "obsolete") 

2047 gloss = gloss[2:].strip() 

2048 elif gloss.startswith("^‡"): 2048 ↛ 2049line 2048 didn't jump to line 2049 because the condition on line 2048 was never true

2049 data_extend(sense_data, "tags", ["obsolete", "historical"]) 

2050 gloss = gloss[2:].strip() 

2051 # Copy data for all senses to this sense 

2052 for k, v in sense_base.items(): 

2053 if isinstance(v, (list, tuple)): 

2054 if k != "tags": 

2055 # Tags handled below (countable/uncountable special) 

2056 data_extend(sense_data, k, v) 

2057 else: 

2058 assert k not in ("tags", "categories", "topics") 

2059 sense_data[k] = v # type:ignore[literal-required] 

2060 # Parse the gloss for this particular sense 

2061 m = QUALIFIERS_RE.match(gloss) 

2062 # (...): ... or (...(...)...): ... 

2063 if m: 

2064 parse_sense_qualifier(wxr, m.group(1), sense_data) 

2065 gloss = gloss[m.end() :].strip() 

2066 

2067 # Remove common suffix "[from 14th c.]" and similar 

2068 gloss = re.sub(r"\s\[[^]]*\]\s*$", "", gloss) 

2069 

2070 # Check to make sure we don't have unhandled list items in gloss 

2071 ofs = max(gloss.find("#"), gloss.find("* ")) 

2072 if ofs > 10 and "(#)" not in gloss: 2072 ↛ 2073line 2072 didn't jump to line 2073 because the condition on line 2072 was never true

2073 wxr.wtp.debug( 

2074 "gloss may contain unhandled list items: {}".format(gloss), 

2075 sortid="page/1412", 

2076 ) 

2077 elif "\n" in gloss: 2077 ↛ 2078line 2077 didn't jump to line 2078 because the condition on line 2077 was never true

2078 wxr.wtp.debug( 

2079 "gloss contains newline: {}".format(gloss), 

2080 sortid="page/1416", 

2081 ) 

2082 

2083 # Kludge, some glosses have a comma after initial qualifiers in 

2084 # parentheses 

2085 if gloss.startswith((",", ":")): 2085 ↛ 2086line 2085 didn't jump to line 2086 because the condition on line 2085 was never true

2086 gloss = gloss[1:] 

2087 gloss = gloss.strip() 

2088 if gloss.endswith(":"): 2088 ↛ 2089line 2088 didn't jump to line 2089 because the condition on line 2088 was never true

2089 gloss = gloss[:-1].strip() 

2090 if gloss.startswith("N. of "): 2090 ↛ 2091line 2090 didn't jump to line 2091 because the condition on line 2090 was never true

2091 gloss = "Name of " + gloss[6:] 

2092 if gloss.startswith("†"): 2092 ↛ 2093line 2092 didn't jump to line 2093 because the condition on line 2092 was never true

2093 data_append(sense_data, "tags", "obsolete") 

2094 gloss = gloss[1:] 

2095 elif gloss.startswith("^†"): 2095 ↛ 2096line 2095 didn't jump to line 2096 because the condition on line 2095 was never true

2096 data_append(sense_data, "tags", "obsolete") 

2097 gloss = gloss[2:] 

2098 

2099 # Copy tags from sense_base if any. This will not copy 

2100 # countable/uncountable if either was specified in the sense, 

2101 # as sometimes both are specified in word head but only one 

2102 # in individual senses. 

2103 countability_tags = [] 

2104 base_tags = sense_base.get("tags", ()) 

2105 sense_tags = sense_data.get("tags", ()) 

2106 for tag in base_tags: 

2107 if tag in ("countable", "uncountable"): 2107 ↛ 2108line 2107 didn't jump to line 2108 because the condition on line 2107 was never true

2108 if tag not in countability_tags: 

2109 countability_tags.append(tag) 

2110 continue 

2111 if tag not in sense_tags: 

2112 data_append(sense_data, "tags", tag) 

2113 if countability_tags: 2113 ↛ 2114line 2113 didn't jump to line 2114 because the condition on line 2113 was never true

2114 if ( 

2115 "countable" not in sense_tags 

2116 and "uncountable" not in sense_tags 

2117 ): 

2118 data_extend(sense_data, "tags", countability_tags) 

2119 

2120 # If outer gloss specifies a form-of ("inflection of", see 

2121 # aquamarine/German), try to parse the inner glosses as 

2122 # tags for an inflected form. 

2123 if "form-of" in sense_base.get("tags", ()): 

2124 parsed = parse_alt_or_inflection_of( 

2125 wxr, gloss, gloss_template_args 

2126 ) 

2127 if parsed is not None: 2127 ↛ 2133line 2127 didn't jump to line 2133 because the condition on line 2127 was always true

2128 infl_tags, infl_dts = parsed 

2129 if not infl_dts and infl_tags: 2129 ↛ 2133line 2129 didn't jump to line 2133 because the condition on line 2129 was always true

2130 # Interpret as a particular form under "inflection of" 

2131 data_extend(sense_data, "tags", infl_tags) 

2132 

2133 if not gloss: 2133 ↛ 2134line 2133 didn't jump to line 2134 because the condition on line 2133 was never true

2134 data_append(sense_data, "tags", "empty-gloss") 

2135 elif gloss != "-" and gloss not in sense_data.get("glosses", []): 2135 ↛ 2136line 2135 didn't jump to line 2136 because the condition on line 2135 was never true

2136 if ( 

2137 gloss_i == 0 

2138 and len(sense_data.get("glosses", tuple())) >= 1 

2139 ): 

2140 # If we added a "high-level gloss" from rawgloss, but this 

2141 # is that same gloss_i, add this instead of the raw_gloss 

2142 # from before if they're different: the rawgloss was not 

2143 # cleaned exactly the same as this later gloss 

2144 sense_data["glosses"][-1] = gloss 

2145 else: 

2146 # Add the gloss for the sense. 

2147 data_append(sense_data, "glosses", gloss) 

2148 

2149 # Kludge: there are cases (e.g., etc./Swedish) where there are 

2150 # two abbreviations in the same sense, both generated by the 

2151 # {{abbreviation of|...}} template. Handle these with some magic. 

2152 position = 0 

2153 split_glosses = [] 

2154 for m in re.finditer(r"Abbreviation of ", gloss): 2154 ↛ 2155line 2154 didn't jump to line 2155 because the loop on line 2154 never started

2155 if m.start() != position: 

2156 split_glosses.append(gloss[position : m.start()]) 

2157 position = m.start() 

2158 split_glosses.append(gloss[position:]) 

2159 for gloss in split_glosses: 

2160 # Check if this gloss describes an alt-of or inflection-of 

2161 if ( 

2162 lang_code != "en" 

2163 and " " not in gloss 

2164 and distw([word], gloss) < 0.3 

2165 ): 

2166 # Don't try to parse gloss if it is one word 

2167 # that is close to the word itself for non-English words 

2168 # (probable translations of a tag/form name) 

2169 continue 

2170 parsed = parse_alt_or_inflection_of( 

2171 wxr, gloss, gloss_template_args 

2172 ) 

2173 if parsed is None: 

2174 continue 

2175 tags, dts = parsed 

2176 if not dts and tags: 2176 ↛ 2179line 2176 didn't jump to line 2179 because the condition on line 2176 was always true

2177 data_extend(sense_data, "tags", tags) 

2178 continue 

2179 for dt in dts: # type:ignore[union-attr] 

2180 ftags = list(tag for tag in tags if tag != "form-of") 

2181 if "alt-of" in tags: 

2182 data_extend(sense_data, "tags", ftags) 

2183 data_append(sense_data, "alt_of", dt) 

2184 elif "compound-of" in tags: 

2185 data_extend(sense_data, "tags", ftags) 

2186 data_append(sense_data, "compound_of", dt) 

2187 elif "synonym-of" in tags: 

2188 data_extend(dt, "tags", ftags) 

2189 data_append(sense_data, "synonyms", dt) 

2190 elif tags and dt.get("word", "").startswith("of "): 

2191 dt["word"] = dt["word"][3:] 

2192 data_append(sense_data, "tags", "form-of") 

2193 data_extend(sense_data, "tags", ftags) 

2194 data_append(sense_data, "form_of", dt) 

2195 elif "form-of" in tags: 

2196 data_extend(sense_data, "tags", tags) 

2197 data_append(sense_data, "form_of", dt) 

2198 

2199 if len(sense_data) == 0: 

2200 if len(sense_base.get("tags", [])) == 0: 2200 ↛ 2202line 2200 didn't jump to line 2202 because the condition on line 2200 was always true

2201 del sense_base["tags"] 

2202 sense_data.update(sense_base) 

2203 if push_sense(): 2203 ↛ 2207line 2203 didn't jump to line 2207 because the condition on line 2203 was always true

2204 # push_sense succeeded in adding a sense to pos_data 

2205 added = True 

2206 # print("PARSE_SENSE DONE:", pos_datas[-1]) 

2207 return added 

2208 

2209 def parse_inflection( 

2210 node: WikiNode, section: str, pos: Optional[str] 

2211 ) -> None: 

2212 """Parses inflection data (declension, conjugation) from the given 

2213 page. This retrieves the actual inflection template 

2214 parameters, which are very useful for applications that need 

2215 to learn the inflection classes and generate inflected 

2216 forms.""" 

2217 assert isinstance(node, WikiNode) 

2218 assert isinstance(section, str) 

2219 assert pos is None or isinstance(pos, str) 

2220 # print("parse_inflection:", node) 

2221 

2222 if pos is None: 2222 ↛ 2223line 2222 didn't jump to line 2223 because the condition on line 2222 was never true

2223 wxr.wtp.debug( 

2224 "inflection table outside part-of-speech", sortid="page/1812" 

2225 ) 

2226 return 

2227 

2228 def inflection_template_fn( 

2229 name: str, ht: TemplateArgs 

2230 ) -> Optional[str]: 

2231 # print("decl_conj_template_fn", name, ht) 

2232 if is_panel_template(wxr, name): 

2233 return "" 

2234 if name in ("is-u-mutation",): 

2235 # These are not to be captured as an exception to the 

2236 # generic code below 

2237 return None 

2238 m = re.search( 

2239 r"-(conj|decl|ndecl|adecl|infl|conjugation|" 

2240 r"declension|inflection|mut|mutation)($|-)", 

2241 name, 

2242 ) 

2243 if m: 

2244 args_ht = clean_template_args(wxr, ht) 

2245 dt = {"name": name, "args": args_ht} 

2246 data_append(pos_data, "inflection_templates", dt) 

2247 

2248 return None 

2249 

2250 # Convert the subtree back to Wikitext, then expand all and parse, 

2251 # capturing templates in the process 

2252 text = wxr.wtp.node_to_wikitext(node.children) 

2253 

2254 # Split text into separate sections for each top-level template 

2255 brace_matches = re.split("({{+|}}+)", text) # ["{{", "template", "}}"] 

2256 template_sections = [] 

2257 template_nesting = 0 # depth of SINGLE BRACES { { nesting } } 

2258 # Because there is the possibility of triple curly braces 

2259 # ("{{{", "}}}") in addition to normal ("{{ }}"), we do not 

2260 # count nesting depth using pairs of two brackets, but 

2261 # instead use singular braces ("{ }"). 

2262 # Because template delimiters should be balanced, regardless 

2263 # of whether {{ or {{{ is used, and because we only care 

2264 # about the outer-most delimiters (the highest level template) 

2265 # we can just count the single braces when those single 

2266 # braces are part of a group. 

2267 

2268 # print(text) 

2269 # print(repr(brace_matches)) 

2270 if len(brace_matches) > 1: 2270 ↛ 2271line 2270 didn't jump to line 2271 because the condition on line 2270 was never true

2271 tsection: list[str] = [] 

2272 after_templates = False # kludge to keep any text 

2273 # before first template 

2274 # with the first template; 

2275 # otherwise, text 

2276 # goes with preceding template 

2277 for m in brace_matches: 

2278 if m.startswith("{{"): 

2279 if template_nesting == 0 and after_templates: 

2280 template_sections.append(tsection) 

2281 tsection = [] 

2282 # start new section 

2283 after_templates = True 

2284 template_nesting += len(m) 

2285 tsection.append(m) 

2286 elif m.startswith("}}"): 

2287 template_nesting -= len(m) 

2288 if template_nesting < 0: 

2289 wxr.wtp.error( 

2290 "Negatively nested braces, " 

2291 "couldn't split inflection templates, " 

2292 "{}/{} section {}".format(word, language, section), 

2293 sortid="page/1871", 

2294 ) 

2295 template_sections = [] # use whole text 

2296 break 

2297 tsection.append(m) 

2298 else: 

2299 tsection.append(m) 

2300 if tsection: # dangling tsection 

2301 template_sections.append(tsection) 

2302 # Why do it this way around? The parser has a preference 

2303 # to associate bits outside of tables with the preceding 

2304 # table (`after`-variable), so a new tsection begins 

2305 # at {{ and everything before it belongs to the previous 

2306 # template. 

2307 
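# Editor's note: a standalone sketch (not part of page.py) of the
# brace-counting split described above: cut wikitext into chunks so that
# each chunk holds one top-level template plus any text preceding it.
import re

def split_top_level_templates(text: str) -> list[str]:
    pieces = re.split(r"({{+|}}+)", text)
    sections: list[list[str]] = []
    current: list[str] = []
    depth = 0               # counted in single braces, as explained above
    seen_template = False
    for p in pieces:
        if p.startswith("{{"):
            if depth == 0 and seen_template:
                sections.append(current)
                current = []
            seen_template = True
            depth += len(p)
        elif p.startswith("}}"):
            depth -= len(p)
        current.append(p)
    if current:
        sections.append(current)
    return ["".join(s) for s in sections]

# split_top_level_templates("{{fi-decl|a}} note {{fi-conj|b}}")
#   -> ["{{fi-decl|a}} note ", "{{fi-conj|b}}"]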

2308 texts = [] 

2309 if not template_sections: 2309 ↛ 2312line 2309 didn't jump to line 2312 because the condition on line 2309 was always true

2310 texts = [text] 

2311 else: 

2312 for tsection in template_sections: 

2313 texts.append("".join(tsection)) 

2314 if template_nesting != 0: 2314 ↛ 2315line 2314 didn't jump to line 2315 because the condition on line 2314 was never true

2315 wxr.wtp.error( 

2316 "Template nesting error: " 

2317 "template_nesting = {} " 

2318 "couldn't split inflection templates, " 

2319 "{}/{} section {}".format( 

2320 template_nesting, word, language, section 

2321 ), 

2322 sortid="page/1896", 

2323 ) 

2324 texts = [text] 

2325 for text in texts: 

2326 tree = wxr.wtp.parse( 

2327 text, expand_all=True, template_fn=inflection_template_fn 

2328 ) 

2329 

2330 # Parse inflection tables from the section. The data is stored 

2331 # under "forms". 

2332 if wxr.config.capture_inflections: 2332 ↛ 2325line 2332 didn't jump to line 2325 because the condition on line 2332 was always true

2333 tablecontext = None 

2334 m = re.search(r"{{([^}{|]+)\|?", text) 

2335 if m: 2335 ↛ 2336line 2335 didn't jump to line 2336 because the condition on line 2335 was never true

2336 template_name = m.group(1) 

2337 tablecontext = TableContext(template_name) 

2338 

2339 parse_inflection_section( 

2340 wxr, 

2341 pos_data, 

2342 word, 

2343 language, 

2344 pos, 

2345 section, 

2346 tree, 

2347 tablecontext=tablecontext, 

2348 ) 

2349 

2350 def get_subpage_section( 

2351 title: str, subtitle: str, seq: Union[list[str], tuple[str, ...]] 

2352 ) -> Optional[Union[WikiNode, str]]: 

2353 """Loads a subpage of the given page, and finds the section 

2354 for the given language, part-of-speech, and section title. This 

2355 is used for finding translations and other sections on subpages.""" 

2356 assert isinstance(language, str) 

2357 assert isinstance(title, str) 

2358 assert isinstance(subtitle, str) 

2359 assert isinstance(seq, (list, tuple)) 

2360 for x in seq: 

2361 assert isinstance(x, str) 

2362 subpage_title = word + "/" + subtitle 

2363 subpage_content = wxr.wtp.get_page_body(subpage_title, 0) 

2364 if subpage_content is None: 

2365 wxr.wtp.error( 

2366 "/translations not found despite " 

2367 "{{see translation subpage|...}}", 

2368 sortid="page/1934", 

2369 ) 

2370 return None 

2371 

2372 def recurse( 

2373 node: Union[str, WikiNode], seq: Union[list[str], tuple[str, ...]] 

2374 ) -> Optional[Union[str, WikiNode]]: 

2375 # print(f"seq: {seq}") 

2376 if not seq: 

2377 return node 

2378 if not isinstance(node, WikiNode): 

2379 return None 

2380 # print(f"node.kind: {node.kind}") 

2381 if node.kind in LEVEL_KINDS: 

2382 t = clean_node(wxr, None, node.largs[0]) 

2383 # print(f"t: {t} == seq[0]: {seq[0]}?") 

2384 if t.lower() == seq[0].lower(): 

2385 seq = seq[1:] 

2386 if not seq: 

2387 return node 

2388 for n in node.children: 

2389 ret = recurse(n, seq) 

2390 if ret is not None: 

2391 return ret 

2392 return None 

2393 

2394 tree = wxr.wtp.parse( 

2395 subpage_content, 

2396 pre_expand=True, 

2397 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

2398 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

2399 ) 

2400 assert tree.kind == NodeKind.ROOT 

2401 ret = recurse(tree, seq) 

2402 if ret is None: 

2403 wxr.wtp.debug( 

2404 "Failed to find subpage section {}/{} seq {}".format( 

2405 title, subtitle, seq 

2406 ), 

2407 sortid="page/1963", 

2408 ) 

2409 return ret 

2410 
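# Editor's note: an illustrative sketch (not part of page.py) of the
# title-sequence walk done by recurse() inside get_subpage_section(),
# using nested dicts ({"title": ..., "children": [...]}) in place of the
# WikiNode tree.
from typing import Any, Optional

def find_section(node: dict[str, Any], seq: list[str]) -> Optional[dict[str, Any]]:
    if not seq:
        return node
    if node.get("title", "").lower() == seq[0].lower():
        seq = seq[1:]
        if not seq:
            return node
    for child in node.get("children", []):
        found = find_section(child, seq)
        if found is not None:
            return found
    return None

# find_section(tree, ["English", "Noun", "Translations"]) returns the
# Translations node nested under English -> Noun, if present.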

2411 def parse_linkage( 

2412 data: WordData, field: str, linkagenode: WikiNode 

2413 ) -> None: 

2414 assert isinstance(data, dict) 

2415 assert isinstance(field, str) 

2416 assert isinstance(linkagenode, WikiNode) 

2417 # if field == "synonyms": 

2418 # print("field", field) 

2419 # print("data", data) 

2420 # print("children:") 

2421 # print(linkagenode.children) 

2422 if not wxr.config.capture_linkages: 2422 ↛ 2423line 2422 didn't jump to line 2423 because the condition on line 2422 was never true

2423 return 

2424 have_panel_template = False 

2425 toplevel_text = [] 

2426 next_navframe_sense = None # Used for "(sense):" before NavFrame 

2427 

2428 def parse_linkage_item( 

2429 contents: list[Union[str, WikiNode]], 

2430 field: str, 

2431 sense: Optional[str] = None, 

2432 ): 

2433 assert isinstance(contents, (list, tuple)) 

2434 assert isinstance(field, str) 

2435 assert sense is None or isinstance(sense, str) 

2436 

2437 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

2438 # .format(field, sense, contents)) 

2439 

2440 parts: list[str] = [] 

2441 ruby: list[tuple[str, str]] = [] 

2442 urls: list[str] = [] 

2443 # data about link text; this is used to skip splitting on 

2444 # linkage text items that contain stuff like commas; for 

2445 # example "Hunde, die bellen, beißen nicht" in article 

2446 # beißen is split into "Hunde", "die bellen" etc. 

2447 # We take that link text and use it, eventually, 

2448 # in split_at_comma_semi to skip splitting on those 

2449 # commas. 

2450 links_that_should_not_be_split: list[str] = [] 

2451 

2452 def item_recurse( 

2453 contents: list[Union[str, WikiNode]], italic=False 

2454 ) -> None: 

2455 assert isinstance(contents, (list, tuple)) 

2456 nonlocal sense 

2457 nonlocal ruby 

2458 nonlocal parts 

2459 # print("ITEM_RECURSE:", contents) 

2460 for node in contents: 

2461 if isinstance(node, str): 2461 ↛ 2464line 2461 didn't jump to line 2464 because the condition on line 2461 was always true

2462 parts.append(node) 

2463 continue 

2464 kind = node.kind 

2465 # print("ITEM_RECURSE KIND:", kind, 

2466 # node.sarg if node.sarg else node.largs) 

2467 if kind == NodeKind.LIST: 

2468 if parts: 

2469 sense1: Optional[str] 

2470 sense1 = clean_node(wxr, None, parts) 

2471 if sense1.endswith(":"): 

2472 sense1 = sense1[:-1].strip() 

2473 if sense1.startswith("(") and sense1.endswith(")"): 

2474 sense1 = sense1[1:-1].strip() 

2475 if sense1.lower() == TRANSLATIONS_TITLE: 

2476 sense1 = None 

2477 # print("linkage item_recurse LIST sense1:", sense1) 

2478 parse_linkage_recurse( 

2479 node.children, field, sense=sense1 or sense 

2480 ) 

2481 parts = [] 

2482 else: 

2483 parse_linkage_recurse(node.children, field, sense) 

2484 elif kind in ( 

2485 NodeKind.TABLE, 

2486 NodeKind.TABLE_ROW, 

2487 NodeKind.TABLE_CELL, 

2488 ): 

2489 parse_linkage_recurse(node.children, field, sense) 

2490 elif kind in ( 

2491 NodeKind.TABLE_HEADER_CELL, 

2492 NodeKind.TABLE_CAPTION, 

2493 ): 

2494 continue 

2495 elif kind == NodeKind.HTML: 

2496 classes = (node.attrs.get("class") or "").split() 

2497 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2498 continue 

2499 elif node.sarg == "ruby": 

2500 rb = parse_ruby(wxr, node) 

2501 if rb: 

2502 ruby.append(rb) 

2503 parts.append(rb[0]) 

2504 continue 

2505 elif node.sarg == "math": 

2506 parts.append(clean_node(wxr, None, node)) 

2507 continue 

2508 elif "interProject" in classes: 

2509 continue # These do not seem to be displayed 

2510 if "NavFrame" in classes: 

2511 parse_linkage_recurse(node.children, field, sense) 

2512 else: 

2513 item_recurse(node.children, italic=italic) 

2514 elif kind == NodeKind.ITALIC: 

2515 item_recurse(node.children, italic=True) 

2516 elif kind == NodeKind.LINK: 

2517 ignore = False 

2518 if isinstance(node.largs[0][0], str): 

2519 v1 = node.largs[0][0].strip().lower() 

2520 if v1.startswith( 

2521 ns_title_prefix_tuple(wxr, "Category", True) 

2522 + ns_title_prefix_tuple(wxr, "File", True) 

2523 ): 

2524 ignore = True 

2525 if not ignore: 

2526 v = node.largs[-1] 

2527 if ( 

2528 len(node.largs) == 1 

2529 and len(v) > 0 

2530 and isinstance(v[0], str) 

2531 and v[0][0] == ":" 

2532 ): 

2533 v = [v[0][1:]] + list(v[1:]) # type:ignore 

2534 if isinstance(v[0], str) and not v[0].isalnum(): 

2535 links_that_should_not_be_split.append( 

2536 "".join(v[0]) 

2537 ) # type: ignore 

2538 item_recurse(v, italic=italic) 

2539 elif kind == NodeKind.URL: 

2540 if len(node.largs) < 2 and node.largs: 

2541 # Naked url captured 

2542 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

2543 continue 

2544 if len(node.largs) == 2: 

2545 # Url from link with text 

2546 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

2547 # print(f"{node.largs=!r}") 

2548 # print("linkage recurse URL {}".format(node)) 

2549 item_recurse(node.largs[-1], italic=italic) 

2550 elif kind in (NodeKind.PREFORMATTED, NodeKind.BOLD): 

2551 item_recurse(node.children, italic=italic) 

2552 else: 

2553 wxr.wtp.debug( 

2554 "linkage item_recurse unhandled {}: {}".format( 

2555 node.kind, node 

2556 ), 

2557 sortid="page/2073", 

2558 ) 

2559 

2560 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

2561 # .format(contents)) 

2562 

2563 item_recurse(contents) 

2564 item = clean_node(wxr, None, parts) 

2565 # print("LINKAGE ITEM CONTENTS:", parts) 

2566 # print("CLEANED ITEM: {!r}".format(item)) 

2567 # print(f"URLS {urls=!r}") 

2568 

2569 return parse_linkage_item_text( 

2570 wxr, 

2571 word, 

2572 data, 

2573 field, 

2574 item, 

2575 sense, 

2576 ruby, 

2577 pos_datas, 

2578 is_reconstruction, 

2579 urls or None, 

2580 links_that_should_not_be_split or None, 

2581 ) 

2582 

2583 def parse_linkage_recurse( 

2584 contents: list[Union[WikiNode, str]], 

2585 field: str, 

2586 sense: Optional[str], 

2587 ) -> None: 

2588 assert isinstance(contents, (list, tuple)) 

2589 assert sense is None or isinstance(sense, str) 

2590 nonlocal next_navframe_sense 

2591 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

2592 for node in contents: 

2593 if isinstance(node, str): 

2594 # Ignore top-level text, generally comments before the 

2595 # linkages list. However, if no linkages are found, then 

2596 # use this for linkages (not all words use bullet points 

2597 # for linkages). 

2598 toplevel_text.append(node) 

2599 continue 

2600 assert isinstance(node, WikiNode) 

2601 kind = node.kind 

2602 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

2603 if kind == NodeKind.LIST: 

2604 parse_linkage_recurse(node.children, field, sense) 

2605 elif kind == NodeKind.LIST_ITEM: 2605 ↛ 2612line 2605 didn't jump to line 2612 because the condition on line 2605 was always true

2606 v = parse_linkage_item(node.children, field, sense) 

2607 if v: 2607 ↛ 2611line 2607 didn't jump to line 2611 because the condition on line 2607 was never true

2608 # parse_linkage_item() can return a value that should 

2609 # be used as the sense for the follow-on linkages, 

2610 # which are typically provided in a table (see 滿) 

2611 next_navframe_sense = v 

2612 elif kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

2613 parse_linkage_recurse(node.children, field, sense) 

2614 elif kind == NodeKind.TABLE_CELL: 

2615 parse_linkage_item(node.children, field, sense) 

2616 elif kind in ( 

2617 NodeKind.TABLE_CAPTION, 

2618 NodeKind.TABLE_HEADER_CELL, 

2619 NodeKind.PREFORMATTED, 

2620 NodeKind.BOLD, 

2621 ): 

2622 continue 

2623 elif kind == NodeKind.HTML: 

2624 # Recurse to process inside the HTML for most tags 

2625 if node.sarg in ("gallery", "ref", "cite", "caption"): 

2626 continue 

2627 classes = (node.attrs.get("class") or "").split() 

2628 if node.sarg == "li": 

2629 # duplicates code from if kind == NodeKind.LIST_ITEM ⇑ 

2630 v = parse_linkage_item(node.children, field, sense) 

2631 if v: 

2632 next_navframe_sense = v 

2633 elif "qualifier-content" in classes: 

2634 sense1 = clean_node(wxr, None, node.children) 

2635 if sense1.endswith(":"): 

2636 sense1 = sense1[:-1].strip() 

2637 if sense and sense1: 

2638 wxr.wtp.debug( 

2639 "linkage qualifier-content on multiple " 

2640 "levels: {!r} and {!r}".format(sense, sense1), 

2641 sortid="page/2170", 

2642 ) 

2643 parse_linkage_recurse(node.children, field, sense1) 

2644 elif "NavFrame" in classes: 

2645 # NavFrame uses previously assigned next_navframe_sense 

2646 # (from a "(sense):" item) and clears it afterwards 

2647 parse_linkage_recurse( 

2648 node.children, field, sense or next_navframe_sense 

2649 ) 

2650 next_navframe_sense = None 

2651 else: 

2652 parse_linkage_recurse(node.children, field, sense) 

2653 elif kind in LEVEL_KINDS: 

2654 # Just recurse to any possible subsections 

2655 parse_linkage_recurse(node.children, field, sense) 

2656 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

2657 # Skip these on top level; at least sometimes bold is 

2658 # used for indicating a subtitle 

2659 continue 

2660 elif kind == NodeKind.LINK: 

2661 # Recurse into the last argument 

2662 # Apparently ":/" is used as a link to "/", so strip 

2663 # initial value 

2664 parse_linkage_recurse(node.largs[-1], field, sense) 

2665 else: 

2666 wxr.wtp.debug( 

2667 "parse_linkage_recurse unhandled {}: {}".format( 

2668 kind, node 

2669 ), 

2670 sortid="page/2196", 

2671 ) 

2672 

2673 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

2674 nonlocal have_panel_template 

2675 if is_panel_template(wxr, name): 

2676 have_panel_template = True 

2677 return "" 

2678 return None 

2679 

2680 def parse_zh_synonyms( 

2681 parsed: list[Union[WikiNode, str]], 

2682 data: list[LinkageData], 

2683 hdrs: list[str], 

2684 root_word: str, 

2685 ) -> None: 

2686 """Parses Chinese dialectal synonyms tables""" 

2687 for item in parsed: 

2688 if isinstance(item, WikiNode): 

2689 if item.kind == NodeKind.TABLE_ROW: 

2690 cleaned = clean_node(wxr, None, item.children) 

2691 # print("cleaned:", repr(cleaned)) 

2692 if any( 

2693 [ 

2694 "Variety" in cleaned, 

2695 "Location" in cleaned, 

2696 "Words" in cleaned, 

2697 ] 

2698 ): 

2699 pass 

2700 else: 

2701 split = cleaned.split("\n") 

2702 new_hdrs = split[:-1] 

2703 if len(new_hdrs) == 2: 

2704 hdrs = [new_hdrs[0]] 

2705 new_hdrs.pop(0) 

2706 combined_hdrs = [x.strip() for x in hdrs + new_hdrs] 

2707 tags = [] 

2708 words = split[-1].split(",") 

2709 for hdr in combined_hdrs: 

2710 hdr = hdr.replace("(", ",") 

2711 hdr = hdr.replace(")", "") 

2712 hdr = hdr.replace("N.", "Northern,") 

2713 hdr = hdr.replace("S.", "Southern,") 

2714 new = hdr.split(",") 

2715 for tag in sorted(new): 

2716 tag = tag.strip() 

2717 tag = tag.replace(" ", "-") 

2718 if tag in valid_tags: 

2719 tags.append(tag) 

2720 else: 

2721 if tag in zh_tag_lookup: 

2722 tags.extend(zh_tag_lookup[tag]) 

2723 else: 

2724 print( 

2725 f"MISSING ZH SYNONYM TAG for " 

2726 f"root {root_word}, word " 

2727 f"{words}: {tag}" 

2728 ) 

2729 sys.stdout.flush() 

2730 

2731 for word in words: 

2732 data.append( 

2733 {"word": word.strip(), "tags": tags} 

2734 ) 

2735 elif item.kind == NodeKind.HTML: 

2736 cleaned = clean_node(wxr, None, item.children) 

2737 if "Synonyms of" in cleaned: 

2738 cleaned = cleaned.replace("Synonyms of ", "") 

2739 root_word = cleaned 

2740 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2741 else: 

2742 parse_zh_synonyms(item.children, data, hdrs, root_word) 

2743 

2744 def parse_zh_synonyms_list( 

2745 parsed: list[Union[WikiNode, str]], 

2746 data: list[LinkageData], 

2747 hdrs: list[str], 

2748 root_word: str, 

2749 ) -> None: 

2750 """Parses Chinese dialectal synonyms tables (list format)""" 

2751 for item in parsed: 

2752 if isinstance(item, WikiNode): 

2753 if item.kind == NodeKind.LIST_ITEM: 

2754 cleaned = clean_node(wxr, None, item.children) 

2755 # print("cleaned:", repr(cleaned)) 

2756 if any( 

2757 [ 

2758 "Variety" in cleaned, 

2759 "Location" in cleaned, 

2760 "Words" in cleaned, 

2761 ] 

2762 ): 

2763 pass 

2764 else: 

2765 cleaned = cleaned.replace("(", ",") 

2766 cleaned = cleaned.replace(")", "") 

2767 split = cleaned.split(",") 

2768 # skip empty words / titles 

2769 if split[0] == "": 

2770 continue 

2771 words = split[0].split("/") 

2772 new_hdrs = [x.strip() for x in split[1:]] 

2773 tags = [] 

2774 roman = None 

2775 for tag in sorted(new_hdrs): 

2776 if tag in valid_tags: 

2777 tags.append(tag) 

2778 elif tag in zh_tag_lookup: 

2779 tags.extend(zh_tag_lookup[tag]) 

2780 elif ( 

2781 classify_desc(tag) == "romanization" 

2782 and roman is None 

2783 ): 

2784 roman = tag 

2785 else: 

2786 print( 

2787 f"MISSING ZH SYNONYM TAG " 

2788 f"(possibly pinyin) - root " 

2789 f"{root_word}, word {words}: {tag}" 

2790 ) 

2791 sys.stdout.flush() 

2792 

2793 for word in words: 

2794 dt: LinkageData = {"word": word.strip()} 

2795 if tags: 

2796 dt["tags"] = tags 

2797 if roman is not None: 

2798 dt["roman"] = roman 

2799 data.append(dt) 

2800 elif item.kind == NodeKind.HTML: 

2801 cleaned = clean_node(wxr, None, item.children) 

2802 if cleaned.find("Synonyms of") >= 0: 

2803 cleaned = cleaned.replace("Synonyms of ", "") 

2804 root_word = cleaned 

2805 parse_zh_synonyms_list( 

2806 item.children, data, hdrs, root_word 

2807 ) 

2808 else: 

2809 parse_zh_synonyms_list( 

2810 item.children, data, hdrs, root_word 

2811 ) 

2812 

2813 def contains_kind( 

2814 children: list[Union[WikiNode, str]], nodekind: NodeKind 

2815 ) -> bool: 

2816 assert isinstance(children, list) 

2817 for item in children: 

2818 if not isinstance(item, WikiNode): 

2819 continue 

2820 if item.kind == nodekind: 

2821 return True 

2822 elif contains_kind(item.children, nodekind): 

2823 return True 

2824 return False 

2825 

2826 # Main body of parse_linkage() 

2827 text = wxr.wtp.node_to_wikitext(linkagenode.children) 

2828 parsed = wxr.wtp.parse( 

2829 text, expand_all=True, template_fn=linkage_template_fn1 

2830 ) 

2831 if field == "synonyms" and lang_code == "zh": 2831 ↛ 2832line 2831 didn't jump to line 2832 because the condition on line 2831 was never true

2832 synonyms: list[LinkageData] = [] 

2833 if contains_kind(parsed.children, NodeKind.LIST): 

2834 parse_zh_synonyms_list(parsed.children, synonyms, [], "") 

2835 else: 

2836 parse_zh_synonyms(parsed.children, synonyms, [], "") 

2837 # print(json.dumps(synonyms, indent=4, ensure_ascii=False)) 

2838 data_extend(data, "synonyms", synonyms) 

2839 parse_linkage_recurse(parsed.children, field, None) 

2840 if not data.get(field) and not have_panel_template: 2840 ↛ 2841line 2840 didn't jump to line 2841 because the condition on line 2840 was never true

2841 text = "".join(toplevel_text).strip() 

2842 if "\n" not in text and "," in text and text.count(",") > 3: 

2843 if not text.startswith("See "): 

2844 parse_linkage_item([text], field, None) 

2845 

2846 def parse_translations(data: WordData, xlatnode: WikiNode) -> None: 

2847 """Parses translations for a word. This may also pull in translations 

2848 from separate translation subpages.""" 

2849 assert isinstance(data, dict) 

2850 assert isinstance(xlatnode, WikiNode) 

2851 # print("===== PARSE_TRANSLATIONS {} {} {}" 

2852 # .format(wxr.wtp.title, wxr.wtp.section, wxr.wtp.subsection)) 

2853 # print("parse_translations xlatnode={}".format(xlatnode)) 

2854 if not wxr.config.capture_translations: 2854 ↛ 2855line 2854 didn't jump to line 2855 because the condition on line 2854 was never true

2855 return 

2856 sense_parts: list[Union[WikiNode, str]] = [] 

2857 sense: Optional[str] = None 

2858 

2859 def parse_translation_item( 

2860 contents: list[Union[WikiNode, str]], lang: Optional[str] = None 

2861 ) -> None: 

2862 nonlocal sense 

2863 assert isinstance(contents, list) 

2864 assert lang is None or isinstance(lang, str) 

2865 # print("PARSE_TRANSLATION_ITEM:", contents) 

2866 

2867 langcode: Optional[str] = None 

2868 if sense is None: 

2869 sense = clean_node(wxr, data, sense_parts).strip() 

2870 # print("sense <- clean_node: ", sense) 

2871 idx = sense.find("See also translations at") 

2872 if idx > 0: 2872 ↛ 2873line 2872 didn't jump to line 2873 because the condition on line 2872 was never true

2873 wxr.wtp.debug( 

2874 "Skipping translation see also: {}".format(sense), 

2875 sortid="page/2361", 

2876 ) 

2877 sense = sense[:idx].strip() 

2878 if sense.endswith(":"): 2878 ↛ 2879line 2878 didn't jump to line 2879 because the condition on line 2878 was never true

2879 sense = sense[:-1].strip() 

2880 if sense.endswith("—"): 2880 ↛ 2881line 2880 didn't jump to line 2881 because the condition on line 2880 was never true

2881 sense = sense[:-1].strip() 

2882 translations_from_template: list[str] = [] 

2883 

2884 def translation_item_template_fn( 

2885 name: str, ht: TemplateArgs 

2886 ) -> Optional[str]: 

2887 nonlocal langcode 

2888 # print("TRANSLATION_ITEM_TEMPLATE_FN:", name, ht) 

2889 if is_panel_template(wxr, name): 

2890 return "" 

2891 if name in ("t+check", "t-check", "t-needed"): 

2892 # We ignore these templates. They seem to have outright 

2893 # garbage in some entries, and very varying formatting in 

2894 # others. These should be transitory and unreliable 

2895 # anyway. 

2896 return "__IGNORE__" 

2897 if name in ("t", "t+", "t-simple", "tt", "tt+"): 

2898 code = ht.get(1) 

2899 if code: 

2900 if langcode and code != langcode: 

2901 wxr.wtp.debug( 

2902 "inconsistent language codes {} vs " 

2903 "{} in translation item: {!r} {}".format( 

2904 langcode, code, name, ht 

2905 ), 

2906 sortid="page/2386", 

2907 ) 

2908 langcode = code 

2909 tr = ht.get(2) 

2910 if tr: 

2911 tr = clean_node(wxr, None, [tr]) 

2912 translations_from_template.append(tr) 

2913 return None 

2914 if name == "t-egy": 

2915 langcode = "egy" 

2916 return None 

2917 if name == "ttbc": 

2918 code = ht.get(1) 

2919 if code: 

2920 langcode = code 

2921 return None 

2922 if name == "trans-see": 

2923 wxr.wtp.error( 

2924 "UNIMPLEMENTED trans-see template", sortid="page/2405" 

2925 ) 

2926 return "" 

2927 if name.endswith("-top"): 

2928 return "" 

2929 if name.endswith("-bottom"): 

2930 return "" 

2931 if name.endswith("-mid"): 

2932 return "" 

2933 # wxr.wtp.debug("UNHANDLED TRANSLATION ITEM TEMPLATE: {!r}" 

2934 # .format(name), 

2935 # sortid="page/2414") 

2936 return None 

2937 
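# Editor's note: a minimal sketch (not part of page.py) of the argument
# convention handled by translation_item_template_fn() above for
# {{t}}/{{t+}}-style templates: positional argument 1 is the language code
# and argument 2 the translated word, e.g. {{t+|fi|sana}}.
from typing import Optional

def read_t_template(name: str, ht: dict) -> Optional[tuple[str, str]]:
    if name in ("t", "t+", "t-simple", "tt", "tt+"):
        code = ht.get(1)
        word = ht.get(2)
        if code and word:
            return code, word
    return None

# read_t_template("t+", {1: "fi", 2: "sana"})  ->  ("fi", "sana")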

2938 sublists = list( 

2939 x 

2940 for x in contents 

2941 if isinstance(x, WikiNode) and x.kind == NodeKind.LIST 

2942 ) 

2943 contents = list( 

2944 x 

2945 for x in contents 

2946 if not isinstance(x, WikiNode) or x.kind != NodeKind.LIST 

2947 ) 

2948 

2949 item = clean_node( 

2950 wxr, data, contents, template_fn=translation_item_template_fn 

2951 ) 

2952 # print(" TRANSLATION ITEM: {!r} [{}]".format(item, sense)) 

2953 

2954 # Parse the translation item. 

2955 if item: 2955 ↛ exitline 2955 didn't return from function 'parse_translation_item' because the condition on line 2955 was always true

2956 lang = parse_translation_item_text( 

2957 wxr, 

2958 word, 

2959 data, 

2960 item, 

2961 sense, 

2962 lang, 

2963 langcode, 

2964 translations_from_template, 

2965 is_reconstruction, 

2966 ) 

2967 

2968 # Handle sublists. They are frequently used for different 

2969 # scripts for the language and different variants of the 

2970 # language. We will include the lower-level header as a 

2971 # tag in those cases. 

2972 for listnode in sublists: 2972 ↛ 2973line 2972 didn't jump to line 2973 because the loop on line 2972 never started

2973 assert listnode.kind == NodeKind.LIST 

2974 for node in listnode.children: 

2975 if not isinstance(node, WikiNode): 

2976 continue 

2977 if node.kind == NodeKind.LIST_ITEM: 

2978 parse_translation_item(node.children, lang=lang) 

2979 

2980 def parse_translation_template(node: WikiNode) -> None: 

2981 assert isinstance(node, WikiNode) 

2982 

2983 def template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

2984 nonlocal sense_parts 

2985 nonlocal sense 

2986 if is_panel_template(wxr, name): 

2987 return "" 

2988 if name == "see also": 

2989 # XXX capture 

2990 # XXX for example, "/" has top-level list containing 

2991 # see also items. So also should parse those. 

2992 return "" 

2993 if name == "trans-see": 

2994 # XXX capture 

2995 return "" 

2996 if name == "see translation subpage": 

2997 sense_parts = [] 

2998 sense = None 

2999 sub = ht.get(1, "") 

3000 if sub: 

3001 m = re.match( 

3002 r"\s*(([^:\d]*)\s*\d*)\s*:\s*([^:]*)\s*", sub 

3003 ) 

3004 else: 

3005 m = None 

3006 etym = "" 

3007 etym_numbered = "" 

3008 pos = "" 

3009 if m: 

3010 etym_numbered = m.group(1) 

3011 etym = m.group(2) 

3012 pos = m.group(3) 

3013 if not sub: 

3014 wxr.wtp.debug( 

3015 "no part-of-speech in " 

3016 "{{see translation subpage|...}}, " 

3017 "defaulting to just wxr.wtp.section " 

3018 "(= language)", 

3019 sortid="page/2468", 

3020 ) 

3021 # seq sent to get_subpage_section without sub and pos 

3022 seq = [ 

3023 language, 

3024 TRANSLATIONS_TITLE, 

3025 ] 

3026 elif ( 

3027 m 

3028 and etym.lower().strip() in ETYMOLOGY_TITLES 

3029 and pos.lower() in POS_TITLES 

3030 ): 

3031 seq = [ 

3032 language, 

3033 etym_numbered, 

3034 pos, 

3035 TRANSLATIONS_TITLE, 

3036 ] 

3037 elif sub.lower() in POS_TITLES: 

3038 # seq with sub but not pos 

3039 seq = [ 

3040 language, 

3041 sub, 

3042 TRANSLATIONS_TITLE, 

3043 ] 

3044 else: 

3045 # seq with sub and pos 

3046 pos = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3047 if pos.lower() not in POS_TITLES: 

3048 wxr.wtp.debug( 

3049 "unhandled see translation subpage: " 

3050 "language={} sub={} " 

3051 "wxr.wtp.subsection={}".format( 

3052 language, sub, wxr.wtp.subsection 

3053 ), 

3054 sortid="page/2478", 

3055 ) 

3056 seq = [language, sub, pos, TRANSLATIONS_TITLE] 

3057 subnode = get_subpage_section( 

3058 wxr.wtp.title or "MISSING_TITLE", 

3059 TRANSLATIONS_TITLE, 

3060 seq, 

3061 ) 

3062 if subnode is not None and isinstance(subnode, WikiNode): 

3063 parse_translations(data, subnode) 

3064 else: 

3065 # Failed to find the normal subpage section 

3066 seq = [TRANSLATIONS_TITLE] 

3067 subnode = get_subpage_section( 

3068 wxr.wtp.title or "MISSING_TITLE", 

3069 TRANSLATIONS_TITLE, 

3070 seq, 

3071 ) 

3072 if subnode is not None and isinstance( 

3073 subnode, WikiNode 

3074 ): 

3075 parse_translations(data, subnode) 

3076 return "" 

3077 if name in ( 

3078 "c", 

3079 "C", 

3080 "categorize", 

3081 "cat", 

3082 "catlangname", 

3083 "topics", 

3084 "top", 

3085 "qualifier", 

3086 "cln", 

3087 ): 

3088 # These are expanded in the default way 

3089 return None 

3090 if name in ("trans-top",): 

3091 # XXX capture id from trans-top? Capture sense here 

3092 # instead of trying to parse it from expanded content? 

3093 if ht.get(1): 

3094 sense_parts = [] 

3095 sense = ht.get(1) 

3096 else: 

3097 sense_parts = [] 

3098 sense = None 

3099 return None 

3100 if name in ( 

3101 "trans-bottom", 

3102 "trans-mid", 

3103 "checktrans-mid", 

3104 "checktrans-bottom", 

3105 ): 

3106 return None 

3107 if name == "checktrans-top": 

3108 sense_parts = [] 

3109 sense = None 

3110 return "" 

3111 if name == "trans-top-also": 

3112 # XXX capture? 

3113 sense_parts = [] 

3114 sense = None 

3115 return "" 

3116 wxr.wtp.error( 

3117 "UNIMPLEMENTED parse_translation_template: {} {}".format( 

3118 name, ht 

3119 ), 

3120 sortid="page/2517", 

3121 ) 

3122 return "" 

3123 

3124 wxr.wtp.expand( 

3125 wxr.wtp.node_to_wikitext(node), template_fn=template_fn 

3126 ) 

3127 

3128 def parse_translation_recurse(xlatnode: WikiNode) -> None: 

3129 nonlocal sense 

3130 nonlocal sense_parts 

3131 for node in xlatnode.children: 

3132 # print(node) 

3133 if isinstance(node, str): 

3134 if sense: 3134 ↛ 3135line 3134 didn't jump to line 3135 because the condition on line 3134 was never true

3135 if not node.isspace(): 

3136 wxr.wtp.debug( 

3137 "skipping string in the middle of " 

3138 "translations: {}".format(node), 

3139 sortid="page/2530", 

3140 ) 

3141 continue 

3142 # Add a part to the sense 

3143 sense_parts.append(node) 

3144 sense = None 

3145 continue 

3146 assert isinstance(node, WikiNode) 

3147 kind = node.kind 

3148 if kind == NodeKind.LIST: 

3149 for item in node.children: 

3150 if not isinstance(item, WikiNode): 3150 ↛ 3151line 3150 didn't jump to line 3151 because the condition on line 3150 was never true

3151 continue 

3152 if item.kind != NodeKind.LIST_ITEM: 3152 ↛ 3153line 3152 didn't jump to line 3153 because the condition on line 3152 was never true

3153 continue 

3154 if item.sarg == ":": 3154 ↛ 3155line 3154 didn't jump to line 3155 because the condition on line 3154 was never true

3155 continue 

3156 parse_translation_item(item.children) 

3157 elif kind == NodeKind.LIST_ITEM and node.sarg == ":": 3157 ↛ 3161line 3157 didn't jump to line 3161 because the condition on line 3157 was never true

3158 # Silently skip list items that are just indented; these 

3159 # are used for text between translations, such as indicating 

3160 # translations that need to be checked. 

3161 pass 

3162 elif kind == NodeKind.TEMPLATE: 3162 ↛ 3163line 3162 didn't jump to line 3163 because the condition on line 3162 was never true

3163 parse_translation_template(node) 

3164 elif kind in ( 3164 ↛ 3169line 3164 didn't jump to line 3169 because the condition on line 3164 was never true

3165 NodeKind.TABLE, 

3166 NodeKind.TABLE_ROW, 

3167 NodeKind.TABLE_CELL, 

3168 ): 

3169 parse_translation_recurse(node) 

3170 elif kind == NodeKind.HTML: 3170 ↛ 3171line 3170 didn't jump to line 3171 because the condition on line 3170 was never true

3171 if node.attrs.get("class") == "NavFrame": 

3172 # Reset ``sense_parts`` (and force recomputing 

3173 # by clearing ``sense``) as each NavFrame specifies 

3174 # its own sense. This helps eliminate garbage coming 

3175 # from text at the beginning of the translations 

3176 # section. 

3177 sense_parts = [] 

3178 sense = None 

3179 # for item in node.children: 

3180 # if not isinstance(item, WikiNode): 

3181 # continue 

3182 # parse_translation_recurse(item) 

3183 parse_translation_recurse(node) 

3184 elif kind in LEVEL_KINDS: 3184 ↛ 3186line 3184 didn't jump to line 3186 because the condition on line 3184 was never true

3185 # Sub-levels will be recursed elsewhere 

3186 pass 

3187 elif kind in (NodeKind.ITALIC, NodeKind.BOLD): 3187 ↛ 3188line 3187 didn't jump to line 3188 because the condition on line 3187 was never true

3188 parse_translation_recurse(node) 

3189 elif kind == NodeKind.PREFORMATTED: 3189 ↛ 3191line 3189 didn't jump to line 3191 because the condition on line 3189 was always true

3190 print("parse_translation_recurse: PREFORMATTED:", node) 

3191 elif kind == NodeKind.LINK: 

3192 arg0 = node.largs[0] 

3193 # Kludge: I've seen occasional normal links to translation 

3194 # subpages from main pages (e.g., language/English/Noun 

3195 # in July 2021) instead of the normal 

3196 # {{see translation subpage|...}} template. This should 

3197 # handle them. Note: must be careful not to read other 

3198 # links, particularly things like in "human being": 

3199 # "a human being -- see [[man/translations]]" (group title) 

3200 if ( 

3201 isinstance(arg0, (list, tuple)) 

3202 and arg0 

3203 and isinstance(arg0[0], str) 

3204 and arg0[0].endswith("/" + TRANSLATIONS_TITLE) 

3205 and arg0[0][: -(1 + len(TRANSLATIONS_TITLE))] 

3206 == wxr.wtp.title 

3207 ): 

3208 wxr.wtp.debug( 

3209 "translations subpage link found on main " 

3210 "page instead " 

3211 "of normal {{see translation subpage|...}}", 

3212 sortid="page/2595", 

3213 ) 

3214 sub = wxr.wtp.subsection or "MISSING_SUBSECTION" 

3215 if sub.lower() in POS_TITLES: 

3216 seq = [ 

3217 language, 

3218 sub, 

3219 TRANSLATIONS_TITLE, 

3220 ] 

3221 subnode = get_subpage_section( 

3222 wxr.wtp.title, 

3223 TRANSLATIONS_TITLE, 

3224 seq, 

3225 ) 

3226 if subnode is not None and isinstance( 

3227 subnode, WikiNode 

3228 ): 

3229 parse_translations(data, subnode) 

3230 else: 

3231 wxr.wtp.error( 

3232 "/translations link outside part-of-speech" 

3233 ) 

3234 

3235 if ( 

3236 len(arg0) >= 1 

3237 and isinstance(arg0[0], str) 

3238 and not arg0[0].lower().startswith("category:") 

3239 ): 

3240 for x in node.largs[-1]: 

3241 if isinstance(x, str): 

3242 sense_parts.append(x) 

3243 else: 

3244 parse_translation_recurse(x) 

3245 elif not sense: 

3246 sense_parts.append(node) 

3247 else: 

3248 wxr.wtp.debug( 

3249 "skipping text between translation items/senses: " 

3250 "{}".format(node), 

3251 sortid="page/2621", 

3252 ) 

3253 

3254 # Main code of parse_translations(). We want ``sense`` to be assigned 

3255 # regardless of recursion levels, and thus the code is structured 

3256 # to define it at this level and recurse in parse_translation_recurse(). 

3257 parse_translation_recurse(xlatnode) 

3258 

3259 def parse_etymology(data: WordData, node: WikiNode) -> None: 

3260 """Parses an etymology section.""" 

3261 assert isinstance(data, dict) 

3262 assert isinstance(node, WikiNode) 

3263 

3264 templates: list[TemplateData] = [] 

3265 

3266 # Counter for preventing the capture of etymology templates 

3267 # when we are inside templates that we want to ignore (i.e., 

3268 # not capture). 

3269 ignore_count = 0 

3270 

3271 def etym_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3272 nonlocal ignore_count 

3273 if is_panel_template(wxr, name) or name in ["zh-x", "zh-q"]: 

3274 return "" 

3275 if re.match(ignored_etymology_templates_re, name): 

3276 ignore_count += 1 

3277 return None 

3278 

3279 return None  # any other template is expanded in the default way 

3280 

3281 def etym_post_template_fn( 

3282 name: str, ht: TemplateArgs, expansion: str 

3283 ) -> None: 

3284 nonlocal ignore_count 

3285 if name in wikipedia_templates: 

3286 parse_wikipedia_template(wxr, data, ht) 

3287 return None 

3288 if re.match(ignored_etymology_templates_re, name): 

3289 ignore_count -= 1 

3290 return None 

3291 if ignore_count == 0: 

3292 ht = clean_template_args(wxr, ht) 

3293 expansion = clean_node(wxr, None, expansion) 

3294 templates.append( 

3295 {"name": name, "args": ht, "expansion": expansion} 

3296 ) 

3297 return None 

3298 

3299 # Remove any subsections 

3300 contents = list( 

3301 x 

3302 for x in node.children 

3303 if not isinstance(x, WikiNode) or x.kind not in LEVEL_KINDS 

3304 ) 

3305 # Convert to text, also capturing templates using post_template_fn 

3306 text = clean_node( 

3307 wxr, 

3308 None, 

3309 contents, 

3310 template_fn=etym_template_fn, 

3311 post_template_fn=etym_post_template_fn, 

3312 ).strip(": \n") # remove ":" indent wikitext before zh-x template 

3313 # Save the collected information. 

3314 if len(text) > 0: 3314 ↛ 3316line 3314 didn't jump to line 3316 because the condition on line 3314 was always true

3315 data["etymology_text"] = text 

3316 if len(templates) > 0: 3316 ↛ 3321line 3316 didn't jump to line 3321 because the condition on line 3316 was never true

3317 # Some etymology templates, like Template:root, do not generate 

3318 # text, so they should be added here. Elsewhere, we check 

3319 # for Template:root and add some text to the expansion to please 

3320 # the validation. 

3321 data["etymology_templates"] = templates 
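# Illustrative shape of one captured entry (hypothetical values, not from
# this page): a {{der|en|la|verbum}} template in the etymology text would be
# stored roughly as {"name": "der", "args": {...}, "expansion": "Latin verbum"}.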

3322 

3323 for child_node in node.find_child_recursively( 3323 ↛ exitline 3323 didn't return from function 'parse_etymology' because the loop on line 3323 didn't complete

3324 LEVEL_KIND_FLAGS | NodeKind.TEMPLATE 

3325 ): 

3326 if child_node.kind in LEVEL_KIND_FLAGS: 3326 ↛ 3328line 3326 didn't jump to line 3328 because the condition on line 3326 was always true

3327 break 

3328 elif isinstance( 

3329 child_node, TemplateNode 

3330 ) and child_node.template_name in ["zh-x", "zh-q"]: 

3331 if "etymology_examples" not in data: 

3332 data["etymology_examples"] = [] 

3333 data["etymology_examples"].extend( 

3334 extract_template_zh_x( 

3335 wxr, child_node, None, ExampleData(raw_tags=[], tags=[]) 

3336 ) 

3337 ) 

3338 

3339 def parse_descendants( 

3340 data: WordData, node: WikiNode, is_proto_root_derived_section=False 

3341 ) -> None: 

3342 """Parses a Descendants section. Also used on Derived terms and 

3343 Extensions sections when we are dealing with a root of a reconstructed 

3344 language (i.e. is_proto_root_derived_section == True), as they use the 

3345 same structure. In the latter case, the Wiktionary convention is not to 

3346 title the section as descendants since the immediate offspring of the 

3347 roots are morphologically derived terms within the same proto-language. 

3348 Still, since the rest of the section lists true descendants, we use the 

3349 same function. Entries in the descendants list that are technically 

3350 derived terms will have a field "tags": ["derived"].""" 

3351 assert isinstance(data, dict) 

3352 assert isinstance(node, WikiNode) 

3353 assert isinstance(is_proto_root_derived_section, bool) 

3354 

3355 descendants = [] 

3356 

3357 # Most templates that are not in a LIST should be ignored as they only 

3358 # add formatting, like "desc-top", "der-top3", etc. Any template in 

3359 # unignored_non_list_templates actually contains relevant descendant 

3360 # info. E.g., "CJKV" is often the only line in the Descendants 

3361 # sections in many Chinese/Japanese/Korean/Vietnamese pages, but would 

3362 # be skipped if we didn't handle it specially as it is not part of a 

3363 # LIST, and additionally is in panel_templates. There are probably more 

3364 # such templates that should be added to this... 

3365 unignored_non_list_templates: list[str] = ["CJKV"] 

3366 

3367 def process_list_item_children( 

3368 sarg: str, children: list[Union[str, WikiNode]] 

3369 ) -> None: 

3370 assert isinstance(sarg, str) 

3371 assert isinstance(children, list) 

3372 # The descendants section is a hierarchical bulleted list. sarg is 

3373 # usually some number of "*" characters indicating the level of 

3374 # indentation of the line, e.g. "***" indicates the line will be 

3375 # thrice-indented. A bare ";" is used to indicate a subtitle-like 

3376 # line with no indentation. ":" at the end of one or more "*"s is 

3377 # used to indicate that the bullet will not be displayed. 
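# Illustrative mapping (hypothetical input, not from this page): a list
# item with sarg "**:" is nested two levels deep with a hidden bullet,
# so the item_data constructed below gets {"depth": 2}.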

3378 item_data: DescendantData = {"depth": sarg.count("*")} 

3379 templates: list[TemplateData] = [] 

3380 is_derived = False 

3381 

3382 # Counter for preventing the capture of templates when we are inside 

3383 # templates that we want to ignore (i.e., not capture). 

3384 ignore_count = 0 

3385 

3386 def desc_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3387 nonlocal ignore_count 

3388 if ( 

3389 is_panel_template(wxr, name) 

3390 and name not in unignored_non_list_templates 

3391 ): 

3392 return "" 

3393 if re.match(ignored_descendants_templates_re, name): 

3394 ignore_count += 1 

3395 return None 

3396 

3397 def desc_post_template_fn( 

3398 name: str, ht: TemplateArgs, expansion: str 

3399 ) -> None: 

3400 nonlocal ignore_count 

3401 if name in wikipedia_templates: 

3402 parse_wikipedia_template(wxr, data, ht) 

3403 return None 

3404 if re.match(ignored_descendants_templates_re, name): 

3405 ignore_count -= 1 

3406 return None 

3407 if ignore_count == 0: 

3408 ht = clean_template_args(wxr, ht) 

3409 nonlocal is_derived 

3410 # If we're in a proto-root Derived terms or Extensions 

3411 # section, and the current list item has a link template 

3412 # to a term in the same proto-language, then we tag this 

3413 # descendant entry with "derived" 

3414 is_derived = ( 

3415 is_proto_root_derived_section 

3416 and (name == "l" or name == "link") 

3417 and ("1" in ht and ht["1"] == lang_code) 

3418 ) 

3419 expansion = clean_node(wxr, None, expansion) 

3420 templates.append( 

3421 {"name": name, "args": ht, "expansion": expansion} 

3422 ) 

3423 return None 

3424 

3425 text = clean_node( 

3426 wxr, 

3427 None, 

3428 children, 

3429 template_fn=desc_template_fn, 

3430 post_template_fn=desc_post_template_fn, 

3431 ) 

3432 item_data["templates"] = templates 

3433 item_data["text"] = text 

3434 if is_derived: 

3435 item_data["tags"] = ["derived"] 

3436 descendants.append(item_data) 

3437 

3438 def node_children(node: WikiNode) -> Iterator[tuple[int, WikiNode]]: 

3439 for i, child in enumerate(node.children): 

3440 if isinstance(child, WikiNode): 

3441 yield (i, child) 

3442 

3443 def get_sublist_index(list_item: WikiNode) -> Optional[int]: 

3444 for i, child in node_children(list_item): 

3445 if child.kind == NodeKind.LIST: 

3446 return i 

3447 return None 

3448 

3449 def get_descendants(node: WikiNode) -> None: 

3450 """Appends the data for every list item in every list in node 

3451 to descendants.""" 

3452 for _, c in node_children(node): 

3453 if ( 

3454 c.kind == NodeKind.TEMPLATE 

3455 and c.largs 

3456 and len(c.largs[0]) == 1 

3457 and isinstance(c.largs[0][0], str) 

3458 and c.largs[0][0] in unignored_non_list_templates 

3459 ): 

3460 # Some Descendants sections have no wikitext list. Rather, 

3461 # the list is entirely generated by a single template (see 

3462 # e.g. the use of {{CJKV}} in Chinese entries). 

3463 process_list_item_children("", [c]) 

3464 elif c.kind == NodeKind.HTML: 

3465 # The Descendants sections for many languages feature 

3466 # templates that generate html to add styling (e.g. using 

3467 # multiple columns) to the list, so that the actual wikitext 

3468 # list items are found within a <div>. We look within the 

3469 # children of the html node for the actual list items. 

3470 get_descendants(c) 

3471 elif c.kind == NodeKind.LIST: 

3472 get_descendants(c) 

3473 elif c.kind == NodeKind.LIST_ITEM: 

3474 # If a LIST_ITEM has subitems in a sublist, usually its 

3475 # last child is a LIST. However, sometimes after the LIST 

3476 # there is one or more trailing LIST_ITEMs, like "\n" or 

3477 # a reference template. If there is a sublist, we discard 

3478 # everything after it. 

3479 i = get_sublist_index(c) 

3480 if i is not None: 

3481 process_list_item_children(c.sarg, c.children[:i]) 

3482 get_descendants(c.children[i]) # type: ignore[arg-type] 

3483 else: 

3484 process_list_item_children(c.sarg, c.children) 

3485 

3486 # The actual work of parse_descendants() starts here 

3487 get_descendants(node) 

3488 

3489 # if e.g. on a PIE page, there may be both Derived terms and Extensions 

3490 # sections, in which case this function will be called multiple times, 

3491 # so we have to check if descendants exists first. 

3492 if "descendants" in data: 

3493 data["descendants"].extend(descendants) 

3494 else: 

3495 data["descendants"] = descendants 

3496 

3497 def process_children(treenode: WikiNode, pos: Optional[str]) -> None: 

3498 """This recurses into a subtree in the parse tree for a page.""" 

3499 nonlocal etym_data 

3500 nonlocal pos_data 

3501 nonlocal inside_level_four 

3502 

3503 redirect_list: list[str] = [] # for `zh-see` template 

3504 

3505 def skip_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

3506 """This is called for otherwise unprocessed parts of the page. 

3507 We still expand them so that e.g. Category links get captured.""" 

3508 if name in wikipedia_templates: 3508 ↛ 3509line 3508 didn't jump to line 3509 because the condition on line 3508 was never true

3509 data = select_data() 

3510 parse_wikipedia_template(wxr, data, ht) 

3511 return None 

3512 if is_panel_template(wxr, name): 3512 ↛ 3513line 3512 didn't jump to line 3513 because the condition on line 3512 was never true

3513 return "" 

3514 return None 

3515 

3516 for node in treenode.children: 

3517 # print(node) 

3518 if not isinstance(node, WikiNode): 

3519 # print(" X{}".format(repr(node)[:40])) 

3520 continue 

3521 if isinstance(node, TemplateNode): 

3522 if process_soft_redirect_template(wxr, node, redirect_list): 

3523 continue 

3524 elif node.template_name == "zh-forms": 3524 ↛ 3525line 3524 didn't jump to line 3525 because the condition on line 3524 was never true

3525 process_zh_forms_templates(wxr, node, base_data) 

3526 

3527 if node.kind not in LEVEL_KINDS: 

3528 # XXX handle e.g. wikipedia links at the top of a language 

3529 # XXX should at least capture "also" at top of page 

3530 if node.kind in ( 

3531 NodeKind.HLINE, 

3532 NodeKind.LIST, 

3533 NodeKind.LIST_ITEM, 

3534 ): 

3535 continue 

3536 # print(" UNEXPECTED: {}".format(node)) 

3537 # Clean the node to collect category links 

3538 clean_node(wxr, etym_data, node, template_fn=skip_template_fn) 

3539 continue 

3540 t = clean_node( 

3541 wxr, etym_data, node.sarg if node.sarg else node.largs 

3542 ) 

3543 t = t.lower() 

3544 # XXX these counts were never implemented fully, and even this 

3545 # gets discarded: Search STATISTICS_IMPLEMENTATION 

3546 wxr.config.section_counts[t] += 1 

3547 # print("PROCESS_CHILDREN: T:", repr(t)) 

3548 if t in IGNORED_TITLES: 3548 ↛ 3549line 3548 didn't jump to line 3549 because the condition on line 3548 was never true

3549 pass 

3550 elif t.startswith(PRONUNCIATION_TITLE): 3550 ↛ 3555line 3550 didn't jump to line 3555 because the condition on line 3550 was never true

3551 # Chinese Pronunciation section kludge; we demote these to 

3552 # be level 4 instead of 3 so that they're part of a larger 

3553 # etymology hierarchy; usually the data here is empty and 

3554 # acts as an intermediate level between POS and Etymology data 

3555 inside_level_four = True 

3556 if t.startswith(PRONUNCIATION_TITLE + " "): 

3557 # Pronunciation 1, etc., are used in Chinese Glyphs, 

3558 # and each of them may have senses under Definition 

3559 push_level_four_section() 

3560 wxr.wtp.start_subsection(None) 

3561 if wxr.config.capture_pronunciation: 

3562 data = select_data() 

3563 parse_pronunciation( 

3564 wxr, 

3565 node, 

3566 data, 

3567 etym_data, 

3568 have_etym, 

3569 base_data, 

3570 lang_code, 

3571 ) 

3572 elif t.startswith(tuple(ETYMOLOGY_TITLES)): 

3573 push_etym() 

3574 wxr.wtp.start_subsection(None) 

3575 if wxr.config.capture_etymologies: 3575 ↛ 3640line 3575 didn't jump to line 3640 because the condition on line 3575 was always true

3576 m = re.search(r"\s(\d+)$", t) 

3577 if m: 3577 ↛ 3578line 3577 didn't jump to line 3578 because the condition on line 3577 was never true

3578 etym_data["etymology_number"] = int(m.group(1)) 

3579 parse_etymology(etym_data, node) 

3580 elif t == DESCENDANTS_TITLE and wxr.config.capture_descendants: 3580 ↛ 3581line 3580 didn't jump to line 3581 because the condition on line 3580 was never true

3581 data = select_data() 

3582 parse_descendants(data, node) 

3583 elif ( 3583 ↛ 3589line 3583 didn't jump to line 3589

3584 t in PROTO_ROOT_DERIVED_TITLES 

3585 and pos == "root" 

3586 and is_reconstruction 

3587 and wxr.config.capture_descendants 

3588 ): 

3589 data = select_data() 

3590 parse_descendants(data, node, True) 

3591 elif t == TRANSLATIONS_TITLE: 

3592 data = select_data() 

3593 parse_translations(data, node) 

3594 elif t in INFLECTION_TITLES: 3594 ↛ 3595line 3594 didn't jump to line 3595 because the condition on line 3594 was never true

3595 parse_inflection(node, t, pos) 

3596 else: 

3597 lst = t.split() 

3598 while len(lst) > 1 and lst[-1].isdigit(): 3598 ↛ 3599line 3598 didn't jump to line 3599 because the condition on line 3598 was never true

3599 lst = lst[:-1] 

3600 t_no_number = " ".join(lst).lower() 

3601 if t_no_number in POS_TITLES: 

3602 push_pos() 

3603 dt = POS_TITLES[t_no_number] # type:ignore[literal-required] 

3604 pos = dt["pos"] or "MISSING_POS" 

3605 wxr.wtp.start_subsection(t) 

3606 if "debug" in dt: 3606 ↛ 3607line 3606 didn't jump to line 3607 because the condition on line 3606 was never true

3607 wxr.wtp.debug( 

3608 "{} in section {}".format(dt["debug"], t), 

3609 sortid="page/2755", 

3610 ) 

3611 if "warning" in dt: 3611 ↛ 3612line 3611 didn't jump to line 3612 because the condition on line 3611 was never true

3612 wxr.wtp.warning( 

3613 "{} in section {}".format(dt["warning"], t), 

3614 sortid="page/2759", 

3615 ) 

3616 if "error" in dt: 3616 ↛ 3617line 3616 didn't jump to line 3617 because the condition on line 3616 was never true

3617 wxr.wtp.error( 

3618 "{} in section {}".format(dt["error"], t), 

3619 sortid="page/2763", 

3620 ) 

3621 # Parse word senses for the part-of-speech 

3622 parse_part_of_speech(node, pos) 

3623 if "tags" in dt: 3623 ↛ 3624line 3623 didn't jump to line 3624 because the condition on line 3623 was never true

3624 for pdata in pos_datas: 

3625 data_extend(pdata, "tags", dt["tags"]) 

3626 elif t_no_number in LINKAGE_TITLES: 3626 ↛ 3630line 3626 didn't jump to line 3630 because the condition on line 3626 was always true

3627 rel = LINKAGE_TITLES[t_no_number] 

3628 data = select_data() 

3629 parse_linkage(data, rel, node) 

3630 elif t_no_number == COMPOUNDS_TITLE: 

3631 data = select_data() 

3632 if wxr.config.capture_compounds: 

3633 parse_linkage(data, "derived", node) 

3634 

3635 # XXX parse interesting templates also from other sections. E.g., 

3636 # {{Letter|...}} in ===See also=== 

3637 # Also <gallery> 

3638 

3639 # Recurse to children of this node, processing subtitles therein 

3640 stack.append(t) 

3641 process_children(node, pos) 

3642 stack.pop() 

3643 

3644 if len(redirect_list) > 0: 

3645 if len(pos_data) > 0: 

3646 pos_data["redirects"] = redirect_list 

3647 if "pos" not in pos_data: 3647 ↛ 3648line 3647 didn't jump to line 3648 because the condition on line 3647 was never true

3648 pos_data["pos"] = "soft-redirect" 

3649 else: 

3650 new_page_data = copy.deepcopy(base_data) 

3651 new_page_data["redirects"] = redirect_list 

3652 if "pos" not in new_page_data: 3652 ↛ 3654line 3652 didn't jump to line 3654 because the condition on line 3652 was always true

3653 new_page_data["pos"] = "soft-redirect" 

3654 new_page_data["senses"] = [{"tags": ["no-gloss"]}] 

3655 page_datas.append(new_page_data) 

3656 

3657 def extract_examples( 

3658 others: list[WikiNode], sense_base: SenseData 

3659 ) -> list[ExampleData]: 

3660 """Parses through a list of definitions and quotes to find examples. 

3661 Returns a list of example dicts to be added to sense data. Adds 

3662 meta-data, mostly categories, into sense_base.""" 

3663 assert isinstance(others, list) 

3664 examples: list[ExampleData] = [] 

3665 

3666 for sub in others: 

3667 if not sub.sarg.endswith((":", "*")): 3667 ↛ 3668line 3667 didn't jump to line 3668 because the condition on line 3667 was never true

3668 continue 

3669 for item in sub.children: 

3670 if not isinstance(item, WikiNode): 3670 ↛ 3671line 3670 didn't jump to line 3671 because the condition on line 3670 was never true

3671 continue 

3672 if item.kind != NodeKind.LIST_ITEM: 3672 ↛ 3673line 3672 didn't jump to line 3673 because the condition on line 3672 was never true

3673 continue 

3674 usex_type = None 

3675 example_template_args = [] 

3676 example_template_names = [] 

3677 taxons = set() 

3678 

3679 # Use extract_example_list_item() to handle Chinese, Japanese and 

3680 # quotation templates; if it returns examples, skip the rest of this loop. 

3681 new_example_lists = extract_example_list_item( 

3682 wxr, item, sense_base, ExampleData(raw_tags=[], tags=[]) 

3683 ) 

3684 if len(new_example_lists) > 0: 3684 ↛ 3685line 3684 didn't jump to line 3685 because the condition on line 3684 was never true

3685 examples.extend(new_example_lists) 

3686 continue 

3687 

3688 def usex_template_fn( 

3689 name: str, ht: TemplateArgs 

3690 ) -> Optional[str]: 

3691 nonlocal usex_type 

3692 if is_panel_template(wxr, name): 

3693 return "" 

3694 if name in usex_templates: 

3695 usex_type = "example" 

3696 example_template_args.append(ht) 

3697 example_template_names.append(name) 

3698 elif name in quotation_templates: 

3699 usex_type = "quotation" 

3700 elif name in taxonomy_templates: 

3701 taxons.update(ht.get(1, "").split()) 

3702 for prefix in template_linkages: 

3703 if re.search( 

3704 r"(^|[-/\s]){}($|\b|[0-9])".format(prefix), name 

3705 ): 

3706 return "" 

3707 return None 

3708 

3709 # bookmark 

3710 ruby: list[tuple[str, str]] = [] 

3711 contents = item.children 

3712 if lang_code == "ja": 

3713 # Capture ruby contents if this is a Japanese language 

3714 # example. 

3715 # print(contents) 

3716 if ( 3716 ↛ 3721line 3716 didn't jump to line 3721

3717 contents 

3718 and isinstance(contents[0], str)  # check the first child, not the list 

3719 and re.match(r"\s*$", contents[0]) 

3720 ): 

3721 contents = contents[1:] 

3722 exp = wxr.wtp.parse( 

3723 wxr.wtp.node_to_wikitext(contents), 

3724 # post_template_fn=head_post_template_fn, 

3725 expand_all=True, 

3726 ) 

3727 rub, rest = extract_ruby(wxr, exp.children) 

3728 if rub: 3728 ↛ 3732line 3728 didn't jump to line 3732 because the condition on line 3728 was always true

3729 for rtup in rub: 

3730 ruby.append(rtup) 

3731 contents = rest 

3732 subtext = clean_node( 

3733 wxr, sense_base, contents, template_fn=usex_template_fn 

3734 ) 

3735 

3736 frozen_taxons = frozenset(taxons) 

3737 classify_desc2 = partial(classify_desc, accepted=frozen_taxons) 

3738 

3739 # print(f"{subtext=}") 

3740 subtext = re.sub( 

3741 r"\s*\(please add an English " 

3742 r"translation of this " 

3743 r"(example|usage example|quote)\)", 

3744 "", 

3745 subtext, 

3746 ).strip() 

3747 subtext = re.sub(r"\^\([^)]*\)", "", subtext) 

3748 subtext = re.sub(r"\s*[―—]+$", "", subtext) 

3749 # print("subtext:", repr(subtext)) 

3750 

3751 lines = subtext.splitlines() 

3752 # print(lines) 

3753 

3754 lines = list(re.sub(r"^[#:*]*", "", x).strip() for x in lines) 

3755 lines = list( 

3756 x 

3757 for x in lines 

3758 if not re.match( 

3759 r"(Synonyms: |Antonyms: |Hyponyms: |" 

3760 r"Synonym: |Antonym: |Hyponym: |" 

3761 r"Hypernyms: |Derived terms: |" 

3762 r"Related terms: |" 

3763 r"Hypernym: |Derived term: |" 

3764 r"Coordinate terms:|" 

3765 r"Related term: |" 

3766 r"For more quotations using )", 

3767 x, 

3768 ) 

3769 ) 

3770 tr = "" 

3771 ref = "" 

3772 roman = "" 

3773 # for line in lines: 

3774 # print("LINE:", repr(line)) 

3775 # print(classify_desc(line)) 

3776 if len(lines) == 1 and lang_code != "en": 3776 ↛ 3777line 3776 didn't jump to line 3777 because the condition on line 3776 was never true

3777 parts = example_splitter_re.split(lines[0]) 

3778 if ( 

3779 len(parts) > 2 

3780 and len(example_template_args) == 1 

3781 and any( 

3782 ("―" in s) or ("—" in s) 

3783 for s in example_template_args[0].values() 

3784 ) 

3785 ): 

3786 if nparts := synch_splits_with_args( 

3787 lines[0], example_template_args[0] 

3788 ): 

3789 parts = nparts 

3790 if ( 

3791 len(example_template_args) == 1 

3792 and "lit" in example_template_args[0] 

3793 ): 

3794 # ugly brute-force kludge in case there's a lit= arg 

3795 literally = example_template_args[0].get("lit", "") 

3796 if literally: 

3797 literally = ( 

3798 " (literally, “" 

3799 + clean_value(wxr, literally) 

3800 + "”)" 

3801 ) 

3802 else: 

3803 literally = "" 

3804 if ( 

3805 len(example_template_args) == 1 

3806 and len(parts) == 2 

3807 and len(example_template_args[0]) 

3808 - ( 

3809 # horrible kludge to ignore these arguments 

3810 # when calculating how many there are 

3811 sum( 

3812 s in example_template_args[0] 

3813 for s in ( 

3814 "lit", # generates text, but we handle it 

3815 "inline", 

3816 "noenum", 

3817 "nocat", 

3818 "sort", 

3819 ) 

3820 ) 

3821 ) 

3822 == 3 

3823 and clean_value( 

3824 wxr, example_template_args[0].get(2, "") 

3825 ) 

3826 == parts[0].strip() 

3827 and clean_value( 

3828 wxr, 

3829 ( 

3830 example_template_args[0].get(3) 

3831 or example_template_args[0].get("translation") 

3832 or example_template_args[0].get("t", "") 

3833 ) 

3834 + literally, # in case there's a lit= argument 

3835 ) 

3836 == parts[1].strip() 

3837 ): 

3838 # {{exampletemplate|ex|Foo bar baz|English translation}} 

3839 # is a pretty reliable 'heuristic', so we use it here 

3840 # before the others. To be extra sure the template 

3841 # doesn't do anything weird, we compare the arguments 

3842 # and the output to each other. 

3843 lines = [parts[0].strip()] 

3844 tr = parts[1].strip() 

3845 elif ( 

3846 len(parts) == 2 

3847 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3848 ): 

3849 # These other branches just do some simple heuristics w/ 

3850 # the expanded output of the template (if applicable). 

3851 lines = [parts[0].strip()] 

3852 tr = parts[1].strip() 

3853 elif ( 

3854 len(parts) == 3 

3855 and classify_desc2(parts[1]) 

3856 in ("romanization", "english") 

3857 and classify_desc2(parts[2]) in ENGLISH_TEXTS 

3858 ): 

3859 lines = [parts[0].strip()] 

3860 roman = parts[1].strip() 

3861 tr = parts[2].strip() 

3862 else: 

3863 parts = re.split(r"\s+-\s+", lines[0]) 

3864 if ( 

3865 len(parts) == 2 

3866 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3867 ): 

3868 lines = [parts[0].strip()] 

3869 tr = parts[1].strip() 

3870 elif len(lines) > 1: 

3871 if any( 3871 ↛ 3874line 3871 didn't jump to line 3874 because the condition on line 3871 was never true

3872 re.search(r"[]\d:)]\s*$", x) for x in lines[:-1] 

3873 ) and not (len(example_template_names) == 1): 

3874 refs: list[str] = [] 

3875 for i in range(len(lines)): 

3876 if re.match(r"^[#*]*:+(\s*$|\s+)", lines[i]): 

3877 break 

3878 refs.append(lines[i].strip()) 

3879 if re.search(r"[]\d:)]\s*$", lines[i]): 

3880 break 

3881 ref = " ".join(refs) 

3882 lines = lines[i + 1 :] 

3883 if ( 

3884 lang_code != "en" 

3885 and len(lines) >= 2 

3886 and classify_desc2(lines[-1]) in ENGLISH_TEXTS 

3887 ): 

3888 i = len(lines) - 1 

3889 while ( 

3890 i > 1 

3891 and classify_desc2(lines[i - 1]) 

3892 in ENGLISH_TEXTS 

3893 ): 

3894 i -= 1 

3895 tr = "\n".join(lines[i:]) 

3896 lines = lines[:i] 

3897 if len(lines) >= 2: 

3898 if classify_desc2(lines[-1]) == "romanization": 

3899 roman = lines[-1].strip() 

3900 lines = lines[:-1] 

3901 

3902 elif lang_code == "en" and re.match(r"^[#*]*:+", lines[1]): 3902 ↛ 3903line 3902 didn't jump to line 3903 because the condition on line 3902 was never true

3903 ref = lines[0] 

3904 lines = lines[1:] 

3905 elif lang_code != "en" and len(lines) == 2: 3905 ↛ 3906line 3905 didn't jump to line 3906 because the condition on line 3905 was never true

3906 cls1 = classify_desc2(lines[0]) 

3907 cls2 = classify_desc2(lines[1]) 

3908 if cls2 in ENGLISH_TEXTS and cls1 != "english": 

3909 tr = lines[1] 

3910 lines = [lines[0]] 

3911 elif cls1 in ENGLISH_TEXTS and cls2 != "english": 

3912 tr = lines[0] 

3913 lines = [lines[1]] 

3914 elif ( 

3915 re.match(r"^[#*]*:+", lines[1]) 

3916 and classify_desc2( 

3917 re.sub(r"^[#*:]+\s*", "", lines[1]) 

3918 ) 

3919 in ENGLISH_TEXTS 

3920 ): 

3921 tr = re.sub(r"^[#*:]+\s*", "", lines[1]) 

3922 lines = [lines[0]] 

3923 elif cls1 == "english" and cls2 in ENGLISH_TEXTS: 

3924 # Both were classified as English, but 

3925 # presumably one is not. Assume first is 

3926 # non-English, as that seems more common. 

3927 tr = lines[1] 

3928 lines = [lines[0]] 

3929 elif ( 3929 ↛ 3945line 3929 didn't jump to line 3945

3930 usex_type != "quotation" 

3931 and lang_code != "en" 

3932 and len(lines) == 3 

3933 ): 

3934 cls1 = classify_desc2(lines[0]) 

3935 cls2 = classify_desc2(lines[1]) 

3936 cls3 = classify_desc2(lines[2]) 

3937 if ( 3937 ↛ 3968line 3937 didn't jump to line 3968

3938 cls3 == "english" 

3939 and cls2 in ("english", "romanization") 

3940 and cls1 != "english" 

3941 ): 

3942 tr = lines[2].strip() 

3943 roman = lines[1].strip() 

3944 lines = [lines[0].strip()] 

3945 elif ( 

3946 usex_type == "quotation" 

3947 and lang_code != "en" 

3948 and len(lines) > 2 

3949 ): 

3950 # for x in lines: 

3951 # print(" LINE: {}: {}" 

3952 # .format(classify_desc2(x), x)) 

3953 if re.match(r"^[#*]*:+\s*$", lines[1]): 

3954 ref = lines[0] 

3955 lines = lines[2:] 

3956 cls1 = classify_desc2(lines[-1]) 

3957 if cls1 == "english": 

3958 i = len(lines) - 1 

3959 while ( 

3960 i > 1 

3961 and classify_desc2(lines[i - 1]) 

3962 in ENGLISH_TEXTS 

3963 ): 

3964 i -= 1 

3965 tr = "\n".join(lines[i:]) 

3966 lines = lines[:i] 

3967 

3968 roman = re.sub(r"[ \t\r]+", " ", roman).strip() 

3969 roman = re.sub(r"\[\s*…\s*\]", "[…]", roman) 

3970 tr = re.sub(r"^[#*:]+\s*", "", tr) 

3971 tr = re.sub(r"[ \t\r]+", " ", tr).strip() 

3972 tr = re.sub(r"\[\s*…\s*\]", "[…]", tr) 

3973 ref = re.sub(r"^[#*:]+\s*", "", ref) 

3974 ref = re.sub( 

3975 r", (volume |number |page )?“?" 

3976 r"\(please specify ([^)]|\(s\))*\)”?|" 

3977 ", text here$", 

3978 "", 

3979 ref, 

3980 ) 

3981 ref = re.sub(r"\[\s*…\s*\]", "[…]", ref) 

3982 lines = list(re.sub(r"^[#*:]+\s*", "", x) for x in lines) 

3983 subtext = "\n".join(x for x in lines if x) 

3984 if not tr and lang_code != "en": 3984 ↛ 3985line 3984 didn't jump to line 3985 because the condition on line 3984 was never true

3985 m = re.search(r"([.!?])\s+\(([^)]+)\)\s*$", subtext) 

3986 if m and classify_desc2(m.group(2)) in ENGLISH_TEXTS: 

3987 tr = m.group(2) 

3988 subtext = subtext[: m.start()] + m.group(1) 

3989 elif lines: 

3990 parts = re.split(r"\s*[―—]+\s*", lines[0]) 

3991 if ( 

3992 len(parts) == 2 

3993 and classify_desc2(parts[1]) in ENGLISH_TEXTS 

3994 ): 

3995 subtext = parts[0].strip() 

3996 tr = parts[1].strip() 

3997 subtext = re.sub(r'^[“"`]([^“"`”\']*)[”"\']$', r"\1", subtext) 

3998 subtext = re.sub( 

3999 r"(please add an English translation of " 

4000 r"this (quote|usage example))", 

4001 "", 

4002 subtext, 

4003 ) 

4004 subtext = re.sub( 

4005 r"\s*→New International Version " "translation$", 

4006 "", 

4007 subtext, 

4008 ) # e.g. pis/Tok Pisin (Bible) 

4009 subtext = re.sub(r"[ \t\r]+", " ", subtext).strip() 

4010 subtext = re.sub(r"\[\s*…\s*\]", "[…]", subtext) 

4011 note = None 

4012 m = re.match(r"^\(([^)]*)\):\s+", subtext) 

4013 if ( 4013 ↛ 4021line 4013 didn't jump to line 4021

4014 m is not None 

4015 and lang_code != "en" 

4016 and ( 

4017 m.group(1).startswith("with ") 

4018 or classify_desc2(m.group(1)) == "english" 

4019 ) 

4020 ): 

4021 note = m.group(1) 

4022 subtext = subtext[m.end() :] 

4023 ref = re.sub(r"\s*\(→ISBN\)", "", ref) 

4024 ref = re.sub(r",\s*→ISBN", "", ref) 

4025 ref = ref.strip() 

4026 if ref.endswith(":") or ref.endswith(","): 4026 ↛ 4027line 4026 didn't jump to line 4027 because the condition on line 4026 was never true

4027 ref = ref[:-1].strip() 

4028 ref = re.sub(r"\s+,\s+", ", ", ref) 

4029 ref = re.sub(r"\s+", " ", ref) 

4030 if ref and not subtext: 4030 ↛ 4031line 4030 didn't jump to line 4031 because the condition on line 4030 was never true

4031 subtext = ref 

4032 ref = "" 

4033 if subtext: 4033 ↛ 3669line 4033 didn't jump to line 3669 because the condition on line 4033 was always true

4034 dt: ExampleData = {"text": subtext} 

4035 if ref: 4035 ↛ 4036line 4035 didn't jump to line 4036 because the condition on line 4035 was never true

4036 dt["ref"] = ref 

4037 if tr: 

4038 dt["english"] = tr 

4039 if usex_type: 4039 ↛ 4040line 4039 didn't jump to line 4040 because the condition on line 4039 was never true

4040 dt["type"] = usex_type 

4041 if note: 4041 ↛ 4042line 4041 didn't jump to line 4042 because the condition on line 4041 was never true

4042 dt["note"] = note 

4043 if roman: 

4044 dt["roman"] = roman 

4045 if ruby: 

4046 dt["ruby"] = ruby 

4047 examples.append(dt) 

4048 

4049 return examples 
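# Illustrative result (hypothetical values, not from this page): a parsed
# quotation might come back as {"text": "non-English example sentence",
# "english": "its translation", "type": "quotation"}, with "ref", "roman",
# "ruby" and "note" added only when they were found.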

4050 

4051 # Main code of parse_language() 

4052 # Process the section 

4053 stack.append(language) 

4054 process_children(langnode, None) 

4055 stack.pop() 

4056 

4057 # Finalize word entries 

4058 push_etym() 

4059 ret = [] 

4060 for data in page_datas: 

4061 merge_base(data, base_data) 

4062 ret.append(data) 

4063 

4064 # Copy all tags to word senses 

4065 for data in ret: 

4066 if "senses" not in data: 4066 ↛ 4067line 4066 didn't jump to line 4067 because the condition on line 4066 was never true

4067 continue 

4068 # WordData should not have a 'tags' field, but if it does, it's 

4069 # deleted and its contents are moved into each sense; 

4070 # that's why the "type: ignore" comments. 

4071 tags: Iterable = data.get("tags", ()) # type: ignore[assignment] 

4072 if "tags" in data: 4072 ↛ 4073line 4072 didn't jump to line 4073 because the condition on line 4072 was never true

4073 del data["tags"] # type: ignore[typeddict-item] 

4074 for sense in data["senses"]: 

4075 data_extend(sense, "tags", tags) 

4076 

4077 return ret 

4078 

4079 

4080def parse_wikipedia_template( 

4081 wxr: WiktextractContext, data: WordData, ht: TemplateArgs 

4082) -> None: 

4083 """Helper function for parsing {{wikipedia|...}} and related templates.""" 

4084 assert isinstance(wxr, WiktextractContext) 

4085 assert isinstance(data, dict) 

4086 assert isinstance(ht, dict) 

4087 langid = clean_node(wxr, data, ht.get("lang", ())) 

4088 pagename = ( 

4089 clean_node(wxr, data, ht.get(1, ())) 

4090 or wxr.wtp.title 

4091 or "MISSING_PAGE_TITLE" 

4092 ) 

4093 if langid: 

4094 data_append(data, "wikipedia", langid + ":" + pagename) 

4095 else: 

4096 data_append(data, "wikipedia", pagename) 
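# Illustrative behaviour (hypothetical input, not from this page):
# {{wikipedia|lang=de|Beispiel}} appends "de:Beispiel" to data["wikipedia"],
# while a bare {{wikipedia}} on the page "dog" appends just "dog".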

4097 

4098 

4099def parse_top_template( 

4100 wxr: WiktextractContext, node: WikiNode, data: WordData 

4101) -> None: 

4102 """Parses a template that occurs on the top-level in a page, before any 

4103 language subtitles.""" 

4104 assert isinstance(wxr, WiktextractContext) 

4105 assert isinstance(node, WikiNode) 

4106 assert isinstance(data, dict) 

4107 

4108 def top_template_fn(name: str, ht: TemplateArgs) -> Optional[str]: 

4109 if name in wikipedia_templates: 

4110 parse_wikipedia_template(wxr, data, ht) 

4111 return None 

4112 if is_panel_template(wxr, name): 

4113 return "" 

4114 if name in ("reconstruction",): 

4115 return "" 

4116 if name.lower() == "also": 

4117 # XXX shows related words that might really have been the intended 

4118 # word, capture them 

4119 return "" 

4120 if name == "see also": 

4121 # XXX capture 

4122 return "" 

4123 if name == "cardinalbox": 

4124 # XXX capture 

4125 return "" 

4126 if name == "character info": 

4127 # XXX capture 

4128 return "" 

4129 if name == "commonscat": 

4130 # XXX capture link to Wikimedia commons 

4131 return "" 

4132 if name == "wrongtitle": 

4133 # XXX this should be captured to replace page title with the 

4134 # correct title. E.g. ⿰亻革家 

4135 return "" 

4136 if name == "wikidata": 

4137 arg = clean_node(wxr, data, ht.get(1, ())) 

4138 if arg.startswith("Q") or arg.startswith("Lexeme:L"): 

4139 data_append(data, "wikidata", arg) 

4140 return "" 

4141 wxr.wtp.debug( 

4142 "UNIMPLEMENTED top-level template: {} {}".format(name, ht), 

4143 sortid="page/2870", 

4144 ) 

4145 return "" 

4146 

4147 clean_node(wxr, None, [node], template_fn=top_template_fn) 
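# Illustrative behaviour (hypothetical input, not from this page): a
# top-level {{wikidata|Q42}} adds "Q42" to data["wikidata"], and any
# {{wikipedia|...}} template is routed through parse_wikipedia_template().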

4148 

4149 

4150def fix_subtitle_hierarchy(wxr: WiktextractContext, text: str) -> str: 

4151 """Fix subtitle hierarchy to be strict Language -> Etymology -> 

4152 Part-of-Speech -> Translation/Linkage. Also merge Etymology sections 

4153 that are next to each other.""" 

4154 

4155 # Wiktextract issue #620, Chinese Glyph Origin before an etymology 

4156 # section gets overwritten. In this case, let's just combine the two. 

4157 

4158 # In Chinese entries, Pronunciation can be preceded on the 

4159 # same level 3 by its Etymology *and* Glyph Origin sections: 

4160 # ===Glyph Origin=== 

4161 # ===Etymology=== 

4162 # ===Pronunciation=== 

4163 # Tatu suggested adding a new 'level' between 3 and 4, so Pronunciation 

4164 # is now Level 4, POS is shifted to Level 5 and the rest (incl. 'default') 

4165 # are now level 6 

4166 

4167 # Known lowercase PoS names are in part_of_speech_map 

4168 # Known lowercase linkage section names are in linkage_map 

4169 

4170 old = re.split( 

4171 r"(?m)^(==+)[ \t]*([^= \t]([^=\n]|=[^=])*?)" r"[ \t]*(==+)[ \t]*$", text 

4172 ) 

4173 

4174 parts = [] 

4175 npar = 4 # Number of parentheses in above expression 

4176 parts.append(old[0]) 

4177 prev_level = None 

4178 level = None 

4179 skip_level_title = False # When combining etymology sections 

4180 for i in range(1, len(old), npar + 1): 

4181 left = old[i] 

4182 right = old[i + npar - 1] 

4183 # remove Wikilinks in title 

4184 title = re.sub(r"^\[\[", "", old[i + 1]) 

4185 title = re.sub(r"\]\]$", "", title) 

4186 prev_level = level 

4187 level = len(left) 

4188 part = old[i + npar] 

4189 if level != len(right): 4189 ↛ 4190line 4189 didn't jump to line 4190 because the condition on line 4189 was never true

4190 wxr.wtp.debug( 

4191 "subtitle has unbalanced levels: " 

4192 "{!r} has {} on the left and {} on the right".format( 

4193 title, left, right 

4194 ), 

4195 sortid="page/2904", 

4196 ) 

4197 lc = title.lower() 

4198 if name_to_code(title, "en") != "": 

4199 if level > 2: 4199 ↛ 4200line 4199 didn't jump to line 4200 because the condition on line 4199 was never true

4200 wxr.wtp.debug( 

4201 "subtitle has language name {} at level {}".format( 

4202 title, level 

4203 ), 

4204 sortid="page/2911", 

4205 ) 

4206 level = 2 

4207 elif lc.startswith(tuple(ETYMOLOGY_TITLES)): 

4208 if level > 3: 4208 ↛ 4209line 4208 didn't jump to line 4209 because the condition on line 4208 was never true

4209 wxr.wtp.debug( 

4210 "etymology section {} at level {}".format(title, level), 

4211 sortid="page/2917", 

4212 ) 

4213 if prev_level == 3: # Two etymology (Glyph Origin + Etymology) 4213 ↛ 4215line 4213 didn't jump to line 4215 because the condition on line 4213 was never true

4214 # sections cheek-to-cheek 

4215 skip_level_title = True 

4216 # Modify the title of the previous ("Glyph Origin") section, in 

4217 # case we have a meaningful title like "Etymology 1" 

4218 parts[-2] = "{}{}{}".format("=" * level, title, "=" * level) 

4219 level = 3 

4220 elif lc.startswith(PRONUNCIATION_TITLE): 4220 ↛ 4223line 4220 didn't jump to line 4223 because the condition on line 4220 was never true

4221 # Pronunciation is now a level between POS and Etymology, so 

4222 # we need to shift everything down by one 

4223 level = 4 

4224 elif lc in POS_TITLES: 

4225 level = 5 

4226 elif lc == TRANSLATIONS_TITLE: 

4227 level = 6 

4228 elif lc in LINKAGE_TITLES or lc == COMPOUNDS_TITLE: 4228 ↛ 4230line 4228 didn't jump to line 4230 because the condition on line 4228 was always true

4229 level = 6 

4230 elif lc in INFLECTION_TITLES: 

4231 level = 6 

4232 elif lc == DESCENDANTS_TITLE: 

4233 level = 6 

4234 elif title in PROTO_ROOT_DERIVED_TITLES: 

4235 level = 6 

4236 elif lc in IGNORED_TITLES: 

4237 level = 6 

4238 else: 

4239 level = 6 

4240 if skip_level_title: 4240 ↛ 4241line 4240 didn't jump to line 4241 because the condition on line 4240 was never true

4241 skip_level_title = False 

4242 parts.append(part) 

4243 else: 

4244 parts.append("{}{}{}".format("=" * level, title, "=" * level)) 

4245 parts.append(part) 

4246 # print("=" * level, title) 

4247 # if level != len(left): 

4248 # print(" FIXED LEVEL OF {} {} -> {}" 

4249 # .format(title, len(left), level)) 

4250 

4251 text = "".join(parts) 

4252 # print(text) 

4253 return text 

4254 

4255 

4256def parse_page(wxr: WiktextractContext, word: str, text: str) -> list[WordData]: 

4257 # Skip translation pages 

4258 if word.endswith("/" + TRANSLATIONS_TITLE): 4258 ↛ 4259line 4258 didn't jump to line 4259 because the condition on line 4258 was never true

4259 return [] 

4260 

4261 if wxr.config.verbose: 4261 ↛ 4262line 4261 didn't jump to line 4262 because the condition on line 4261 was never true

4262 logger.info(f"Parsing page: {word}") 

4263 

4264 wxr.config.word = word 

4265 wxr.wtp.start_page(word) 

4266 

4267 # Remove <noinclude> and similar tags from main pages. They 

4268 # should not appear there, but at least net/Elfdala has one and it 

4269 # is probably not the only one. 

4270 text = re.sub(r"(?si)<(/)?noinclude\s*>", "", text) 

4271 text = re.sub(r"(?si)<(/)?onlyinclude\s*>", "", text) 

4272 text = re.sub(r"(?si)<(/)?includeonly\s*>", "", text) 

4273 

4274 # Fix up the subtitle hierarchy. There are hundreds if not thousands of 

4275 # pages that have, for example, a Translations section under Linkage, or 

4276 # a Translations section on the same level as Noun. Enforce a proper 

4277 # hierarchy by manipulating the subtitle levels in certain cases. 

4278 text = fix_subtitle_hierarchy(wxr, text) 

4279 

4280 # Parse the page, pre-expanding those templates that are likely to 

4281 # influence parsing 

4282 tree = wxr.wtp.parse( 

4283 text, 

4284 pre_expand=True, 

4285 additional_expand=ADDITIONAL_EXPAND_TEMPLATES, 

4286 do_not_pre_expand=DO_NOT_PRE_EXPAND_TEMPLATES, 

4287 ) 

4288 # from wikitextprocessor.parser import print_tree 

4289 # print("PAGE PARSE:", print_tree(tree)) 

4290 

4291 top_data: WordData = {} 

4292 

4293 # Iterate over top-level titles, which should be languages for normal 

4294 # pages 

4295 by_lang = defaultdict(list) 

4296 for langnode in tree.children: 

4297 if not isinstance(langnode, WikiNode): 

4298 continue 

4299 if langnode.kind == NodeKind.TEMPLATE: 4299 ↛ 4300line 4299 didn't jump to line 4300 because the condition on line 4299 was never true

4300 parse_top_template(wxr, langnode, top_data) 

4301 continue 

4302 if langnode.kind == NodeKind.LINK: 4302 ↛ 4304line 4302 didn't jump to line 4304 because the condition on line 4302 was never true

4303 # Some pages have links at top level, e.g., "trees" in Wiktionary 

4304 continue 

4305 if langnode.kind != NodeKind.LEVEL2: 4305 ↛ 4306line 4305 didn't jump to line 4306 because the condition on line 4305 was never true

4306 wxr.wtp.debug( 

4307 f"unexpected top-level node: {langnode}", sortid="page/3014" 

4308 ) 

4309 continue 

4310 lang = clean_node( 

4311 wxr, None, langnode.sarg if langnode.sarg else langnode.largs 

4312 ) 

4313 lang_code = name_to_code(lang, "en") 

4314 if lang_code == "": 4314 ↛ 4315line 4314 didn't jump to line 4315 because the condition on line 4314 was never true

4315 wxr.wtp.debug( 

4316 f"unrecognized language name: {lang}", sortid="page/3019" 

4317 ) 

4318 if ( 4318 ↛ 4322line 4318 didn't jump to line 4322

4319 wxr.config.capture_language_codes 

4320 and lang_code not in wxr.config.capture_language_codes 

4321 ): 

4322 continue 

4323 wxr.wtp.start_section(lang) 

4324 

4325 # Collect all words from the page. 

4326 # print(f"{langnode=}") 

4327 datas = parse_language(wxr, langnode, lang, lang_code) 

4328 

4329 # Propagate fields resulting from top-level templates to this 

4330 # part-of-speech. 

4331 for data in datas: 

4332 if "lang" not in data: 4332 ↛ 4333line 4332 didn't jump to line 4333 because the condition on line 4332 was never true

4333 wxr.wtp.debug( 

4334 "internal error -- no lang in data: {}".format(data), 

4335 sortid="page/3034", 

4336 ) 

4337 continue 

4338 for k, v in top_data.items(): 4338 ↛ 4339line 4338 didn't jump to line 4339 because the loop on line 4338 never started

4339 assert isinstance(v, (list, tuple)) 

4340 data_extend(data, k, v) 

4341 by_lang[data["lang"]].append(data) 

4342 

4343 # XXX this code is clearly out of date. There is no longer a "conjugation" 

4344 # field. FIX OR REMOVE. 

4345 # Do some post-processing on the words. For example, we may distribute 

4346 # conjugation information to all the words. 

4347 ret = [] 

4348 for lang, lang_datas in by_lang.items(): 

4349 ret.extend(lang_datas) 

4350 

4351 for x in ret: 

4352 if x["word"] != word: 

4353 if word.startswith("Unsupported titles/"): 4353 ↛ 4359line 4353 didn't jump to line 4359 because the condition on line 4353 was always true

4354 wxr.wtp.debug( 

4355 f"UNSUPPORTED TITLE: '{word}' -> '{x['word']}'", 

4356 sortid="20231101/3578page.py", 

4357 ) 

4358 else: 

4359 wxr.wtp.debug( 

4360 f"DIFFERENT ORIGINAL TITLE: '{word}' " f"-> '{x['word']}'", 

4361 sortid="20231101/3582page.py", 

4362 ) 

4363 x["original_title"] = word 

4364 # validate tag data 

4365 recursively_separate_raw_tags(wxr, x) # type:ignore[arg-type] 

4366 return ret 

4367 

4368 

4369def recursively_separate_raw_tags( 

4370 wxr: WiktextractContext, data: dict[str, Any] 

4371) -> None: 

4372 if not isinstance(data, dict): 4372 ↛ 4373line 4372 didn't jump to line 4373 because the condition on line 4372 was never true

4373 wxr.wtp.error( 

4374 "'data' is not dict; most probably " 

4375 "data has a list that contains at least one dict and " 

4376 "at least one non-dict item", 

4377 sortid="en/page-4016/20240419", 

4378 ) 

4379 return 

4380 new_tags: list[str] = [] 

4381 raw_tags: list[str] = data.get("raw_tags", []) 

4382 for field, val in data.items(): 

4383 if field == "tags": 

4384 for tag in val: 

4385 if tag not in valid_tags: 4385 ↛ 4386line 4385 didn't jump to line 4386 because the condition on line 4385 was never true

4386 raw_tags.append(tag) 

4387 else: 

4388 new_tags.append(tag) 

4389 if isinstance(val, list): 

4390 if len(val) > 0 and isinstance(val[0], dict): 

4391 for d in val: 

4392 recursively_separate_raw_tags(wxr, d) 

4393 if "tags" in data and not new_tags: 4393 ↛ 4394line 4393 didn't jump to line 4394 because the condition on line 4393 was never true

4394 del data["tags"] 

4395 elif new_tags: 

4396 data["tags"] = new_tags 

4397 if raw_tags: 4397 ↛ 4398line 4397 didn't jump to line 4398 because the condition on line 4397 was never true

4398 data["raw_tags"] = raw_tags 
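# Illustrative split (hypothetical values, not from this page): assuming
# "plural" is in valid_tags and "free-form label" is not,
# {"tags": ["plural", "free-form label"]} becomes
# {"tags": ["plural"], "raw_tags": ["free-form label"]}.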

4399 

4400 

4401def process_soft_redirect_template( 

4402 wxr: WiktextractContext, 

4403 template_node: TemplateNode, 

4404 redirect_pages: list[str], 

4405) -> bool: 

4406 # return `True` if the template is a soft redirect template 

4407 if template_node.template_name == "zh-see": 

4408 # https://en.wiktionary.org/wiki/Template:zh-see 

4409 title = clean_node( 

4410 wxr, None, template_node.template_parameters.get(1, "") 

4411 ) 

4412 if title != "": 4412 ↛ 4414line 4412 didn't jump to line 4414 because the condition on line 4412 was always true

4413 redirect_pages.append(title) 

4414 return True 

4415 elif template_node.template_name in ["ja-see", "ja-see-kango"]: 

4416 # https://en.wiktionary.org/wiki/Template:ja-see 

4417 for key, value in template_node.template_parameters.items(): 

4418 if isinstance(key, int): 4418 ↛ 4417line 4418 didn't jump to line 4417 because the condition on line 4418 was always true

4419 title = clean_node(wxr, None, value) 

4420 if title != "": 4420 ↛ 4417line 4420 didn't jump to line 4417 because the condition on line 4420 was always true

4421 redirect_pages.append(title) 

4422 return True 

4423 return False 
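# Illustrative behaviour (hypothetical input, not from this page):
# {{zh-see|好}} appends "好" to redirect_pages and returns True;
# {{ja-see|A|B}} appends both positional titles; any other template
# returns False.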

4424 

4425 

4426def process_zh_forms_templates( 

4427 wxr: WiktextractContext, 

4428 template_node: TemplateNode, 

4429 base_data: WordData, 

4430) -> None: 

4431 # https://en.wiktionary.org/wiki/Template:zh-forms 

4432 if "forms" not in base_data: 

4433 base_data["forms"] = [] 

4434 for p_name, p_value in template_node.template_parameters.items(): 

4435 if not isinstance(p_name, str): 

4436 continue 

4437 if re.fullmatch(r"s\d*", p_name): 

4438 form_data: FormData = { 

4439 "form": clean_node(wxr, None, p_value), 

4440 "tags": ["Simplified Chinese"], 

4441 } 

4442 if len(form_data["form"]) > 0: 

4443 base_data["forms"].append(form_data) 

4444 elif re.fullmatch(r"t\d+", p_name): 

4445 form_data = { 

4446 "form": clean_node(wxr, None, p_value), 

4447 "tags": ["Traditional Chinese"], 

4448 } 

4449 if len(form_data["form"]) > 0: 

4450 base_data["forms"].append(form_data) 

4451 elif p_name == "alt": 

4452 for form_text in clean_node(wxr, None, p_value).split(","): 

4453 texts = form_text.split("-") 

4454 form_data = {"form": texts[0]} 

4455 if len(texts) > 1: 

4456 # pronunciation data could be added after "-" 

4457 # see https://en.wiktionary.org/wiki/新婦 

4458 form_data["raw_tags"] = texts[1:] 

4459 if len(form_data["form"]) > 0: 

4460 base_data["forms"].append(form_data) 

4461 elif p_name == "lit": 

4462 lit = clean_node(wxr, None, p_value) 

4463 if lit != "": 

4464 base_data["literal_meaning"] = lit 

4465 if len(base_data["forms"]) == 0: 

4466 del base_data["forms"]
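# Illustrative behaviour (hypothetical input, not from this page):
# {{zh-forms|s=X|alt=Y-literary|lit=some meaning}} would add
# [{"form": "X", "tags": ["Simplified Chinese"]},
#  {"form": "Y", "raw_tags": ["literary"]}] to base_data["forms"] and set
# base_data["literal_meaning"] to "some meaning".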