Coverage for src/wiktextract/extractor/en/linkages.py: 72%

758 statements  

coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1# Code related to parsing linkages (synonyms, hypernyms, related terms, etc) 

2# 

3# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import re 

6import unicodedata 

7from typing import Optional, Sequence 

8 

9from wikitextprocessor import ( 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15from wikitextprocessor.core import TemplateArgs 

16from wikitextprocessor.parser import ( 

17 HTMLNode, 

18 is_list, 

19 is_list_item, 

20) 

21 

22from ...datautils import ( 

23 data_append, 

24 data_extend, 

25 ns_title_prefix_tuple, 

26 split_at_comma_semi, 

27) 

28from ...page import clean_node, is_panel_template 

29from ...tags import linkage_beginning_tags, valid_tags 

30from ...wxr_context import WiktextractContext 

31from ..ruby import extract_ruby, parse_ruby # noqa: F401 

32from .form_descriptions import ( 

33 classify_desc, 

34 decode_tags, 

35 head_final_bantu_langs, 

36 head_final_bantu_re, 

37 head_final_numeric_langs, 

38 head_final_other_langs, 

39 head_final_other_re, 

40 head_final_re, 

41 parse_head_final_tags, 

42 parse_sense_qualifier, 

43) 

44from .section_titles import TRANSLATIONS_TITLE 

45from .type_utils import FormData, LinkageData, SenseData, WordData 

46 

47# Linkage will be ignored if it matches this regexp before splitting 

48linkage_pre_split_ignore_re = re.compile( 

49 r"^(" 

50 + "|".join( 

51 re.escape(x) 

52 for x in [ 

53 "For more variations, see ", 

54 "Signal flag:", 

55 "Semaphore:", 

56 ] 

57 ) 

58 + r")" 

59) 

60 

61# Linkage will be ignored if it has one of these prefixes 

62linkage_ignore_prefixes = [ 

63 "Historical and regional synonyms of ", 

64 "edit data", 

65 "or these other third-person pronouns", 

66 "introduced in Unicode ", 

67 "Entries in the ", 

68 "Wikipedia article ", 

69 "Wiktionary's coverage of ", 

70 "Ethnologue entry for ", 

71 "Any of Thesaurus:", 

72 "See contents of Category:", 

73 "See also Thesaurus:", 

74 "See also Appendix:", 

75 "As SMS messaging ", 

76 "For the reversed question mark used in some right-to-left-scripts", 

77 "such as ", 

78 "Appendix:", 

79 "Category:", 

80 ":Category:", 

81] 

82 

83# Linkage will be ignored if it has any of these suffixes 

84linkage_ignore_suffixes = [ 

85 " Wikipedia", 

86 " Wikipedia.", 

87 " edition of Wiktionary", 

88] 

89 

90# Linkage will be ignored if it is one of these (with full match) 

91linkage_ignore_whole = [ 

92 "etc.", 

93 "other derived terms:", 

94 "Formal terms", 

95 "informal and slang terms", 

96] 

97 

98# Linkage will be ignored if it matches this regexp 

99linkage_ignore_re = re.compile( 

100 r"^(" 

101 + "|".join(re.escape(x) for x in linkage_ignore_whole) 

102 + r")$|^(" 

103 + "|".join(re.escape(x) for x in linkage_ignore_prefixes) 

104 + r")|(" 

105 + "|".join(re.escape(x) for x in linkage_ignore_suffixes) 

106 + r")$" 

107) 
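# Illustrative examples (not from the original source) of how the combined
# whole/prefix/suffix pattern behaves on a few made-up items:
#   linkage_ignore_re.search("etc.")                    # matches (whole-item entry)
#   linkage_ignore_re.search("See also Thesaurus:dog")  # matches (prefix entry)
#   linkage_ignore_re.search("dog on Wikipedia")        # matches (suffix " Wikipedia")
#   linkage_ignore_re.search("dog")                     # None; kept as a linkage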

108 

109# These prefixes will be removed from linkages, leaving the rest. This is 

110# considered separately for each linkage in a list. 

111linkage_remove_prefixes_re = re.compile( 

112 r"^(" 

113 + r"|".join( 

114 re.escape(x) 

115 for x in [ 

116 ":", 

117 "see Thesaurus:", 

118 "See Thesaurus:", 

119 "see also Thesaurus:", 

120 "See also Thesaurus:", 

121 "see also ", 

122 "See also ", 

123 "see ", 

124 "See ", 

125 "from ", 

126 "abbreviation of ", 

127 "ISO 639-1 code ", 

128 "ISO 639-3 code ", 

129 "Thesaurus:", 

130 ] 

131 ) 

132 + ")" 

133) 

134 

135# When removing prefix from linkage, this dictionary can be used to map 

136# the removed prefix to a space-separated list of tags to add 

137linkage_remove_prefixes_tags = { 

138 "abbreviation of ": "abbreviation", 

139} 
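# Illustrative example (not from the original source): for the item
# "abbreviation of NATO" the prefix regexp matches "abbreviation of ", the
# remainder "NATO" is kept as the linkage word, and the mapping above adds the
# tag "abbreviation"; for "see Thesaurus:feline" the prefix is simply dropped.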

140 

141# These suffixes will be removed from linkages, leaving the rest. This is 

142# considered separately for each linkage in a list. 

143linkage_remove_suffixes_re = re.compile( 

144 r"(\s+on (Wikispecies|Wikimedia Commons|" 

145 r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|" 

146 r"\s*[-–] Pre-reform orthography.*)" 

147 r"$" 

148) 
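# Illustrative example (not from the original source): "Felis catus on
# Wikispecies" and "cat on English Wikipedia." both lose the trailing site
# reference, leaving "Felis catus" and "cat".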

149 

150# Ignore linkage parenthesized sections that contain one of these strings 

151linkage_paren_ignore_contains_re = re.compile( 

152 r"\b(" 

153 + "|".join( 

154 re.escape(x) 

155 for x in [ 

156 "from Etymology", 

157 "used as", 

158 "usage notes", 

159 ] 

160 ) 

161 + ")([, ]|$)" 

162) 

163 

164taxonomic_ending_map = { 

165 "superkingdoms": "superkingdom", 

166 "kingdoms": "kingdom", 

167 "subkingdoms": "subkingdom", 

168 "infrakingdoms": "infrakingdom", 

169 "phylums": "phylum", 

170 "subphylums": "subphylum", 

171 "infraphylums": "infraphylum", 

172 "superclasses": "superclass", 

173 "classes": "class", 

174 "orders": "order", 

175 "suborders": "suborder", 

176 "families": "family", 

177 "subfamilies": "subfamily", 

178 "genera": "genus", 

179} 

180for k, v in list(taxonomic_ending_map.items()): 

181 taxonomic_ending_map[v] = v # Also add singular -> singular 

182taxonomic_ending_re = re.compile( 

183 r"\s+[-‐‑‒–—]\s+({})$".format( 

184 "|".join(re.escape(x) for x in taxonomic_ending_map) 

185 ) 

186) 
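# Illustrative example (not from the original source): for an item such as
# "Felis, Panthera – genera" the regexp matches " – genera",
# taxonomic_ending_map turns it into the singular "genus", and that singular
# is later attached to every linkage split from the remaining list.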

187 

188# Exceptional splits for linkages. This can be used to fix particular linkages 

189# that are not handled correctly by the default code. This can also be used 

190# to create automatic aliases, e.g., for mapping "..." and "…" to both. 

191linkage_split_exceptions = { 

192 "∛ ∜": ["∛", "∜"], 

193 "...": ["...", "…"], 

194 "…": ["...", "…"], 

195} 

196 

197# Truncate linkage word if it matches any of these strings 

198linkage_truncate_re = re.compile( 

199 "|".join( 

200 re.escape(x) 

201 for x in [ 

202 " and its derived terms", 

203 " UTF-16 0x214C", 

204 ] 

205 ) 

206) 

207 

208# Regexp for identifying special linkages containing lists of letters, digits, 

209# or characters 

210script_chars_re = re.compile( 

211 r"(script letters| script| letters|" 

212 r"Dialectological|Puctuation|Symbols|" 

213 r"Guillemets|Single guillemets|" 

214 r" tetragrams|" 

215 r" digits)(;|$)|" 

216 r"(^|; )(Letters using |Letters of the |" 

217 r"Variations of letter )|" 

218 r"^(Hiragana|Katakana)$" 

219) 

220 

221# Matches a Unicode character including any combining diacritics (even if 

222# separate characters) 

223unicode_dc_re = re.compile( 

224 r"\w[{}]|.".format( 

225 "".join( 

226 chr(x) 

227 for x in range(0, 0x110000) 

228 if unicodedata.category(chr(x)) == "Mn" 

229 ) 

230 ) 

231) 
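# Illustrative example (not from the original source): with a decomposed
# umlaut ("a" followed by combining U+0308), re.findall(unicode_dc_re,
# "pa\u0308t") returns ["p", "a\u0308", "t"], i.e. the combining mark stays
# attached to its base letter instead of becoming a separate list item.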

232 

233 
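# Extracts alternative forms from an "Alternative forms" section: link
# templates such as {{l}}, {{alt}} and {{alter}} are delegated to
# extract_l_template(), and plain wiki links are stored in word_entry["forms"]
# with the tag "alternative".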

234def extract_alt_form_section( 

235 wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode 

236) -> None: 

237 for list_node in level_node.find_child(NodeKind.LIST): 

238 for list_item in list_node.find_child(NodeKind.LIST_ITEM): 

239 for node in list_item.children: 

240 if isinstance(node, TemplateNode) and node.template_name in [ 

241 "l", 

242 "link", 

243 "L", 

244 "alt", 

245 "alter", 

246 ]: 

247 extract_l_template(wxr, word_entry, node) 

248 elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK: 

249 word = clean_node(wxr, None, node) 

250 if word != "": 250 ↛ 239 (line 250 didn't jump to line 239 because the condition on line 250 was always true)

251 form: FormData = {"form": word, "tags": ["alternative"]} 

252 data_append(word_entry, "forms", form) 

253 

254 
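# Expands an {{l}}/{{alt}}-style template and collects the spans whose lang
# attribute matches the template's language code as alternative forms; a
# following "*-Latn" span becomes the romanization of the preceding form, and
# "label-content" spans that parse as tags are applied to all collected forms.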

255def extract_l_template( 

256 wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode 

257) -> None: 

258 forms: list[FormData] = [] 

259 expanded_node = wxr.wtp.parse( 

260 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

261 ) 

262 lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, "")) 

263 for span_tag in expanded_node.find_html("span"): 

264 span_lang = span_tag.attrs.get("lang", "") 

265 span_class = span_tag.attrs.get("class", "") 

266 if span_lang == lang_code: 266 ↛ 271 (line 266 didn't jump to line 271 because the condition on line 266 was always true)

267 word = clean_node(wxr, None, span_tag) 

268 if word != "": 268 ↛ 263 (line 268 didn't jump to line 263 because the condition on line 268 was always true)

269 form: FormData = {"form": word, "tags": ["alternative"]} 

270 forms.append(form) 

271 elif span_lang.endswith("-Latn") and len(forms) > 0: 

272 roman = clean_node(wxr, None, span_tag) 

273 if roman != "": 

274 forms[-1]["roman"] = roman 

275 elif "label-content" in span_class and len(forms) > 0: 

276 tag_text = clean_node(wxr, None, span_tag) 

277 if classify_desc(tag_text) == "tags": 

278 tagsets1, _ = decode_tags(tag_text) 

279 tags: list[str] = [] 

280 for ts in tagsets1: 

281 tags.extend(ts) 

282 for form in forms: 

283 form["tags"].extend(tags) 

284 data_extend(word_entry, "forms", forms) 

285 

286 

287ZH_DIAL_TAGS = { 

288 "Classical Chinese": ["Classical-Chinese"], 

289 "Formal": ["formal"], 

290 "Written Standard Chinese": ["Written-vernacular-Chinese"], 

291 "Northeastern Mandarin": ["Northeastern-Mandarin"], 

292 "Jilu Mandarin": ["Jilu-Mandarin"], 

293 "Jiaoliao Mandarin": ["Jiaoliao-Mandarin"], 

294 "Central Plains Mandarin": ["Central-Plains-Mandarin"], 

295 "Lanyin Mandarin": ["Lanyin-Mandarin"], 

296 "Southwestern Mandarin": ["Southwestern-Mandarin"], 

297 "Jianghuai Mandarin": ["Jianghuai-Mandarin"], 

298 "Northern Min": ["Min-Bei"], 

299 "Eastern Min": ["Min-Dong"], 

300 "Southern Min": ["Min-Nan"], 

301 "Zhongshan Min": ["Zhongshan-Min"], 

302 "Southern Pinghua": ["Southern-Pinghua"], 

303 "Puxian Min": ["Puxian-Min"], 

304} 

305 

306 
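# Parses the table produced by {{zh-dial}}: the "Note" row maps footnote
# symbols to notes, header cells provide dialect-group tags, link cells
# provide region tags, and every Chinese-language span (other than the page
# title) becomes a synonym entry whose raw tags are normalized through
# ZH_DIAL_TAGS and valid_tags.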

307def extract_zh_dial_template( 

308 wxr: WiktextractContext, 

309 word_entry: WordData, 

310 t_node: TemplateNode, 

311 sense: str, 

312): 

313 # https://en.wiktionary.org/wiki/Template:zh-dial 

314 from .pronunciation import split_zh_pron_raw_tag 

315 

316 linkage_list: list[LinkageData] = [] 

317 expanded_node = wxr.wtp.parse( 

318 wxr.wtp.node_to_wikitext(t_node), expand_all=True 

319 ) 

320 for table_node in expanded_node.find_child_recursively(NodeKind.TABLE): 320 ↛ 321 (line 320 didn't jump to line 321 because the loop on line 320 never started)

321 is_note_row = False 

322 note_tags = {} 

323 for row_node in table_node.find_child(NodeKind.TABLE_ROW): 

324 for cell_node in row_node.find_child( 

325 NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL 

326 ): 

327 if cell_node.kind == NodeKind.TABLE_HEADER_CELL: 

328 is_note_row = clean_node(wxr, None, cell_node) == "Note" 

329 elif is_note_row: 

330 for note_str in clean_node(wxr, None, cell_node).split(";"): 

331 if "-" in note_str: 

332 note_symbol, note = note_str.split("-", maxsplit=1) 

333 note_symbol = note_symbol.strip() 

334 note = note.strip() 

335 if note_symbol != "" and note != "": 

336 note_tags[note_symbol] = note 

337 lang_tags = [] 

338 region_tags = [] 

339 for row_node in table_node.find_child(NodeKind.TABLE_ROW): 

340 if not row_node.contain_node(NodeKind.TABLE_CELL): 

341 continue # skip header row 

342 for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL): 

343 lang_tags = split_zh_pron_raw_tag( 

344 clean_node(wxr, None, header_node) 

345 ) 

346 if lang_tags == ["Note"]: # skip last note row 

347 continue 

348 for cell_node in row_node.find_child(NodeKind.TABLE_CELL): 

349 for link_node in cell_node.find_child(NodeKind.LINK): 

350 region_tags = split_zh_pron_raw_tag( 

351 clean_node(wxr, None, link_node) 

352 ) 

353 for span_tag in cell_node.find_html("span"): 

354 span_text = clean_node(wxr, None, span_tag) 

355 if span_text == "": 

356 continue 

357 if ( 

358 span_tag.attrs.get("lang", "") == "zh" 

359 and span_text != wxr.wtp.title 

360 ): 

361 l_data: LinkageData = {"word": span_text} 

362 if sense != "": 

363 l_data["sense"] = sense 

364 if len(lang_tags) > 0: 

365 data_extend(l_data, "raw_tags", lang_tags) 

366 if len(region_tags) > 0: 

367 data_extend(l_data, "raw_tags", region_tags) 

368 linkage_list.append(l_data) 

369 elif ( 

370 span_tag.attrs.get("style", "") == "font-size:60%" 

371 and len(linkage_list) > 0 

372 ): 

373 for note_symbol in span_text.split(","): 

374 note_symbol = note_symbol.strip() 

375 raw_tag = note_symbol 

376 if note_symbol in note_tags: 

377 raw_tag = note_tags[note_symbol] 

378 if raw_tag != "": 

379 data_append( 

380 linkage_list[-1], "raw_tags", raw_tag 

381 ) 

382 

383 for l_data in linkage_list: 

384 raw_tags = [] 

385 for raw_tag in l_data.get("raw_tags", []): 

386 if raw_tag in ZH_DIAL_TAGS: 

387 data_extend(l_data, "tags", ZH_DIAL_TAGS[raw_tag]) 

388 elif raw_tag in valid_tags: 

389 data_append(l_data, "tags", raw_tag) 

390 else: 

391 raw_tags.append(raw_tag) 

392 if len(raw_tags) > 0: 

393 l_data["raw_tags"] = raw_tags 

394 elif "raw_tags" in l_data: 

395 del l_data["raw_tags"] 

396 data_extend(word_entry, "synonyms", linkage_list) 

397 

398 
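# Parses one linkage section (Synonyms, Antonyms, Derived terms, ...) and
# stores the results in data[field].  {{zh-dial}} templates are handled by
# extract_zh_dial_template(); {{s}}/{{sense}} templates inside list items set
# the sense applied to the items that follow.  If no linkages were captured
# from lists and no panel template was present, comma-separated top-level
# text is parsed as a fallback.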

399def parse_linkage( 

400 wxr: WiktextractContext, 

401 data: WordData, 

402 field: str, 

403 linkagenode: LevelNode, 

404 word: str, 

405 sense_datas: list[SenseData], 

406 is_reconstruction: bool, 

407) -> None: 

408 assert isinstance(data, dict) 

409 assert isinstance(field, str) 

410 assert isinstance(linkagenode, LevelNode) 

411 # print("field", field) 

412 # print("data", data) 

413 # print("children:") 

414 if not wxr.config.capture_linkages: 414 ↛ 415 (line 414 didn't jump to line 415 because the condition on line 414 was never true)

415 return 

416 have_panel_template = False 

417 

418 def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]: 

419 nonlocal have_panel_template 

420 if is_panel_template(wxr, name): 

421 have_panel_template = True 

422 return "" 

423 # Ignore auto-filled templates like Template:table:Solar System/en 

424 if name.startswith(("table:", "list:")): 

425 return "" 

426 return None 

427 

428 # Main body of parse_linkage() 

429 l_nodes: list[str | WikiNode] = [] 

430 l_sense = "" 

431 for node in linkagenode.children: 

432 if isinstance(node, TemplateNode) and node.template_name == "zh-dial": 

433 extract_zh_dial_template(wxr, data, node, l_sense) 

434 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST: 

435 for list_item in node.find_child(NodeKind.LIST_ITEM): 

436 for t_node in list_item.find_child(NodeKind.TEMPLATE): 

437 if t_node.template_name in ["s", "sense"]: 

438 l_sense = clean_node(wxr, None, t_node).strip("(): ") 

439 l_nodes.append(node) 

440 else: 

441 l_nodes.append(node) 

442 text = wxr.wtp.node_to_wikitext(l_nodes) 

443 parsed = wxr.wtp.parse( 

444 text, expand_all=True, template_fn=linkage_template_fn1 

445 ) 

446 

447 text_outside_list_items = parse_linkage_recurse( 

448 wxr, 

449 parsed.children, 

450 field, 

451 None, 

452 None, 

453 word, 

454 data, 

455 sense_datas, 

456 is_reconstruction, 

457 ) 

458 

459 if not data.get(field) and not have_panel_template: 

460 text = "".join(text_outside_list_items).strip() 

461 if "\n" not in text and "," in text and text.count(",") > 3: 

462 if not text.startswith("See "): 462 ↛ exit (line 462 didn't return from function 'parse_linkage' because the condition on line 462 was always true)

463 parse_linkage_item( 

464 wxr, 

465 [text], 

466 field, 

467 word, 

468 data, 

469 sense_datas, 

470 is_reconstruction, 

471 None, 

472 ) 

473 

474 
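# Recursively walks parsed linkage content: lists and tables are descended
# into, list items and table cells go to parse_linkage_item(), HTML nodes with
# "qualifier-content" or "list-switcher-header" classes update the sense used
# for following items, and bare top-level strings are collected and returned
# so the caller can fall back to them when no list-based linkages were found.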

475def parse_linkage_recurse( 

476 wxr: WiktextractContext, 

477 contents: list[WikiNode | str], 

478 field: str, 

479 sense: str | None, 

480 block_header_sense: str | None, 

481 word: str, 

482 data, 

483 sense_datas, 

484 is_reconstruction, 

485) -> list[str]: 

486 assert isinstance(contents, (list, tuple)) 

487 assert sense is None or isinstance(sense, str) 

488 # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents)) 

489 

490 # Return values 

491 text_outside_list_items: list[str] = [] 

492 

493 for node in contents: 

494 if isinstance(node, str): 

495 # Ignore top-level text, generally comments before the 

496 # linkages list. However, if no linkages are found, then 

497 # use this for linkages (not all words use bullet points 

498 # for linkages). 

499 text_outside_list_items.append(node) 

500 continue 

501 assert isinstance(node, WikiNode) 

502 kind = node.kind 

503 # print("PARSE_LINKAGE_RECURSE CHILD", kind) 

504 if is_list(node) or kind in (NodeKind.TABLE, NodeKind.TABLE_ROW): 

505 toli = parse_linkage_recurse( 

506 wxr, 

507 node.children, 

508 field, 

509 sense, 

510 block_header_sense, 

511 word, 

512 data, 

513 sense_datas, 

514 is_reconstruction, 

515 ) 

516 text_outside_list_items.extend(toli) 

517 elif is_list_item(node) or kind == NodeKind.TABLE_CELL: 

518 v = parse_linkage_item( 

519 wxr, 

520 node.children, 

521 field, 

522 word, 

523 data, 

524 sense_datas, 

525 is_reconstruction, 

526 sense, 

527 ) 

528 if v is not None: 528 ↛ 493 (line 528 didn't jump to line 493 because the condition on line 528 was always true)

529 # parse_linkage_item() can return a value that should 

530 # be used as the sense for the follow-on linkages, 

531 # which are typically provided in a table (see 滿) 

532 block_header_sense = "".join(v) 

533 elif kind in ( 

534 NodeKind.TABLE_CAPTION, 

535 NodeKind.TABLE_HEADER_CELL, 

536 NodeKind.PREFORMATTED, 

537 NodeKind.BOLD, 

538 ): 

539 # Let's still ignore table extra stuff 

540 continue 

541 elif isinstance(node, HTMLNode): 541 ↛ 543 (line 541 didn't jump to line 543 because the condition on line 541 was never true)

542 # Recurse to process inside the HTML for most tags 

543 if node.sarg in ("gallery", "ref", "cite", "caption"): 

544 continue 

545 classes = (node.attrs.get("class") or "").replace("+", " ").split() 

546 if "qualifier-content" in classes: 

547 sense1 = clean_node(wxr, None, node.children) 

548 if sense1.endswith(":"): 

549 sense1 = sense1[:-1].strip() 

550 if sense and sense1: 

551 wxr.wtp.debug( 

552 "linkage qualifier-content on multiple " 

553 "levels: {!r} and {!r}".format(sense, sense1), 

554 sortid="page/2170", 

555 ) 

556 toli = parse_linkage_recurse( 

557 wxr, 

558 node.children, 

559 field, 

560 sense1, 

561 block_header_sense, 

562 word, 

563 data, 

564 sense_datas, 

565 is_reconstruction, 

566 ) 

567 text_outside_list_items.extend(toli) 

568 elif "list-switcher-header" in classes: 

569 block_header_sense = clean_node(wxr, None, node.children) 

570 if block_header_sense.endswith(":"): 

571 block_header_sense = block_header_sense[:-1].strip() 

572 elif any(x in classes for x in ("NavFrame", "term-list")): 

573 # NavFrame uses previously assigned block_header_sense 

574 # (from a "(sense):" item) and clears it afterwards 

575 # print(f"{sense=}, {block_header_sense=}") 

576 toli = parse_linkage_recurse( 

577 wxr, 

578 node.children, 

579 field, 

580 sense or block_header_sense, 

581 block_header_sense, 

582 word, 

583 data, 

584 sense_datas, 

585 is_reconstruction, 

586 ) 

587 text_outside_list_items.extend(toli) 

588 block_header_sense = None 

589 else: 

590 toli = parse_linkage_recurse( 

591 wxr, 

592 node.children, 

593 field, 

594 sense, 

595 block_header_sense, 

596 word, 

597 data, 

598 sense_datas, 

599 is_reconstruction, 

600 ) 

601 text_outside_list_items.extend(toli) 

602 elif isinstance(node, LevelNode): 602 ↛ 604 (line 602 didn't jump to line 604 because the condition on line 602 was never true)

603 # Just recurse to any possible subsections 

604 toli = parse_linkage_recurse( 

605 wxr, 

606 node.children, 

607 field, 

608 sense, 

609 block_header_sense, 

610 word, 

611 data, 

612 sense_datas, 

613 is_reconstruction, 

614 ) 

615 text_outside_list_items.extend(toli) 

616 elif kind in (NodeKind.BOLD, NodeKind.ITALIC): 

617 # Skip these on top level; at least sometimes bold is 

618 # used for indicating a subtitle 

619 continue 

620 elif kind == NodeKind.LINK: 620 ↛ 637 (line 620 didn't jump to line 637 because the condition on line 620 was always true)

621 # Recurse into the last argument 

622 # Apparently ":/" is used as a link to "/", so strip 

623 # initial colon 

624 toli = parse_linkage_recurse( 

625 wxr, 

626 node.largs[-1], 

627 field, 

628 sense, 

629 block_header_sense, 

630 word, 

631 data, 

632 sense_datas, 

633 is_reconstruction, 

634 ) 

635 text_outside_list_items.extend(toli) 

636 else: 

637 wxr.wtp.debug( 

638 "parse_linkage_recurse unhandled {}: {}".format(kind, node), 

639 sortid="page/2196", 

640 ) 

641 

642 return text_outside_list_items 

643 

644 
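# Parses a single linkage list item or table cell.  item_recurse() flattens
# the item into text while capturing ruby, URLs and link texts that must not
# be split at commas; the cleaned text is then passed to
# parse_linkage_item_text().  The return value is a sense string to apply to
# follow-on items (commonly used in tables), or an empty list.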

645def parse_linkage_item( 

646 wxr: WiktextractContext, 

647 contents: list[str | WikiNode], 

648 field: str, 

649 word: str, 

650 data: WordData, 

651 sense_datas: list[SenseData], 

652 is_reconstruction: bool, 

653 sense: str | None = None, 

654) -> list[str]: 

655 assert isinstance(contents, (list, tuple)) 

656 assert isinstance(field, str) 

657 assert sense is None or isinstance(sense, str) 

658 

659 # print("PARSE_LINKAGE_ITEM: {} ({}): {}" 

660 # .format(field, sense, contents)) 

661 

662 parts: list[str] = [] 

663 ruby: list[tuple[str, str]] = [] 

664 urls: list[str] = [] 

665 # data about link text; this is used to skip splitting on 

666 # linkage text items that contain stuff like commas; for 

667 # example "Hunde, die bellen, beißen nicht" in article 

668 # beißen would otherwise be split into "Hunde", "die bellen", etc. 

669 # We take that link text and use it, eventually, 

670 # in split_at_comma_semi to skip splitting on those 

671 # commas. 

672 links_that_should_not_be_split: list[str] = [] 

673 

674 def item_recurse( 

675 contents: list[str | WikiNode], possible_sense: str | None = None 

676 ) -> bool: 

677 assert isinstance(contents, (list, tuple)) 

678 nonlocal sense 

679 nonlocal ruby 

680 nonlocal parts 

681 is_sense = False 

682 # print("ITEM_RECURSE:", contents) 

683 for node in contents: 

684 if isinstance(node, str): 

685 parts.append(node) 

686 continue 

687 kind = node.kind 

688 # print( 

689 # "ITEM_RECURSE KIND:", 

690 # kind, 

691 # node.sarg if node.sarg else node.largs, 

692 # ) 

693 

694 #### parts into possible_sense 

695 if ( 

696 is_list_item(node) 

697 or is_list(node) 

698 or kind 

699 in ( 

700 NodeKind.TABLE, 

701 NodeKind.TABLE_ROW, 

702 NodeKind.TABLE_CELL, 

703 ) 

704 and parts 

705 ): 

706 # print(f"{parts=}") 

707 candidate_sense: str | None 

708 candidate_sense = clean_node(wxr, None, parts) 

709 is_sense = False 

710 

711 if candidate_sense.endswith(":"): 

712 is_sense = True 

713 candidate_sense = candidate_sense[:-1].strip() 

714 if candidate_sense.startswith("(") and candidate_sense.endswith( 714 ↛ 717 (line 714 didn't jump to line 717 because the condition on line 714 was never true)

715 ")" 

716 ): 

717 is_sense = True 

718 candidate_sense = candidate_sense[1:-1].strip() 

719 if ( 

720 candidate_sense.lower() == TRANSLATIONS_TITLE 

721 or not is_sense 

722 ): 

723 candidate_sense = None 

724 # print(f"{possible_sense=}, {is_sense=}") 

725 if is_sense: 

726 possible_sense = candidate_sense 

727 parts = [] 

728 else: 

729 candidate_sense = None 

730 

731 # Handle nodes 

732 if is_list_item(node): 732 ↛ 733 (line 732 didn't jump to line 733 because the condition on line 732 was never true)

733 parse_linkage_item( 

734 wxr, 

735 node.children, 

736 field, 

737 word, 

738 data, 

739 sense_datas, 

740 is_reconstruction, 

741 possible_sense or sense, 

742 ) 

743 elif is_list(node) or kind in ( 

744 NodeKind.TABLE, 

745 NodeKind.TABLE_ROW, 

746 NodeKind.TABLE_CELL, 

747 ): 

748 parse_linkage_recurse( 

749 wxr, 

750 node.children, 

751 field, 

752 possible_sense or sense, 

753 None, 

754 word, 

755 data, 

756 sense_datas, 

757 is_reconstruction, 

758 ) 

759 elif kind in ( 759 ↛ 763 (line 759 didn't jump to line 763 because the condition on line 759 was never true)

760 NodeKind.TABLE_HEADER_CELL, 

761 NodeKind.TABLE_CAPTION, 

762 ): 

763 continue 

764 elif kind == NodeKind.HTML: 764 ↛ 765 (line 764 didn't jump to line 765 because the condition on line 764 was never true)

765 classes = (node.attrs.get("class") or "").split() 

766 if node.sarg in ("gallery", "ref", "cite", "caption"): 

767 continue 

768 elif node.sarg == "ruby": 

769 rb = parse_ruby(wxr, node) 

770 if rb: 

771 ruby.append(rb) 

772 parts.append(rb[0]) 

773 continue 

774 elif node.sarg == "math": 

775 parts.append(clean_node(wxr, None, node)) 

776 continue 

777 elif "interProject" in classes: 

778 continue # These do not seem to be displayed 

779 if "NavFrame" in classes: 

780 parse_linkage_recurse( 

781 wxr, 

782 node.children, 

783 field, 

784 possible_sense or sense, 

785 None, 

786 word, 

787 data, 

788 sense_datas, 

789 is_reconstruction, 

790 ) 

791 else: 

792 item_recurse(node.children, possible_sense) 

793 elif kind == NodeKind.LINK: 

794 ignore = False 

795 if isinstance(node.largs[0][0], str): 795 ↛ 683 (line 795 didn't jump to line 683 because the condition on line 795 was always true)

796 v1 = node.largs[0][0].strip().lower() 

797 if v1.startswith( 797 ↛ 801 (line 797 didn't jump to line 801 because the condition on line 797 was never true)

798 ns_title_prefix_tuple(wxr, "Category", True) 

799 + ns_title_prefix_tuple(wxr, "File", True) 

800 ): 

801 ignore = True 

802 if not ignore: 802 ↛ 683 (line 802 didn't jump to line 683 because the condition on line 802 was always true)

803 v = node.largs[-1] 

804 if ( 

805 len(node.largs) == 1 

806 and len(v) > 0 

807 and isinstance(v[0], str) 

808 and v[0][0] == ":" 

809 ): 

810 v = [v[0][1:]] + list(v[1:]) # type:ignore 

811 if isinstance(v[0], str) and not v[0].isalnum(): 

812 links_that_should_not_be_split.append("".join(v[0])) # type: ignore 

813 item_recurse(v, possible_sense) 

814 elif kind == NodeKind.URL: 

815 if len(node.largs) < 2 and node.largs: 

816 # Naked url captured 

817 urls.extend(node.largs[-1]) # type:ignore[arg-type] 

818 continue 

819 if len(node.largs) == 2: 819 ↛ 824 (line 819 didn't jump to line 824 because the condition on line 819 was always true)

820 # Url from link with text 

821 urls.append(node.largs[0][-1]) # type:ignore[arg-type] 

822 # print(f"{node.largs=!r}") 

823 # print("linkage recurse URL {}".format(node)) 

824 item_recurse(node.largs[-1], possible_sense) 

825 elif kind in ( 825 ↛ 832 (line 825 didn't jump to line 832 because the condition on line 825 was always true)

826 NodeKind.PREFORMATTED, 

827 NodeKind.BOLD, 

828 NodeKind.ITALIC, 

829 ): 

830 item_recurse(node.children) 

831 else: 

832 wxr.wtp.debug( 

833 "linkage item_recurse unhandled {}: {}".format( 

834 node.kind, node 

835 ), 

836 sortid="page/2073", 

837 ) 

838 

839 return is_sense 

840 

841 # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}" 

842 # .format(contents)) 

843 

844 is_sense = item_recurse(contents) 

845 

846 if not is_sense: 

847 item = clean_node(wxr, None, parts) 

848 # print("LINKAGE ITEM CONTENTS:", parts) 

849 # print("CLEANED ITEM: {!r}".format(item)) 

850 # print(f"URLS {urls=!r}") 

851 

852 if v := parse_linkage_item_text( 852 ↛ 865 (line 852 didn't jump to line 865 because the condition on line 852 was never true)

853 wxr, 

854 word, 

855 data, 

856 field, 

857 item, 

858 sense, 

859 ruby, 

860 sense_datas, 

861 is_reconstruction, 

862 urls or None, 

863 links_that_should_not_be_split or None, 

864 ): 

865 return [v] 

866 

867 return [] 

868 

869 

870def parse_linkage_item_text( 

871 wxr: WiktextractContext, 

872 word: str, 

873 data: WordData, 

874 field: str, 

875 item: str, 

876 sense: Optional[str], 

877 ruby: list, 

878 pos_datas: list, 

879 is_reconstruction: bool, 

880 urls: Optional[list[str]] = None, 

881 links: Optional[list[str]] = None, 

882) -> Optional[str]: 

883 """Parses a linkage item once it has been converted to a string. This 

884 may add one or more linkages to ``data`` under ``field``. This 

885 returns None or a string that contains a sense that should be applied 

886 to additional linkages (commonly used in tables for Asian characters).""" 

887 assert isinstance(wxr, WiktextractContext) 

888 assert isinstance(word, str) # Main word (derived from page title) 

889 assert isinstance(data, dict) # Parsed linkages are stored here under field 

890 assert isinstance(field, str) # The field under which to store linkage 

891 assert isinstance(item, str) # The string to parse 

892 assert sense is None or isinstance(sense, str) 

893 assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" 

894 assert isinstance(pos_datas, list) # List of senses (containing "glosses") 

895 assert urls is None or isinstance(urls, list) # Captured urls 

896 assert is_reconstruction in (True, False) 

897 

898 item = item.replace("()", "") 

899 item = re.sub(r"\s+", " ", item) 

900 item = item.strip() 

901 

902 base_roman = None 

903 base_alt = None 

904 base_english = None 

905 script_chars = False 

906 base_qualifier = None 

907 lang = wxr.wtp.section 

908 

909 # If ``sense`` can be parsed as tags, treat it as tags instead 

910 if sense: 

911 cls = classify_desc(sense, no_unknown_starts=True) 

912 if cls == "tags": 

913 base_qualifier = sense 

914 sense = None 

915 

916 # Check if this item is a stand-alone sense (or tag) specifier 

917 # for following items (e.g., commonly in a table, see 滿) 

918 m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item) 

919 if m: 

920 return m.group(1) 

921 

922 # Check for pre-split ignored linkages using the appropriate regexp 

923 if re.search(linkage_pre_split_ignore_re, item): 

924 return None 

925 

926 # print(" LINKAGE ITEM: {}: {} (sense {})" 

927 # .format(field, item, sense)) 

928 

929 # Replace occurrences of ~ in the item by the page title 

930 safetitle = wxr.wtp.title.replace("\\", "\\\\") # type: ignore[union-attr] 

931 item = item.replace(" ~ ", " " + safetitle + " ") 

932 item = re.sub(r"^~ ", safetitle + " ", item) 

933 item = re.sub(r" ~$", " " + safetitle, item) 

934 

935 # Many taxonomic terms contain hyponym lists that end with the 

936 # kind of the hyponym (a taxonomic level in plural). Recognize 

937 # such and add the term in singular to all linkages in the list. 

938 m = re.search(taxonomic_ending_re, item) 

939 if m: 

940 base_english = taxonomic_ending_map[m.group(1)] 

941 item = item[: m.start()] 

942 

943 # Some Korean and Japanese words use "word (romanized): english" pattern 

944 # Sometimes the parenthesized part contains comma-separated alt and roman. 

945 m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item) 

946 if m: 

947 rom = m.group(2) 

948 eng = m.group(3) 

949 rest = m.group(1) 

950 if ( 

951 classify_desc(rest, no_unknown_starts=True) == "other" 

952 and classify_desc(eng, no_unknown_starts=True) == "english" 

953 ): 

954 item = rest 

955 base_roman = rom 

956 lst = base_roman.split(", ") 

957 if ( 

958 len(lst) == 2 

959 and classify_desc(lst[0], no_unknown_starts=True) == "other" 

960 ): 

961 base_alt = lst[0] 

962 base_roman = lst[1] 

963 if base_english: 

964 base_english += "; " + eng 

965 else: 

966 base_english = eng 

967 

968 # Many words have tags or similar descriptions in the beginning 

969 # followed by a colon and one or more linkages (e.g., 

970 # panetella/Finnish) 

971 m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match( 

972 r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", 

973 item, 

974 ) 

975 if m: 

976 desc = m.group(1) 

977 rest = m.group(len(m.groups())) 

978 # Check for certain comma-separated tags combined 

979 # with English text at the beginning or end of a 

980 # comma-separated parenthesized list 

981 lst = split_at_comma_semi(desc, skipped=links) 

982 while len(lst) > 1: 

983 # Check for tags at the beginning 

984 cls = classify_desc(lst[0], no_unknown_starts=True) 

985 if cls == "tags": 

986 if base_qualifier: 

987 base_qualifier += ", " + lst[0] 

988 else: 

989 base_qualifier = lst[0] 

990 lst = lst[1:] 

991 continue 

992 # Check for tags at the end 

993 cls = classify_desc(lst[-1], no_unknown_starts=True) 

994 if cls == "tags": 

995 if base_qualifier: 

996 base_qualifier += ", " + lst[-1] 

997 else: 

998 base_qualifier = lst[-1] 

999 lst = lst[:-1] 

1000 continue 

1001 break 

1002 desc = ", ".join(lst) 

1003 

1004 # Sometimes we have e.g. "chemistry (slang)" which are 

1005 # both tags (see "stink"). Handle that case by 

1006 # removing parentheses if the value is still tags. The part with 

1007 # parentheses could be on either side of the colon. 

1008 if "(" in desc: 

1009 x = desc.replace("(", ",").replace(")", ",") 

1010 if classify_desc(x, no_unknown_starts=True) == "tags": 

1011 desc = x 

1012 elif "(" in rest: 

1013 x = rest.replace("(", ",").replace(")", ",") 

1014 if classify_desc(x, no_unknown_starts=True) == "tags": 

1015 rest = desc 

1016 desc = x 

1017 

1018 # See if the prefix should trigger special handling for script 

1019 # characters, letters, digits, etc. 

1020 if re.search(script_chars_re, desc): 

1021 script_chars = True 

1022 

1023 # Try to determine which side is description and which is 

1024 # the linked term (both orders are widely used in Wiktionary) 

1025 cls = classify_desc(desc, no_unknown_starts=True) 

1026 cls2 = classify_desc(rest, no_unknown_starts=True) 

1027 # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}" 

1028 # .format(desc, cls, rest, cls2)) 

1029 

1030 e1 = wxr.wtp.page_exists(desc) 

1031 e2 = wxr.wtp.page_exists(rest) 

1032 if cls != "tags": 

1033 if ( 

1034 cls2 == "tags" 

1035 or (e1 and not e2) 

1036 or ( 

1037 e1 

1038 and e2 

1039 and cls2 == "english" 

1040 and cls in ("other", "romanization") 

1041 ) 

1042 or ( 

1043 not e1 

1044 and not e2 

1045 and cls2 == "english" 

1046 and cls in ("other", "romanization") 

1047 ) 

1048 ): 

1049 desc, rest = rest, desc # Looks like swapped syntax 

1050 cls = cls2 

1051 if re.search(linkage_paren_ignore_contains_re, desc): 1051 ↛ 1052 (line 1051 didn't jump to line 1052 because the condition on line 1051 was never true)

1052 desc = "" 

1053 # print("linkage colon prefix desc={!r} rest={!r} cls={}" 

1054 # .format(desc, rest, cls)) 

1055 

1056 # Handle the prefix according to its type 

1057 if cls == "tags": 

1058 if base_qualifier: 

1059 base_qualifier += ", " + desc 

1060 else: 

1061 base_qualifier = desc 

1062 item = rest 

1063 elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"): 

1064 if base_english: 1064 ↛ 1065 (line 1064 didn't jump to line 1065 because the condition on line 1064 was never true)

1065 base_english += "; " + desc 

1066 else: 

1067 base_english = desc 

1068 item = rest 

1069 elif cls in ("english", "taxonomic"): 

1070 if sense: 1070 ↛ 1071 (line 1070 didn't jump to line 1071 because the condition on line 1070 was never true)

1071 sense += "; " + desc 

1072 else: 

1073 sense = desc 

1074 item = rest 

1075 elif desc.isdigit(): 

1076 idx = int(desc) - 1 

1077 if idx >= 0 and idx < len(pos_datas): 

1078 d = pos_datas[idx] 

1079 gl = "; ".join(d.get("glosses", ())) 

1080 if not gl: 1080 ↛ 1081 (line 1080 didn't jump to line 1081 because the condition on line 1080 was never true)

1081 wxr.wtp.debug( 

1082 "parenthesized numeric linkage prefix, " 

1083 "but the referenced sense has no gloss: " 

1084 "{}".format(desc), 

1085 sortid="linkages/355", 

1086 ) 

1087 elif sense: 

1088 sense += "; " + gl 

1089 else: 

1090 sense = gl 

1091 item = rest 

1092 else: 

1093 wxr.wtp.debug( 

1094 "parenthesized numeric linkage prefix, " 

1095 "but there is no sense with such index: {}".format(desc), 

1096 sortid="linkages/365", 

1097 ) 

1098 item = rest 

1099 else: 

1100 wxr.wtp.debug( 

1101 "unrecognized linkage prefix: {} desc={} rest={} " 

1102 "cls={} cls2={} e1={} e2={}".format( 

1103 item, desc, rest, cls, cls2, e1, e2 

1104 ), 

1105 sortid="linkages/371", 

1106 ) 

1107 item = rest 

1108 

1109 base_sense = sense 

1110 

1111 # Check for certain plural tag forms at end of items list, and apply 

1112 # them to all items if found 

1113 m = re.search( 

1114 r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" 

1115 r"characters|symbols|tetragrams|letter names|names|" 

1116 r"female names|male names|proper nouns|contractions|" 

1117 r"nonstandard spellings|verbs|prepositions|postpositions|" 

1118 r"interjections|Abbreviations|abbreviations|variants|" 

1119 r"ordinals|nouns|phrases|adjectives|adverbs|" 

1120 r"augmentatives|pejoratives|compound words|numerals|" 

1121 r"Tally marks|surnames|modern nonstandard spellings)$", 

1122 item, 

1123 ) 

1124 if m: 

1125 suffix = m.group(1) 

1126 if base_qualifier: 

1127 base_qualifier += ", " + suffix 

1128 else: 

1129 base_qualifier = suffix 

1130 item = item[: m.start()] 

1131 

1132 # Certain linkage items have space-separated values. These are 

1133 # generated by, e.g., certain templates 

1134 if base_sense and base_sense.endswith(" paper sizes"): 

1135 base_qualifier = None 

1136 item = ", ".join(item.split()) 

1137 # XXX isn't this now handled by the generic digits/letters/etc code? 

1138 # elif base_qualifier in ("Arabic digits",): 

1139 # item = ", ".join(item.split()) 

1140 

1141 item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item) # Now empty superscript 

1142 item = item.strip() 

1143 if not item: 

1144 return None 

1145 

1146 # Kludge: if the item contains ")/" (with possibly spaces in between), 

1147 # replace it by a comma so it gets split. 

1148 item = re.sub(r"\)\s*/", "), ", item) 

1149 

1150 # The item may contain multiple comma-separated linkages 

1151 if base_roman: 

1152 subitems = [item] 

1153 else: 

1154 # Split at commas. Also, in most cases split by " or ", but this 

1155 # is complicated - "or" may end certain words (e.g., "logical or") 

1156 # and it may separate head-final tags (e.g. "foo f or m"). Also, 

1157 # some words have parenthesized parts in between, e.g., 

1158 # wife/English/Translations/Yiddish: 

1159 # "ווײַב‎ n (vayb) or f, פֿרוי‎ f (froy)" 

1160 subitems = [] 

1161 for item1 in split_at_comma_semi(item, skipped=links): 

1162 if " or " not in item1: 

1163 subitems.append(item1) 

1164 continue 

1165 # Item1 contains " or " 

1166 item2 = re.sub(r"\s*\([^)]*\)", "", item1) 

1167 item2 = re.sub(r"\s+", " ", item2) 

1168 if ( 

1169 ( 

1170 lang not in head_final_bantu_langs 

1171 or not re.search(head_final_bantu_re, item2) 

1172 ) 

1173 and ( 

1174 lang not in head_final_other_langs 

1175 or not re.search(head_final_other_re, item2) 

1176 ) 

1177 and ( 

1178 not re.search(head_final_re, item2) 

1179 or ( 

1180 item2[-1].isdigit() 

1181 and lang not in head_final_numeric_langs 

1182 ) 

1183 ) 

1184 and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE") 

1185 and all( 

1186 wxr.wtp.title not in x.split(" or ") 

1187 for x in split_at_comma_semi(item2, skipped=links) 

1188 if " or " in x 

1189 ) 

1190 ): 

1191 # We can split this item. Split the non-cleaned version 

1192 # that still has any intervening parenthesized parts. 

1193 subitems.extend( 

1194 split_at_comma_semi(item1, extra=[" or "], skipped=links) 

1195 ) 

1196 else: 

1197 subitems.append(item1) 

1198 if len(subitems) > 1: # Would be merged from multiple subitems 

1199 ruby = [] # XXX what is the purpose of this? 

1200 for item1 in subitems: 

1201 if len(subitems) > 1 and item1 in ("...", "…"): 

1202 # Some lists have ellipsis in the middle - don't generate 

1203 # linkages for the ellipsis 

1204 continue 

1205 item1 = item1.strip() 

1206 qualifier = base_qualifier 

1207 sense = base_sense 

1208 parts = [] 

1209 roman = base_roman # Usually None 

1210 alt = base_alt # Usually None 

1211 taxonomic = None 

1212 english = base_english 

1213 

1214 # Some words have derived terms with parenthesized quoted English 

1215 # descriptions, which can sometimes essentially be tags 

1216 # Some words (bleki/Esperanto...) can have parentheses inside 

1217 # the quotes, so let's make this regex even more unreadable. 

1218 m = re.search(r"\s*\(“([^”]+)”\)", item1) 

1219 if m: 1219 ↛ 1220 (line 1219 didn't jump to line 1220 because the condition on line 1219 was never true)

1220 t = m.group(1) 

1221 item1 = (item1[: m.start()] + item1[m.end() :]).strip() 

1222 cls = classify_desc(t) 

1223 if cls == "tags": 

1224 if qualifier: 

1225 qualifier += ", " + t 

1226 else: 

1227 qualifier = t 

1228 else: 

1229 english = t 

1230 

1231 # Some Korean words use the "word (alt, roman, “english”)" pattern 

1232 # See 滿/Korean 

1233 m = re.match( 

1234 r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), " 

1235 r'[“”"]([^”“"]+)[“”"]\)$', 

1236 item1, 

1237 ) 

1238 if ( 

1239 m 

1240 and classify_desc(m.group(1), no_unknown_starts=True) == "other" 

1241 and classify_desc(m.group(2), no_unknown_starts=True) == "other" 

1242 ): 

1243 alt = m.group(2) 

1244 roman = m.group(3) 

1245 english = m.group(4) 

1246 item1 = m.group(1) 

1247 

1248 words = item1.split(" ") 

1249 if ( 

1250 len(words) > 1 

1251 and words[0] in linkage_beginning_tags 

1252 and words[0] != wxr.wtp.title 

1253 ): 

1254 t = linkage_beginning_tags[words[0]] 

1255 item1 = " ".join(words[1:]) 

1256 if qualifier: 1256 ↛ 1257 (line 1256 didn't jump to line 1257 because the condition on line 1256 was never true)

1257 qualifier += ", " + t 

1258 else: 

1259 qualifier = t 

1260 

1261 # Extract quoted English translations (there are also other 

1262 # kinds of English translations) 

1263 def english_repl(m: re.Match) -> str: 

1264 nonlocal english 

1265 nonlocal qualifier 

1266 v = m.group(1).strip() 

1267 # If v is "tags: sense", handle the tags 

1268 m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v) 

1269 if m1 is not None: 1269 ↛ 1270 (line 1269 didn't jump to line 1270 because the condition on line 1269 was never true)

1270 desc, rest = m1.groups() 

1271 if classify_desc(desc, no_unknown_starts=True) == "tags": 

1272 if qualifier: 

1273 qualifier += ", " + desc 

1274 else: 

1275 qualifier = desc 

1276 v = rest 

1277 if english: 

1278 english += "; " + v 

1279 else: 

1280 english = v 

1281 return "" 

1282 

1283 item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip() 

1284 

1285 # There could be multiple parenthesized parts, and 

1286 # sometimes both at the beginning and at the end. 

1287 # And sometimes even in the middle, as in e.g. 

1288 # wife/English/Translations/Yiddish 

1289 while not script_chars and ( 

1290 not sense or not re.search(script_chars_re, sense) 

1291 ): 

1292 par = None 

1293 nonfirst_par = False 

1294 if par is None: 1294 ↛ 1311line 1294 didn't jump to line 1311 because the condition on line 1294 was always true

1295 # Try to find a parenthesized part from the beginning. 

1296 m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1) 

1297 if m: 

1298 par = m.group(1) 

1299 item1 = item1[m.end() :] 

1300 else: 

1301 # Try to find a parenthesized part at the end or from the 

1302 # middle. 

1303 m = re.search( 

1304 r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?", 

1305 item1, 

1306 ) 

1307 if m: 

1308 par = m.group(1) 

1309 item1 = item1[: m.start()] + item1[m.end() :] 

1310 nonfirst_par = True 

1311 if not par: 

1312 break 

1313 if re.search(linkage_paren_ignore_contains_re, par): 

1314 continue # Skip these linkage descriptors 

1315 par = par.strip() 

1316 # Handle tags from beginning of par. We also handle "other" 

1317 # here as Korean entries often have Hanja form in the 

1318 # beginning of parenthesis, before romanization. Similar 

1319 # for many Japanese entries. 

1320 while par: 1320 ↛ 1341 (line 1320 didn't jump to line 1341 because the condition on line 1320 was always true)

1321 idx = par.find(",") 

1322 if idx <= 0: 

1323 break 

1324 cls = classify_desc(par[:idx], no_unknown_starts=True) 

1325 if cls == "other" and not alt: 1325 ↛ 1326 (line 1325 didn't jump to line 1326 because the condition on line 1325 was never true)

1326 alt = par[:idx] 

1327 elif cls == "taxonomic": 1327 ↛ 1328 (line 1327 didn't jump to line 1328 because the condition on line 1327 was never true)

1328 taxonomic = par[:idx] 

1329 elif cls == "tags": 

1330 if qualifier: 

1331 qualifier += ", " + par[:idx] 

1332 else: 

1333 qualifier = par[:idx] 

1334 else: 

1335 break 

1336 par = par[idx + 1 :].strip() 

1337 

1338 # Check for certain comma-separated tags combined 

1339 # with English text at the beginning or end of a 

1340 # comma-separated parenthesized list 

1341 lst = par.split(",") if len(par) > 1 else [par] 

1342 lst = list(x.strip() for x in lst if x.strip()) 

1343 while len(lst) > 1: 

1344 cls = classify_desc(lst[0], no_unknown_starts=True) 

1345 if cls == "tags": 1345 ↛ 1346 (line 1345 didn't jump to line 1346 because the condition on line 1345 was never true)

1346 if qualifier: 

1347 qualifier += ", " + lst[0] 

1348 else: 

1349 qualifier = lst[0] 

1350 lst = lst[1:] 

1351 continue 

1352 cls = classify_desc(lst[-1], no_unknown_starts=True) 

1353 if cls == "tags": 

1354 if qualifier: 

1355 qualifier += ", " + lst[-1] 

1356 else: 

1357 qualifier = lst[-1] 

1358 lst = lst[:-1] 

1359 continue 

1360 break 

1361 par = ", ".join(lst) 

1362 

1363 # Handle remaining types 

1364 if not par: 1364 ↛ 1365 (line 1364 didn't jump to line 1365 because the condition on line 1364 was never true)

1365 continue 

1366 if re.search(script_chars_re, par): 

1367 script_chars = True 

1368 if classify_desc(par, no_unknown_starts=True) == "tags": 1368 ↛ 1378 (line 1368 didn't jump to line 1378 because the condition on line 1368 was always true)

1369 if base_qualifier: 1369 ↛ 1370 (line 1369 didn't jump to line 1370 because the condition on line 1369 was never true)

1370 base_qualifier += "; " + par 

1371 else: 

1372 base_qualifier = par 

1373 if qualifier: 1373 ↛ 1374 (line 1373 didn't jump to line 1374 because the condition on line 1373 was never true)

1374 qualifier += "; " + par 

1375 else: 

1376 qualifier = par 

1377 else: 

1378 if base_sense: 

1379 base_sense += "; " + par 

1380 else: 

1381 base_sense = par 

1382 if sense: 

1383 sense += "; " + par 

1384 else: 

1385 sense = par 

1386 elif par.endswith(" letter names"): 1386 ↛ 1387 (line 1386 didn't jump to line 1387 because the condition on line 1386 was never true)

1387 if base_qualifier: 

1388 base_qualifier += "; " + par 

1389 else: 

1390 base_qualifier = par 

1391 if qualifier: 

1392 qualifier += "; " + par 

1393 else: 

1394 qualifier = par 

1395 else: 

1396 cls = classify_desc(par) 

1397 # print("classify_desc: {!r} -> {}".format(par, cls)) 

1398 if cls == "tags": 

1399 if qualifier: 

1400 qualifier += ", " + par 

1401 else: 

1402 qualifier = par 

1403 elif cls == "english": 

1404 if nonfirst_par: 

1405 if english: 

1406 english += "; " + par 

1407 else: 

1408 english = par 

1409 else: 

1410 if sense: 1410 ↛ 1411 (line 1410 didn't jump to line 1411 because the condition on line 1410 was never true)

1411 sense += "; " + par 

1412 else: 

1413 sense = par 

1414 elif cls == "romanization": 

1415 roman = par 

1416 elif cls == "taxonomic": 

1417 taxonomic = par 

1418 elif par.isdigit(): 

1419 idx = int(par) - 1 

1420 if idx >= 0 and idx < len(pos_datas): 

1421 d = pos_datas[idx] 

1422 gl = "; ".join(d.get("glosses", ())) 

1423 if not gl: 1423 ↛ 1424 (line 1423 didn't jump to line 1424 because the condition on line 1423 was never true)

1424 wxr.wtp.debug( 

1425 "parenthesized number " 

1426 "but the referenced sense has no " 

1427 "gloss: {}".format(par), 

1428 sortid="linkages/665", 

1429 ) 

1430 elif sense: 1430 ↛ 1433 (line 1430 didn't jump to line 1433 because the condition on line 1430 was always true)

1431 sense += "; " + gl 

1432 else: 

1433 sense = gl 

1434 else: 

1435 wxr.wtp.debug( 

1436 "parenthesized number but there is " 

1437 "no sense with such index: {}".format(par), 

1438 sortid="linkages/674", 

1439 ) 

1440 else: 

1441 if alt: 1441 ↛ 1442 (line 1441 didn't jump to line 1442 because the condition on line 1441 was never true)

1442 alt += "; " + par 

1443 else: 

1444 alt = par 

1445 

1446 # Handle certain special cases, unless we are parsing 

1447 # script characters. 

1448 if not script_chars: 

1449 # Ignore all linkages with certain prefixes, suffixes, or parts 

1450 # (this is done after removing certain prefixes and suffixes) 

1451 if re.search(linkage_ignore_re, item1): 

1452 continue # Ignore linkages with certain prefixes 

1453 

1454 # Remove certain prefixes from linkages 

1455 m = re.match(linkage_remove_prefixes_re, item1) 

1456 if m: 

1457 prefix = item1[: m.end()] 

1458 item1 = item1[m.end() :] 

1459 if prefix in linkage_remove_prefixes_tags: 

1460 if qualifier: 

1461 qualifier += ", " + linkage_remove_prefixes_tags[prefix] 

1462 else: 

1463 qualifier = linkage_remove_prefixes_tags[prefix] 

1464 # Recheck ignored linkages 

1465 if re.search(linkage_ignore_re, item1): 

1466 continue 

1467 

1468 # Remove certain suffixes from linkages 

1469 m = re.search(linkage_remove_suffixes_re, item1) 

1470 if m: 

1471 item1 = item1[: m.start()] 

1472 

1473 # Parse linkages with "value = english" syntax (e.g., 

1474 # väittää/Finnish) 

1475 idx = item1.find(" = ") 

1476 if idx >= 0: 

1477 eng = item1[idx + 3 :] 

1478 if classify_desc(eng, no_unknown_starts=True) == "english": 

1479 english = eng 

1480 item1 = item1[:idx] 

1481 else: 

1482 # Some places seem to use it reversed 

1483 # "english = value" 

1484 eng = item1[:idx] 

1485 if classify_desc(eng, no_unknown_starts=True) == "english": 

1486 english = eng 

1487 item1 = item1[idx + 3 :] 

1488 

1489 # Parse linkages with "value - english" syntax (e.g., 

1490 # man/Faroese) 

1491 m = re.search(r" [-‐‑‒–—―] ", item1) 

1492 if m and "(" not in item1: 

1493 suffix = item1[m.end() :] 

1494 cls = classify_desc(suffix, no_unknown_starts=True) 

1495 if cls == "english": 

1496 # This case intentionally ignores old values from english 

1497 # (otherwise taxonomic lists fail) 

1498 english = suffix 

1499 item1 = item1[: m.start()] 

1500 elif cls == "tags": 

1501 if qualifier: 1501 ↛ 1502 (line 1501 didn't jump to line 1502 because the condition on line 1501 was never true)

1502 qualifier += ", " + suffix 

1503 else: 

1504 qualifier = suffix 

1505 item1 = item1[: m.start()] 

1506 

1507 # Parse certain tags at the end of the linked term (unless 

1508 # we are in a letters list) 

1509 item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1) 

1510 if q: 

1511 if qualifier: 1511 ↛ 1512 (line 1511 didn't jump to line 1512 because the condition on line 1511 was never true)

1512 qualifier += ", " + ", ".join(q) 

1513 else: 

1514 qualifier = ", ".join(q) 

1515 

1516 m = re.search(linkage_truncate_re, item1) 

1517 if m: 1517 ↛ 1519 (line 1517 didn't jump to line 1519 because the condition on line 1517 was never true)

1518 # suffix = item1[m.start():] # Currently ignored 

1519 item1 = item1[: m.start()] 

1520 if not item1: 

1521 continue # Ignore empty link targets 

1522 if item1 == word: 

1523 continue # Ignore self-links 

1524 

1525 def add(w: str, r: Optional[str]) -> None: 

1526 assert isinstance(w, str) 

1527 assert r is None or isinstance(r, str) 

1528 nonlocal alt 

1529 nonlocal taxonomic 

1530 

1531 # We remove "*" from the beginning of reconstruction linkages. 

1532 # Such linkages should only occur in reconstruction senses, so 

1533 # this should not cause ambiguity. 

1534 if is_reconstruction and w.startswith("*"): 

1535 w = w[1:] 

1536 

1537 # Check if the word contains the Fullwidth Solidus, and if 

1538 # so, split by it and treat the results as alternative 

1539 # linkages. (This is very commonly used for alternative 

1540 # written forms in Chinese compounds and other linkages.) 

1541 # However, if the word contains a comma, then we won't 

1542 # split as this is used when we have a different number 

1543 # of romanizations than written forms, and don't know 

1544 # which is which. 

1545 if ( 

1546 (not w or "," not in w) 

1547 and (not r or "," not in r) 

1548 and not wxr.wtp.page_exists(w) 

1549 ): 

1550 lst = w.split("/") if len(w) > 1 else [w] 

1551 if len(lst) == 1: 

1552 lst = w.split(" / ") 

1553 if len(lst) == 1 and len(lst[0]) >= 6: 

1554 lst = w.split("/") 

1555 if len(lst) > 1: 

1556 # Treat each alternative as separate linkage 

1557 for w in lst: 

1558 add(w, r) 

1559 return None 

1560 

1561 # Heuristically remove "." at the end of most linkages 

1562 # (some linkage lists end in a period, but we also have 

1563 # abbreviations that end with a period that should be kept) 

1564 if ( 

1565 w.endswith(".") 

1566 and not wxr.wtp.page_exists(w) 

1567 and ( 

1568 wxr.wtp.page_exists(w[:-1]) 

1569 or (len(w) >= 5) 

1570 and "." not in w[:-1] 

1571 ) 

1572 ): 

1573 w = w[:-1] 

1574 

1575 # If we have roman but not alt and the word is ASCII, 

1576 # move roman to alt. 

1577 if r and not alt and w.isascii(): 

1578 alt = r 

1579 r = None 

1580 # Add the linkage 

1581 dt: LinkageData = {} 

1582 if qualifier: 

1583 parse_sense_qualifier(wxr, qualifier, dt) 

1584 if sense: 

1585 dt["sense"] = sense.strip() 

1586 if r: 

1587 dt["roman"] = r.strip() 

1588 if ruby: 

1589 dt["ruby"] = ruby 

1590 if english: 

1591 dt["english"] = english.strip() # DEPRECATED for "translation" 

1592 dt["translation"] = english.strip() 

1593 if taxonomic: 

1594 if re.match(r"×[A-Z]", taxonomic): 

1595 data_append(dt, "tags", "extinct") 

1596 taxonomic = taxonomic[1:] 

1597 dt["taxonomic"] = taxonomic 

1598 if re.match(r"×[A-Z]", w): 

1599 data_append(dt, "tags", "extinct") 

1600 w = w[1:] # Remove × before dead species names 

1601 if alt and re.match(r"×[A-Z]", alt): 

1602 data_append(dt, "tags", "extinct") 

1603 alt = alt[1:] # Remove × before dead species names 

1604 if alt and alt.strip() != w: 

1605 dt["alt"] = alt.strip() 

1606 if urls: 

1607 dt["urls"] = [ 

1608 url.strip() for url in urls if url and isinstance(url, str) 

1609 ] 

1610 dt["word"] = w 

1611 for old in data.get(field, ()): # type: ignore[attr-defined] 

1612 if dt == old: 

1613 break 

1614 else: 

1615 data_append(data, field, dt) 

1616 

1617 # Handle exceptional linkage splits and other linkage 

1618 # conversions (including expanding to variant forms) 

1619 if item1 in linkage_split_exceptions: 1619 ↛ 1620 (line 1619 didn't jump to line 1620 because the condition on line 1619 was never true)

1620 for item2 in linkage_split_exceptions[item1]: 

1621 add(item2, roman) 

1622 continue 

1623 

1624 # Various templates for letters in scripts use spaces as 

1625 # separators and also have multiple characters without 

1626 # spaces consecutively. 

1627 v = sense or qualifier 

1628 # print("lang={} v={} script_chars={} item1={!r}" 

1629 # .format(wxr.wtp.section, v, script_chars, item1)) 

1630 if v and script_chars: 

1631 if ( 

1632 len(item1.split()) > 1 

1633 or len(list(re.finditer(unicode_dc_re, item1))) == 2 

1634 or (len(subitems) > 10 and v in ("Hiragana", "Katakana")) 

1635 ): 

1636 if v == qualifier: 

1637 # if sense: 

1638 # sense += "; " + qualifier 

1639 # else: 

1640 # sense = qualifier 

1641 qualifier = None 

1642 if re.search(r" (letters|digits|script)$", v): 

1643 qualifier = v # Also parse as qualifier 

1644 elif re.search( 1644 ↛ 1651 (line 1644 didn't jump to line 1651 because the condition on line 1644 was always true)

1645 r"Variations of letter |" 

1646 r"Letters using |" 

1647 r"Letters of the ", 

1648 v, 

1649 ): 

1650 qualifier = "letter" 

1651 parts = item1.split(". ") 

1652 extra: Sequence[str] = () 

1653 if len(parts) > 1: 1653 ↛ 1654 (line 1653 didn't jump to line 1654 because the condition on line 1653 was never true)

1654 extra = parts[1:] 

1655 item1 = parts[0] 

1656 # Handle multi-character names for chars in language's 

1657 # alphabet, e.g., "Ny ny" in P/Hungarian. 

1658 if ( 

1659 len(subitems) > 20 

1660 and len(item1.split()) == 2 

1661 and all(len(x) <= 3 for x in item1.split()) 

1662 ): 

1663 parts = list( 

1664 m.group(0) 

1665 for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1) 

1666 if not m.group(0).isspace() 

1667 and m.group(0) not in ("(", ")") 

1668 ) 

1669 else: 

1670 parts = list( 

1671 m.group(0) 

1672 for m in re.finditer(r".[\u0300-\u036f]?", item1) 

1673 if not m.group(0).isspace() 

1674 and m.group(0) not in ("(", ")") 

1675 ) 

1676 for e in extra: 1676 ↛ 1677 (line 1676 didn't jump to line 1677 because the loop on line 1676 never started)

1677 idx = e.find(":") 

1678 if idx >= 0: 

1679 e = e[idx + 1 :].strip() 

1680 if e.endswith("."): 

1681 e = e[:-1] 

1682 parts.extend(e.split()) 

1683 

1684 # XXX this is not correct - see P/Vietnamese 

1685 # While some sequences have multiple consecutive 

1686 # characters, others use pairs and some have 

1687 # 2/3 character names, e.g., "Ng ng". 

1688 

1689 rparts: Optional[list[Optional[str]]] = None 

1690 if roman: 1690 ↛ 1691 (line 1690 didn't jump to line 1691 because the condition on line 1690 was never true)

1691 rparts = list( 

1692 m.group(0) 

1693 for m in re.finditer(r".[\u0300-\u036f]", roman) 

1694 if not m.group(0).isspace() 

1695 ) 

1696 if len(rparts) != len(parts): 

1697 rparts = None 

1698 if not rparts: 1698 ↛ 1701 (line 1698 didn't jump to line 1701 because the condition on line 1698 was always true)

1699 rparts = [None] * len(parts) 

1700 

1701 for w, r in zip(parts, rparts): 

1702 add(w, r) 

1703 continue 

1704 

1705 add(item1, roman) 

1706 return None