Coverage for src/wiktextract/extractor/en/linkages.py: 72%

758 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-05-29 08:54 +0000

1# Code related to parsing linkages (synonyms, hypernyms, related terms, etc) 

2# 

3# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import re 

6import unicodedata 

7from typing import Optional, Sequence 

8 

9from wikitextprocessor import ( 

10 LevelNode, 

11 NodeKind, 

12 TemplateNode, 

13 WikiNode, 

14) 

15from wikitextprocessor.core import TemplateArgs 

16from wikitextprocessor.parser import ( 

17 HTMLNode, 

18 is_list, 

19 is_list_item, 

20) 

21 

22from ...datautils import ( 

23 data_append, 

24 data_extend, 

25 ns_title_prefix_tuple, 

26 split_at_comma_semi, 

27) 

28from ...page import clean_node, is_panel_template 

29from ...tags import linkage_beginning_tags, valid_tags 

30from ...wxr_context import WiktextractContext 

31from ..ruby import extract_ruby, parse_ruby # noqa: F401 

32from .form_descriptions import ( 

33 classify_desc, 

34 decode_tags, 

35 head_final_bantu_langs, 

36 head_final_bantu_re, 

37 head_final_numeric_langs, 

38 head_final_other_langs, 

39 head_final_other_re, 

40 head_final_re, 

41 parse_head_final_tags, 

42 parse_sense_qualifier, 

43) 

44from .section_titles import TRANSLATIONS_TITLE 

45from .type_utils import FormData, LinkageData, SenseData, WordData 

46 

# A linkage is dropped before splitting when it starts with one of these
# literal strings.
linkage_pre_split_ignore_re = re.compile(
    "^({})".format(
        "|".join(
            map(
                re.escape,
                (
                    "For more variations, see ",
                    "Signal flag:",
                    "Semaphore:",
                ),
            )
        )
    )
)

# A linkage is dropped when it starts with any of these literal prefixes.
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "see also Appendix:",
    "see also Thesaurus:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# A linkage is dropped when it ends with any of these literal suffixes.
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# A linkage is dropped when it is exactly one of these strings.
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# Combined pattern: full match against linkage_ignore_whole, prefix match
# against linkage_ignore_prefixes, or suffix match against
# linkage_ignore_suffixes.
linkage_ignore_re = re.compile(
    "^({})$|^({})|({})$".format(
        "|".join(map(re.escape, linkage_ignore_whole)),
        "|".join(map(re.escape, linkage_ignore_prefixes)),
        "|".join(map(re.escape, linkage_ignore_suffixes)),
    )
)

110 

# Literal prefixes that are stripped from a linkage, keeping the remainder.
# Applied separately to each linkage in a list. Order matters: longer
# prefixes come before the shorter ones they contain.
linkage_remove_prefixes_re = re.compile(
    "^({})".format(
        "|".join(
            map(
                re.escape,
                (
                    ":",
                    "see Thesaurus:",
                    "See Thesaurus:",
                    "see also Thesaurus:",
                    "See also Thesaurus:",
                    "see also ",
                    "See also ",
                    "see ",
                    "See ",
                    "from ",
                    "abbreviation of ",
                    "ISO 639-1 code ",
                    "ISO 639-3 code ",
                    "Thesaurus:",
                ),
            )
        )
    )
)

# Maps a removed prefix to a space-separated list of tags to add to the
# linkage it was removed from.
linkage_remove_prefixes_tags = {"abbreviation of ": "abbreviation"}

# Trailing text that is stripped from a linkage, keeping the rest. Applied
# separately to each linkage in a list.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|[A-Z]\w+ Wiktionary|"
    r"[A-Z]\w+ Wikipedia)\.?|\s*[-–] Pre-reform orthography.*)$"
)

# Parenthesized sections of a linkage containing one of these phrases are
# ignored.
linkage_paren_ignore_contains_re = re.compile(
    r"\b({})([, ]|$)".format(
        "|".join(
            map(re.escape, ("from Etymology", "used as", "usage notes"))
        )
    )
)

165 

# Taxonomic rank names that may terminate a hyponym list, mapped to their
# singular form.
taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
# Singular forms are accepted too; each maps to itself. Iterate over a
# snapshot because the dictionary grows while we add entries.
for k, v in tuple(taxonomic_ending_map.items()):
    taxonomic_ending_map.setdefault(v, v)

# Matches " <dash> <rank>" at the end of an item; group 1 is the rank name
# as written (a key of taxonomic_ending_map).
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+("
    + "|".join(map(re.escape, taxonomic_ending_map))
    + r")$"
)

189 

# Hand-written splits for particular linkages that the default splitting
# code gets wrong. Also usable for automatic aliases, e.g. mapping both
# "..." and "…" to the pair of them.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

# A linkage word is truncated at the first occurrence of any of these
# literal strings.
linkage_truncate_re = re.compile(
    "|".join(
        map(
            re.escape,
            (
                " and its derived terms",
                " UTF-16 0x214C",
            ),
        )
    )
)

209 

# Regexp for identifying special linkages containing lists of letters, digits,
# or characters. The first group of alternatives must be followed by ";" or
# the end of the string; the second group must start the string or follow
# "; "; "Hiragana"/"Katakana" must be the whole string.
# "Punctuation" is accepted alongside the historical misspelling
# "Puctuation", which is kept so existing matches are unaffected.
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Punctuation|Puctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)

222 

# Matches one word character followed by a single character of Unicode
# category "Mn" (non-spacing mark), or otherwise any single character.
# The character class is built by scanning every code point once at import.
unicode_dc_re = re.compile(
    r"\w["
    + "".join(
        c
        for c in map(chr, range(0x110000))
        if unicodedata.category(c) == "Mn"
    )
    + "]|."
)

234 

235 

def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
    """Collect alternative forms from the list items of ``level_node``.

    Link-style templates (``l``, ``link``, ``L``, ``alt``, ``alter``) are
    delegated to ``extract_l_template``. Plain wiki links are cleaned to
    text and, when non-empty, appended to ``word_entry["forms"]`` with the
    ``alternative`` tag.
    """
    link_templates = ("l", "link", "L", "alt", "alter")
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in list_item.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name in link_templates
                ):
                    extract_l_template(wxr, word_entry, child)
                    continue
                if not (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                ):
                    continue
                text = clean_node(wxr, None, child)
                if text != "":
                    alt_form: FormData = {
                        "form": text,
                        "tags": ["alternative"],
                    }
                    data_append(word_entry, "forms", alt_form)

255 

256 

def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
    """Extract alternative forms from an expanded link-style template.

    The template is expanded and its ``<span>`` elements are scanned in
    order:

    * a span whose ``lang`` equals the template's first parameter starts a
      new form tagged ``alternative``;
    * a span whose ``lang`` ends in ``-Latn`` sets ``roman`` on the most
      recent form;
    * a span whose ``class`` contains ``label-content`` and whose text
      classifies as tags adds the decoded tags to every form collected so
      far.

    The collected forms are appended to ``word_entry["forms"]``.
    """
    collected: list[FormData] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        class_attr = span.attrs.get("class", "")
        if lang_attr == lang_code:
            text = clean_node(wxr, None, span)
            if text != "":
                new_form: FormData = {"form": text, "tags": ["alternative"]}
                collected.append(new_form)
        elif lang_attr.endswith("-Latn") and len(collected) > 0:
            romanization = clean_node(wxr, None, span)
            if romanization != "":
                collected[-1]["roman"] = romanization
        elif "label-content" in class_attr and len(collected) > 0:
            label = clean_node(wxr, None, span)
            if classify_desc(label) == "tags":
                tagsets, _ = decode_tags(label)
                label_tags: list[str] = [
                    tag for tagset in tagsets for tag in tagset
                ]
                for entry in collected:
                    entry["tags"].extend(label_tags)
    data_extend(word_entry, "forms", collected)

287 

288 

# Maps raw labels found in the expanded {{zh-dial}} table to lists of tags.
# extract_zh_dial_template() looks raw tags up here first; a raw tag found
# here is replaced by the mapped tags.
ZH_DIAL_TAGS = {
    "Classical Chinese": ["Classical-Chinese"],
    "Formal": ["formal"],
    "Written Standard Chinese": ["Written-vernacular-Chinese"],
    "Northeastern Mandarin": ["Northeastern-Mandarin"],
    "Jilu Mandarin": ["Jilu-Mandarin"],
    "Jiaoliao Mandarin": ["Jiaoliao-Mandarin"],
    "Central Plains Mandarin": ["Central-Plains-Mandarin"],
    "Lanyin Mandarin": ["Lanyin-Mandarin"],
    "Southwestern Mandarin": ["Southwestern-Mandarin"],
    "Jianghuai Mandarin": ["Jianghuai-Mandarin"],
    "Northern Min": ["Min-Bei"],
    "Eastern Min": ["Min-Dong"],
    "Southern Min": ["Min-Nan"],
    "Zhongshan Min": ["Zhongshan-Min"],
    "Southern Pinghua": ["Southern-Pinghua"],
    "Puxian Min": ["Puxian-Min"],
}

307 

308 

def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordData,
    t_node: TemplateNode,
    sense: str,
):
    """Extract dialectal synonyms from an expanded ``zh-dial`` template.

    Every table in the expansion is read twice: once to collect the
    footnote legend from the "Note" row (``symbol - meaning`` pairs
    separated by ``;``), and once to collect the ``lang="zh"`` spans as
    linkages, tagged with the row header and the most recent link text of
    the cell. Small-font spans (``font-size:60%``) attach footnote symbols,
    translated through the legend when possible, to the preceding linkage.

    Finally each raw tag is converted through ``ZH_DIAL_TAGS`` or accepted
    as-is when it is in ``valid_tags``; the rest stay under ``raw_tags``.
    The result is appended to ``word_entry["synonyms"]``. A non-empty
    ``sense`` is stored on every linkage.
    """
    # https://en.wiktionary.org/wiki/Template:zh-dial
    from .pronunciation import split_zh_pron_raw_tag

    linkages: list[LinkageData] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded.find_child_recursively(NodeKind.TABLE):
        # Pass 1: build the footnote legend from the "Note" row.
        in_note_row = False
        footnotes = {}
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    in_note_row = clean_node(wxr, None, cell) == "Note"
                    continue
                if not in_note_row:
                    continue
                for entry in clean_node(wxr, None, cell).split(";"):
                    symbol, dash, meaning = entry.partition("-")
                    if not dash:
                        continue
                    symbol = symbol.strip()
                    meaning = meaning.strip()
                    if symbol != "" and meaning != "":
                        footnotes[symbol] = meaning

        # Pass 2: collect the words. lang_tags and region_tags carry over
        # between rows/cells until a new header or link replaces them.
        lang_tags = []
        region_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            if not row.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header)
                )
            if lang_tags == ["Note"]:  # skip last note row
                continue
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for link in cell.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link)
                    )
                for span in cell.find_html("span"):
                    text = clean_node(wxr, None, span)
                    if text == "":
                        continue
                    is_word = (
                        span.attrs.get("lang", "") == "zh"
                        and text != wxr.wtp.title
                    )
                    if is_word:
                        item: LinkageData = {"word": text}
                        if sense != "":
                            item["sense"] = sense
                        if len(lang_tags) > 0:
                            data_extend(item, "raw_tags", lang_tags)
                        if len(region_tags) > 0:
                            data_extend(item, "raw_tags", region_tags)
                        linkages.append(item)
                    elif (
                        span.attrs.get("style", "") == "font-size:60%"
                        and len(linkages) > 0
                    ):
                        for symbol in text.split(","):
                            symbol = symbol.strip()
                            raw = footnotes.get(symbol, symbol)
                            if raw != "":
                                data_append(linkages[-1], "raw_tags", raw)

    # Convert recognised raw tags into tags; keep only the unrecognised
    # ones under "raw_tags".
    for item in linkages:
        unrecognised = []
        for raw in item.get("raw_tags", []):
            if raw in ZH_DIAL_TAGS:
                data_extend(item, "tags", ZH_DIAL_TAGS[raw])
            elif raw in valid_tags:
                data_append(item, "tags", raw)
            else:
                unrecognised.append(raw)
        if len(unrecognised) > 0:
            item["raw_tags"] = unrecognised
        elif "raw_tags" in item:
            del item["raw_tags"]
    data_extend(word_entry, "synonyms", linkages)

399 

400 

def parse_linkage(
    wxr: WiktextractContext,
    data: WordData,
    field: str,
    linkagenode: LevelNode,
    word: str,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
) -> None:
    """Parse one linkage section and store its linkages in ``data[field]``.

    Does nothing when ``wxr.config.capture_linkages`` is false. ``zh-dial``
    templates directly under the section are handled by
    ``extract_zh_dial_template``; everything else is re-serialized,
    expanded (panel templates and ``table:``/``list:`` templates expand to
    nothing) and walked with ``parse_linkage_recurse``.

    If that produced no linkages and no panel template was seen, the text
    found outside list items is tried as a single comma-separated item,
    provided it is one line, has more than three commas and does not start
    with "See ".
    """
    assert isinstance(data, dict)
    assert isinstance(field, str)
    assert isinstance(linkagenode, LevelNode)
    if not wxr.config.capture_linkages:
        return
    have_panel_template = False

    def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
        nonlocal have_panel_template
        if is_panel_template(wxr, name):
            have_panel_template = True
            return ""
        # Ignore auto-filled templates like Template:table:Solar System/en
        return "" if name.startswith(("table:", "list:")) else None

    # Main body of parse_linkage()
    kept_nodes: list[str | WikiNode] = []
    dial_sense = ""
    for child in linkagenode.children:
        if isinstance(child, TemplateNode) and child.template_name == "zh-dial":
            extract_zh_dial_template(wxr, data, child, dial_sense)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # Remember the latest sense template seen in the list; it is
            # passed to any zh-dial template that follows.
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                for tmpl in list_item.find_child(NodeKind.TEMPLATE):
                    if tmpl.template_name in ["s", "sense"]:
                        dial_sense = clean_node(wxr, None, tmpl).strip("(): ")
        kept_nodes.append(child)

    parsed = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(kept_nodes),
        expand_all=True,
        template_fn=linkage_template_fn1,
    )
    stray_text = parse_linkage_recurse(
        wxr,
        parsed.children,
        field,
        None,
        None,
        word,
        data,
        sense_datas,
        is_reconstruction,
    )

    if data.get(field) or have_panel_template:
        return
    text = "".join(stray_text).strip()
    if "\n" in text or text.count(",") <= 3 or text.startswith("See "):
        return
    parse_linkage_item(
        wxr,
        [text],
        field,
        word,
        data,
        sense_datas,
        is_reconstruction,
        None,
    )

475 

476 

def parse_linkage_recurse(
    wxr: WiktextractContext,
    contents: list[WikiNode | str],
    field: str,
    sense: str | None,
    block_header_sense: str | None,
    word: str,
    data,
    sense_datas,
    is_reconstruction,
) -> list[str]:
    """Walk ``contents`` and parse every list item / table cell as a linkage.

    Lists, tables, table rows, most HTML elements, level nodes and the
    display part of links are descended into; list items and table cells
    are handed to ``parse_linkage_item``. Returns the strings that were
    encountered outside of list items, in document order, including those
    returned by the recursive calls.
    """
    assert isinstance(contents, (list, tuple))
    assert sense is None or isinstance(sense, str)

    outside_text: list[str] = []

    def descend(children, child_sense) -> None:
        # Recurse with the *current* block_header_sense and collect the
        # stray text the recursion reports.
        outside_text.extend(
            parse_linkage_recurse(
                wxr,
                children,
                field,
                child_sense,
                block_header_sense,
                word,
                data,
                sense_datas,
                is_reconstruction,
            )
        )

    for node in contents:
        if isinstance(node, str):
            # Top-level text is normally just commentary before the
            # linkage list, but the caller falls back to it when no
            # linkages are found (not every page uses bullet points).
            outside_text.append(node)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if is_list(node) or kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
            descend(node.children, sense)
        elif is_list_item(node) or kind == NodeKind.TABLE_CELL:
            v = parse_linkage_item(
                wxr,
                node.children,
                field,
                word,
                data,
                sense_datas,
                is_reconstruction,
                sense,
            )
            if v is not None:
                # parse_linkage_item() can return a value that should
                # be used as the sense for the follow-on linkages,
                # which are typically provided in a table (see 滿)
                block_header_sense = "".join(v)
        elif kind in (
            NodeKind.TABLE_CAPTION,
            NodeKind.TABLE_HEADER_CELL,
            NodeKind.PREFORMATTED,
            NodeKind.BOLD,
        ):
            # Table decorations and the like are ignored
            continue
        elif isinstance(node, HTMLNode):
            # Most HTML tags are simply descended into
            if node.sarg in ("gallery", "ref", "cite", "caption"):
                continue
            classes = (node.attrs.get("class") or "").replace("+", " ").split()
            if "qualifier-content" in classes:
                sense1 = clean_node(wxr, None, node.children)
                if sense1.endswith(":"):
                    sense1 = sense1[:-1].strip()
                if sense and sense1:
                    wxr.wtp.debug(
                        "linkage qualifier-content on multiple "
                        "levels: {!r} and {!r}".format(sense, sense1),
                        sortid="page/2170",
                    )
                descend(node.children, sense1)
            elif "list-switcher-header" in classes:
                header = clean_node(wxr, None, node.children)
                if header.endswith(":"):
                    header = header[:-1].strip()
                block_header_sense = header
            elif any(x in classes for x in ("NavFrame", "term-list")):
                # NavFrame uses previously assigned block_header_sense
                # (from a "(sense):" item) and clears it afterwards
                descend(node.children, sense or block_header_sense)
                block_header_sense = None
            else:
                descend(node.children, sense)
        elif isinstance(node, LevelNode):
            # Subsections are just descended into
            descend(node.children, sense)
        elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
            # Skipped on top level; bold is at least sometimes used to
            # mark a subtitle
            continue
        elif kind == NodeKind.LINK:
            # Descend into the last argument of the link
            descend(node.largs[-1], sense)
        else:
            wxr.wtp.debug(
                "parse_linkage_recurse unhandled {}: {}".format(kind, node),
                sortid="page/2196",
            )

    return outside_text

645 

646 

def parse_linkage_item(
    wxr: WiktextractContext,
    contents: list[str | WikiNode],
    field: str,
    word: str,
    data: WordData,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
    sense: str | None = None,
) -> list[str]:
    """Parse the contents of one linkage list item or table cell.

    The nodes in ``contents`` are flattened into text pieces by the nested
    ``item_recurse`` helper, which also collects ruby pairs, URLs and link
    texts along the way and dispatches nested lists/tables back to
    ``parse_linkage_item`` / ``parse_linkage_recurse``. Unless the
    top-level ``item_recurse`` call reports that the collected text was a
    sense header, the cleaned text is passed to
    ``parse_linkage_item_text``.

    Returns a one-element list holding the value returned by
    ``parse_linkage_item_text`` when that value is truthy, otherwise an
    empty list.
    """
    assert isinstance(contents, (list, tuple))
    assert isinstance(field, str)
    assert sense is None or isinstance(sense, str)

    # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
    #    .format(field, sense, contents))

    # Accumulators shared with item_recurse() below.
    parts: list[str] = []
    ruby: list[tuple[str, str]] = []
    urls: list[str] = []
    # data about link text; this is used to skip splitting on
    # linkage text items that contain stuff like commas; for
    # example "Hunde, die bellen, beißen nicht" in article
    # beißen is split into "Hunde", "die bellen" etc.
    # We take that link text and use it, eventually,
    # in split_at_comma_semi to skip splitting on those
    # commas.
    links_that_should_not_be_split: list[str] = []

    def item_recurse(
        contents: list[str | WikiNode], possible_sense: str | None = None
    ) -> bool:
        # Walks ``contents``, appending text to ``parts`` and handling
        # nested structures. Returns the is_sense flag as last set while
        # processing this particular ``contents`` list (nested calls keep
        # their own flag and their return value is not used).
        assert isinstance(contents, (list, tuple))
        nonlocal sense
        nonlocal ruby
        nonlocal parts
        is_sense = False
        # print("ITEM_RECURSE:", contents)
        for node in contents:
            if isinstance(node, str):
                parts.append(node)
                continue
            kind = node.kind
            # print(
            #     "ITEM_RECURSE KIND:",
            #     kind,
            #     node.sarg if node.sarg else node.largs,
            # )

            #### parts into possible_sense
            # NOTE(review): ``and`` binds tighter than ``or``, so ``parts``
            # only guards the table-kind test; list and list-item nodes
            # enter this branch even when ``parts`` is empty. Confirm that
            # this is the intended grouping.
            if (
                is_list_item(node)
                or is_list(node)
                or kind
                in (
                    NodeKind.TABLE,
                    NodeKind.TABLE_ROW,
                    NodeKind.TABLE_CELL,
                )
                and parts
            ):
                # print(f"{parts=}")
                # The text collected so far is a sense header if it ends
                # with ":" or is wrapped in parentheses.
                candidate_sense: str | None
                candidate_sense = clean_node(wxr, None, parts)
                is_sense = False

                if candidate_sense.endswith(":"):
                    is_sense = True
                    candidate_sense = candidate_sense[:-1].strip()
                if candidate_sense.startswith("(") and candidate_sense.endswith(
                    ")"
                ):
                    is_sense = True
                    candidate_sense = candidate_sense[1:-1].strip()
                if (
                    candidate_sense.lower() == TRANSLATIONS_TITLE
                    or not is_sense
                ):
                    candidate_sense = None
                # print(f"{possible_sense=}, {is_sense=}")
                if is_sense:
                    # The header text is consumed: it becomes the sense for
                    # the nested structure and is dropped from ``parts``.
                    possible_sense = candidate_sense
                    parts = []
            else:
                candidate_sense = None

            # Handle nodes
            if is_list_item(node):
                parse_linkage_item(
                    wxr,
                    node.children,
                    field,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                    possible_sense or sense,
                )
            elif is_list(node) or kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
                NodeKind.TABLE_CELL,
            ):
                parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    possible_sense or sense,
                    None,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CAPTION,
            ):
                continue
            elif kind == NodeKind.HTML:
                classes = (node.attrs.get("class") or "").split()
                if node.sarg in ("gallery", "ref", "cite", "caption"):
                    continue
                elif node.sarg == "ruby":
                    # Keep the parsed ruby pair and use its first element
                    # as the item text.
                    rb = parse_ruby(wxr, node)
                    if rb:
                        ruby.append(rb)
                        parts.append(rb[0])
                    continue
                elif node.sarg == "math":
                    parts.append(clean_node(wxr, None, node))
                    continue
                elif "interProject" in classes:
                    continue  # These do not seem to be displayed
                if "NavFrame" in classes:
                    parse_linkage_recurse(
                        wxr,
                        node.children,
                        field,
                        possible_sense or sense,
                        None,
                        word,
                        data,
                        sense_datas,
                        is_reconstruction,
                    )
                else:
                    item_recurse(node.children, possible_sense)
            elif kind == NodeKind.LINK:
                ignore = False
                if isinstance(node.largs[0][0], str):
                    # Links whose target is in the Category or File
                    # namespace are skipped.
                    v1 = node.largs[0][0].strip().lower()
                    if v1.startswith(
                        ns_title_prefix_tuple(wxr, "Category", True)
                        + ns_title_prefix_tuple(wxr, "File", True)
                    ):
                        ignore = True
                    if not ignore:
                        v = node.largs[-1]
                        # A single-argument link starting with ":" has the
                        # leading colon removed from its text.
                        if (
                            len(node.largs) == 1
                            and len(v) > 0
                            and isinstance(v[0], str)
                            and v[0][0] == ":"
                        ):
                            v = [v[0][1:]] + list(v[1:])  # type:ignore
                        # Link text that is not purely alphanumeric is
                        # recorded so it can be protected from splitting.
                        if isinstance(v[0], str) and not v[0].isalnum():
                            links_that_should_not_be_split.append("".join(v[0]))  # type: ignore
                        item_recurse(v, possible_sense)
            elif kind == NodeKind.URL:
                if len(node.largs) < 2 and node.largs:
                    # Naked url captured
                    urls.extend(node.largs[-1])  # type:ignore[arg-type]
                    continue
                if len(node.largs) == 2:
                    # Url from link with text
                    urls.append(node.largs[0][-1])  # type:ignore[arg-type]
                # print(f"{node.largs=!r}")
                # print("linkage recurse URL {}".format(node))
                item_recurse(node.largs[-1], possible_sense)
            elif kind in (
                NodeKind.PREFORMATTED,
                NodeKind.BOLD,
                NodeKind.ITALIC,
            ):
                # Note: possible_sense is not forwarded here.
                item_recurse(node.children)
            else:
                wxr.wtp.debug(
                    "linkage item_recurse unhandled {}: {}".format(
                        node.kind, node
                    ),
                    sortid="page/2073",
                )

        return is_sense

    # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
    #       .format(contents))

    is_sense = item_recurse(contents)

    if not is_sense:
        item = clean_node(wxr, None, parts)
        # print("LINKAGE ITEM CONTENTS:", parts)
        # print("CLEANED ITEM: {!r}".format(item))
        # print(f"URLS {urls=!r}")

        # Empty url/link lists are passed as None.
        if v := parse_linkage_item_text(
            wxr,
            word,
            data,
            field,
            item,
            sense,
            ruby,
            sense_datas,
            is_reconstruction,
            urls or None,
            links_that_should_not_be_split or None,
        ):
            return [v]

    return []

870 

871 

872def parse_linkage_item_text( 

873 wxr: WiktextractContext, 

874 word: str, 

875 data: WordData, 

876 field: str, 

877 item: str, 

878 sense: Optional[str], 

879 ruby: list, 

880 pos_datas: list, 

881 is_reconstruction: bool, 

882 urls: Optional[list[str]] = None, 

883 links: Optional[list[str]] = None, 

884) -> Optional[str]: 

885 """Parses a linkage item once it has been converted to a string. This 

886 may add one or more linkages to ``data`` under ``field``. This 

887 returns None or a string that contains a sense that should be applied 

888 to additional linkages (commonly used in tables for Asian characters).""" 

889 assert isinstance(wxr, WiktextractContext) 

890 assert isinstance(word, str) # Main word (derived from page title) 

891 assert isinstance(data, dict) # Parsed linkages are stored here under field 

892 assert isinstance(field, str) # The field under which to store linkage 

893 assert isinstance(item, str) # The string to parse 

894 assert sense is None or isinstance(sense, str) 

895 assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" 

896 assert isinstance(pos_datas, list) # List of senses (containing "glosses") 

897 assert urls is None or isinstance(urls, list) # Captured urls 

898 assert is_reconstruction in (True, False) 

899 

900 item = item.replace("()", "") 

901 item = re.sub(r"\s+", " ", item) 

902 item = item.strip() 

903 

904 base_roman = None 

905 base_alt = None 

906 base_english = None 

907 script_chars = False 

908 base_qualifier = None 

909 lang = wxr.wtp.section 

910 

911 # If ``sense`` can be parsed as tags, treat it as tags instead 

912 if sense: 

913 cls = classify_desc(sense, no_unknown_starts=True) 

914 if cls == "tags": 

915 base_qualifier = sense 

916 sense = None 

917 

918 # Check if this item is a stand-alone sense (or tag) specifier 

919 # for following items (e.g., commonly in a table, see 滿) 

920 m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item) 

921 if m: 

922 return m.group(1) 

923 

924 # Check for pre-split ignored linkages using the appropriate regexp 

925 if re.search(linkage_pre_split_ignore_re, item): 

926 return None 

927 

928 # print(" LINKAGE ITEM: {}: {} (sense {})" 

929 # .format(field, item, sense)) 

930 

931 # Replace occurrences of ~ in the item by the page title 

932 safetitle = wxr.wtp.title.replace("\\", "\\\\") # type: ignore[union-attr] 

933 item = item.replace(" ~ ", " " + safetitle + " ") 

934 item = re.sub(r"^~ ", safetitle + " ", item) 

935 item = re.sub(r" ~$", " " + safetitle, item) 

936 

937 # Many taxonomic terms contain hyponym lists that end with the 

938 # kind of the hyponym (a taxonomic level in plural). Recognize 

939 # such and add the term in singular to all linkages in the list. 

940 m = re.search(taxonomic_ending_re, item) 

941 if m: 

942 base_english = taxonomic_ending_map[m.group(1)] 

943 item = item[: m.start()] 

944 

945 # Some Korean and Japanese words use "word (romanized): english" pattern 

946 # Sometimes the parenthesized part contains comma-separated alt and roman. 

947 m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item) 

948 if m: 

949 rom = m.group(2) 

950 eng = m.group(3) 

951 rest = m.group(1) 

952 if ( 

953 classify_desc(rest, no_unknown_starts=True) == "other" 

954 and classify_desc(eng, no_unknown_starts=True) == "english" 

955 ): 

956 item = rest 

957 base_roman = rom 

958 lst = base_roman.split(", ") 

959 if ( 

960 len(lst) == 2 

961 and classify_desc(lst[0], no_unknown_starts=True) == "other" 

962 ): 

963 base_alt = lst[0] 

964 base_roman = lst[1] 

965 if base_english: 

966 base_english += "; " + eng 

967 else: 

968 base_english = eng 

969 

970 # Many words have tags or similar descriptions in the beginning 

971 # followed by a colon and one or more linkages (e.g., 

972 # panetella/Finnish) 

973 m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match( 

974 r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", 

975 item, 

976 ) 

977 if m: 

978 desc = m.group(1) 

979 rest = m.group(len(m.groups())) 

980 # Check for certain comma-separated tags combined 

981 # with English text at the beginning or end of a 

982 # comma-separated parenthesized list 

983 lst = split_at_comma_semi(desc, skipped=links) 

984 while len(lst) > 1: 

985 # Check for tags at the beginning 

986 cls = classify_desc(lst[0], no_unknown_starts=True) 

987 if cls == "tags": 

988 if base_qualifier: 

989 base_qualifier += ", " + lst[0] 

990 else: 

991 base_qualifier = lst[0] 

992 lst = lst[1:] 

993 continue 

994 # Check for tags at the end 

995 cls = classify_desc(lst[-1], no_unknown_starts=True) 

996 if cls == "tags": 

997 if base_qualifier: 

998 base_qualifier += ", " + lst[-1] 

999 else: 

1000 base_qualifier = lst[-1] 

1001 lst = lst[:-1] 

1002 continue 

1003 break 

1004 desc = ", ".join(lst) 

1005 

1006 # Sometimes we have e.g. "chemistry (slang)" which are 

1007 # both tags (see "stink"). Handle that case by 

1008 # removing parentheses if the value is still tags. The part with 

1009 # parentheses could be on either side of the colon. 

1010 if "(" in desc: 

1011 x = desc.replace("(", ",").replace(")", ",") 

1012 if classify_desc(x, no_unknown_starts=True) == "tags": 

1013 desc = x 

1014 elif "(" in rest: 

1015 x = rest.replace("(", ",").replace(")", ",") 

1016 if classify_desc(x, no_unknown_starts=True) == "tags": 

1017 rest = desc 

1018 desc = x 

1019 

1020 # See if the prefix should trigger special handling for script 

1021 # character, letter, digit, etc. handling 

1022 if re.search(script_chars_re, desc): 

1023 script_chars = True 

1024 

1025 # Try to determine which side is description and which is 

1026 # the linked term (both orders are widely used in Wiktionary) 

1027 cls = classify_desc(desc, no_unknown_starts=True) 

1028 cls2 = classify_desc(rest, no_unknown_starts=True) 

1029 # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}" 

1030 # .format(desc, cls, rest, cls2)) 

1031 

1032 e1 = wxr.wtp.page_exists(desc) 

1033 e2 = wxr.wtp.page_exists(rest) 

1034 if cls != "tags": 

1035 if ( 

1036 cls2 == "tags" 

1037 or (e1 and not e1) 

1038 or ( 

1039 e1 

1040 and e2 

1041 and cls2 == "english" 

1042 and cls in ("other", "romanization") 

1043 ) 

1044 or ( 

1045 not e1 

1046 and not e2 

1047 and cls2 == "english" 

1048 and cls in ("other", "romanization") 

1049 ) 

1050 ): 

1051 desc, rest = rest, desc # Looks like swapped syntax 

1052 cls = cls2 

1053 if re.search(linkage_paren_ignore_contains_re, desc): 1053 ↛ 1054line 1053 didn't jump to line 1054 because the condition on line 1053 was never true

1054 desc = "" 

1055 # print("linkage colon prefix desc={!r} rest={!r} cls={}" 

1056 # .format(desc, rest, cls)) 

1057 

1058 # Handle the prefix according to its type 

1059 if cls == "tags": 

1060 if base_qualifier: 

1061 base_qualifier += ", " + desc 

1062 else: 

1063 base_qualifier = desc 

1064 item = rest 

1065 elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"): 

1066 if base_english: 1066 ↛ 1067line 1066 didn't jump to line 1067 because the condition on line 1066 was never true

1067 base_english += "; " + base_english 

1068 else: 

1069 base_english = desc 

1070 item = rest 

1071 elif cls in ("english", "taxonomic"): 

1072 if sense: 1072 ↛ 1073line 1072 didn't jump to line 1073 because the condition on line 1072 was never true

1073 sense += "; " + desc 

1074 else: 

1075 sense = desc 

1076 item = rest 

1077 elif desc.isdigit(): 

1078 idx = int(desc) - 1 

1079 if idx >= 0 and idx < len(pos_datas): 

1080 d = pos_datas[idx] 

1081 gl = "; ".join(d.get("glosses", ())) 

1082 if not gl: 1082 ↛ 1083line 1082 didn't jump to line 1083 because the condition on line 1082 was never true

1083 wxr.wtp.debug( 

1084 "parenthesized numeric linkage prefix, " 

1085 "but the referenced sense has no gloss: " 

1086 "{}".format(desc), 

1087 sortid="linkages/355", 

1088 ) 

1089 elif sense: 

1090 sense += "; " + gl 

1091 else: 

1092 sense = gl 

1093 item = rest 

1094 else: 

1095 wxr.wtp.debug( 

1096 "parenthesized numeric linkage prefix, " 

1097 "but there is no sense with such index: {}".format(desc), 

1098 sortid="linkages/365", 

1099 ) 

1100 item = rest 

1101 else: 

1102 wxr.wtp.debug( 

1103 "unrecognized linkage prefix: {} desc={} rest={} " 

1104 "cls={} cls2={} e1={} e2={}".format( 

1105 item, desc, rest, cls, cls2, e1, e2 

1106 ), 

1107 sortid="linkages/371", 

1108 ) 

1109 item = rest 

1110 

1111 base_sense = sense 

1112 

1113 # Check for certain plural tag forms at end of items list, and apply 

1114 # them to all items if found 

1115 m = re.search( 

1116 r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" 

1117 r"characters|symbols|tetragrams|letter names|names|" 

1118 r"female names|male names|proper nouns|contractions|" 

1119 r"nonstandard spellings|verbs|prepositions|postpositions|" 

1120 r"interjections|Abbreviations|abbreviations|variants|" 

1121 r"ordinals|nouns|phrases|adjectives|adverbs|" 

1122 r"augmentatives|pejoratives|compound words|numerals|" 

1123 r"Tally marks|surnames|modern nonstandard spellings)$", 

1124 item, 

1125 ) 

1126 if m: 

1127 suffix = m.group(1) 

1128 if base_qualifier: 

1129 base_qualifier += ", " + suffix 

1130 else: 

1131 base_qualifier = suffix 

1132 item = item[: m.start()] 

1133 

1134 # Certain linkage items have space-separated values. These are 

1135 # generated by, e.g., certain templates 

1136 if base_sense and base_sense.endswith(" paper sizes"): 

1137 base_qualifier = None 

1138 item = ", ".join(item.split()) 

1139 # XXX isn't this now handled by the generic digits/letters/etc code? 

1140 # elif base_qualifier in ("Arabic digits",): 

1141 # item = ", ".join(item.split()) 

1142 

1143 item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item) # Now empty superscript 

1144 item = item.strip() 

1145 if not item: 

1146 return None 

1147 

1148 # Kludge: if the item contains ")/" (with possibly spaces in between), 

1149 # replace it by a comma so it gets split. 

1150 item = re.sub(r"\)\s*/", "), ", item) 

1151 

1152 # The item may contain multiple comma-separated linkages 

1153 if base_roman: 

1154 subitems = [item] 

1155 else: 

1156 # Split at commas. Also, in most cases split by " or ", but this 

1157 # is complicated - "or" may end certain words (e.g., "logical or") 

1158 # and it may separate head-final tags (e.g. "foo f or m"). Also, 

1159 # some words have parenthesized parts in between, e.g., 

1160 # wife/English/Translations/Yiddish: 

1161 # "ווײַב‎ n (vayb) or f, פֿרוי‎ f (froy)" 

1162 subitems = [] 

1163 for item1 in split_at_comma_semi(item, skipped=links): 

1164 if " or " not in item1: 

1165 subitems.append(item1) 

1166 continue 

1167 # Item1 contains " or " 

1168 item2 = re.sub(r"\s*\([^)]*\)", "", item1) 

1169 item2 = re.sub(r"\s+", " ", item2) 

1170 if ( 

1171 ( 

1172 lang not in head_final_bantu_langs 

1173 or not re.search(head_final_bantu_re, item2) 

1174 ) 

1175 and ( 

1176 lang not in head_final_other_langs 

1177 or not re.search(head_final_other_re, item2) 

1178 ) 

1179 and ( 

1180 not re.search(head_final_re, item2) 

1181 or ( 

1182 item2[-1].isdigit() 

1183 and lang not in head_final_numeric_langs 

1184 ) 

1185 ) 

1186 and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE") 

1187 and all( 

1188 wxr.wtp.title not in x.split(" or ") 

1189 for x in split_at_comma_semi(item2, skipped=links) 

1190 if " or " in x 

1191 ) 

1192 ): 

1193 # We can split this item. Split the non-cleaned version 

1194 # that still has any intervening parenthesized parts. 

1195 subitems.extend( 

1196 split_at_comma_semi(item1, extra=[" or "], skipped=links) 

1197 ) 

1198 else: 

1199 subitems.append(item1) 

1200 if len(subitems) > 1: # Would be merged from multiple subitems 

1201 ruby = [] # XXX what is the purpose of this? 

1202 for item1 in subitems: 

1203 if len(subitems) > 1 and item1 in ("...", "…"): 

1204 # Some lists have ellipsis in the middle - don't generate 

1205 # linkages for the ellipsis 

1206 continue 

1207 item1 = item1.strip() 

1208 qualifier = base_qualifier 

1209 sense = base_sense 

1210 parts = [] 

1211 roman = base_roman # Usually None 

1212 alt = base_alt # Usually None 

1213 taxonomic = None 

1214 english = base_english 

1215 

1216 # Some words have derived terms with parenthesized quoted English 

1217 # descriptions, which can sometimes essentially be tags 

1218 # Some word (bleki/Esperanto...) can have parentheses inside 

1219 # the quotes, so let's make this regex even more unreadable. 

1220 m = re.search(r"\s*\(“([^”]+)”\)", item1) 

1221 if m: 1221 ↛ 1222line 1221 didn't jump to line 1222 because the condition on line 1221 was never true

1222 t = m.group(1) 

1223 item1 = (item1[: m.start()] + item1[m.end() :]).strip() 

1224 cls = classify_desc(t) 

1225 if cls == "tags": 

1226 if qualifier: 

1227 qualifier += ", " + t 

1228 else: 

1229 qualifier = t 

1230 else: 

1231 english = t 

1232 

1233 # Some Korean words use "word (alt, roman, “english”)" pattern 

1234 # See 滿/Korean 

1235 m = re.match( 

1236 r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), " 

1237 r'[“”"]([^”“"]+)[“”"]\)$', 

1238 item1, 

1239 ) 

1240 if ( 

1241 m 

1242 and classify_desc(m.group(1), no_unknown_starts=True) == "other" 

1243 and classify_desc(m.group(2), no_unknown_starts=True) == "other" 

1244 ): 

1245 alt = m.group(2) 

1246 roman = m.group(3) 

1247 english = m.group(4) 

1248 item1 = m.group(1) 

1249 

1250 words = item1.split(" ") 

1251 if ( 

1252 len(words) > 1 

1253 and words[0] in linkage_beginning_tags 

1254 and words[0] != wxr.wtp.title 

1255 ): 

1256 t = linkage_beginning_tags[words[0]] 

1257 item1 = " ".join(words[1:]) 

1258 if qualifier: 1258 ↛ 1259line 1258 didn't jump to line 1259 because the condition on line 1258 was never true

1259 qualifier += ", " + t 

1260 else: 

1261 qualifier = t 

1262 

1263 # Extract quoted English translations (there are also other 

1264 # kinds of English translations) 

1265 def english_repl(m: re.Match) -> str: 

1266 nonlocal english 

1267 nonlocal qualifier 

1268 v = m.group(1).strip() 

1269 # If v is "tags: sense", handle the tags 

1270 m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v) 

1271 if m1 is not None: 1271 ↛ 1272line 1271 didn't jump to line 1272 because the condition on line 1271 was never true

1272 desc, rest = m1.groups() 

1273 if classify_desc(desc, no_unknown_starts=True) == "tags": 

1274 if qualifier: 

1275 qualifier += ", " + desc 

1276 else: 

1277 qualifier = desc 

1278 v = rest 

1279 if english: 

1280 english += "; " + v 

1281 else: 

1282 english = v 

1283 return "" 

1284 

1285 item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip() 

1286 

1287 # There could be multiple parenthesized parts, and 

1288 # sometimes both at the beginning and at the end. 

1289 # And sometimes even in the middle, as in e.g. 

1290 # wife/English/Translations/Yiddish 

1291 while not script_chars and ( 

1292 not sense or not re.search(script_chars_re, sense) 

1293 ): 

1294 par = None 

1295 nonfirst_par = False 

1296 if par is None: 1296 ↛ 1313line 1296 didn't jump to line 1313 because the condition on line 1296 was always true

1297 # Try to find a parenthesized part from the beginning. 

1298 m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1) 

1299 if m: 

1300 par = m.group(1) 

1301 item1 = item1[m.end() :] 

1302 else: 

1303 # Try to find a parenthesized part at the end or from the 

1304 # middle. 

1305 m = re.search( 

1306 r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?", 

1307 item1, 

1308 ) 

1309 if m: 

1310 par = m.group(1) 

1311 item1 = item1[: m.start()] + item1[m.end() :] 

1312 nonfirst_par = True 

1313 if not par: 

1314 break 

1315 if re.search(linkage_paren_ignore_contains_re, par): 

1316 continue # Skip these linkage descriptors 

1317 par = par.strip() 

1318 # Handle tags from beginning of par. We also handle "other" 

1319 # here as Korean entries often have Hanja form in the 

1320 # beginning of parenthesis, before romanization. Similar 

1321 # for many Japanese entries. 

1322 while par: 1322 ↛ 1343line 1322 didn't jump to line 1343 because the condition on line 1322 was always true

1323 idx = par.find(",") 

1324 if idx <= 0: 

1325 break 

1326 cls = classify_desc(par[:idx], no_unknown_starts=True) 

1327 if cls == "other" and not alt: 1327 ↛ 1328line 1327 didn't jump to line 1328 because the condition on line 1327 was never true

1328 alt = par[:idx] 

1329 elif cls == "taxonomic": 1329 ↛ 1330line 1329 didn't jump to line 1330 because the condition on line 1329 was never true

1330 taxonomic = par[:idx] 

1331 elif cls == "tags": 

1332 if qualifier: 

1333 qualifier += ", " + par[:idx] 

1334 else: 

1335 qualifier = par[:idx] 

1336 else: 

1337 break 

1338 par = par[idx + 1 :].strip() 

1339 

1340 # Check for certain comma-separated tags combined 

1341 # with English text at the beginning or end of a 

1342 # comma-separated parenthesized list 

1343 lst = par.split(",") if len(par) > 1 else [par] 

1344 lst = list(x.strip() for x in lst if x.strip()) 

1345 while len(lst) > 1: 

1346 cls = classify_desc(lst[0], no_unknown_starts=True) 

1347 if cls == "tags": 1347 ↛ 1348line 1347 didn't jump to line 1348 because the condition on line 1347 was never true

1348 if qualifier: 

1349 qualifier += ", " + lst[0] 

1350 else: 

1351 qualifier = lst[0] 

1352 lst = lst[1:] 

1353 continue 

1354 cls = classify_desc(lst[-1], no_unknown_starts=True) 

1355 if cls == "tags": 

1356 if qualifier: 

1357 qualifier += ", " + lst[-1] 

1358 else: 

1359 qualifier = lst[-1] 

1360 lst = lst[:-1] 

1361 continue 

1362 break 

1363 par = ", ".join(lst) 

1364 

1365 # Handle remaining types 

1366 if not par: 1366 ↛ 1367line 1366 didn't jump to line 1367 because the condition on line 1366 was never true

1367 continue 

1368 if re.search(script_chars_re, par): 

1369 script_chars = True 

1370 if classify_desc(par, no_unknown_starts=True) == "tags": 1370 ↛ 1380line 1370 didn't jump to line 1380 because the condition on line 1370 was always true

1371 if base_qualifier: 1371 ↛ 1372line 1371 didn't jump to line 1372 because the condition on line 1371 was never true

1372 base_qualifier += "; " + par 

1373 else: 

1374 base_qualifier = par 

1375 if qualifier: 1375 ↛ 1376line 1375 didn't jump to line 1376 because the condition on line 1375 was never true

1376 qualifier += "; " + par 

1377 else: 

1378 qualifier = par 

1379 else: 

1380 if base_sense: 

1381 base_sense += "; " + par 

1382 else: 

1383 base_sense = par 

1384 if sense: 

1385 sense += "; " + par 

1386 else: 

1387 sense = par 

1388 elif par.endswith(" letter names"): 1388 ↛ 1389line 1388 didn't jump to line 1389 because the condition on line 1388 was never true

1389 if base_qualifier: 

1390 base_qualifier += "; " + par 

1391 else: 

1392 base_qualifier = par 

1393 if qualifier: 

1394 qualifier += "; " + par 

1395 else: 

1396 qualifier = par 

1397 else: 

1398 cls = classify_desc(par) 

1399 # print("classify_desc: {!r} -> {}".format(par, cls)) 

1400 if cls == "tags": 

1401 if qualifier: 

1402 qualifier += ", " + par 

1403 else: 

1404 qualifier = par 

1405 elif cls == "english": 

1406 if nonfirst_par: 

1407 if english: 

1408 english += "; " + par 

1409 else: 

1410 english = par 

1411 else: 

1412 if sense: 1412 ↛ 1413line 1412 didn't jump to line 1413 because the condition on line 1412 was never true

1413 sense += "; " + par 

1414 else: 

1415 sense = par 

1416 elif cls == "romanization": 

1417 roman = par 

1418 elif cls == "taxonomic": 

1419 taxonomic = par 

1420 elif par.isdigit(): 

1421 idx = int(par) - 1 

1422 if idx >= 0 and idx < len(pos_datas): 

1423 d = pos_datas[idx] 

1424 gl = "; ".join(d.get("glosses", ())) 

1425 if not gl: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true

1426 wxr.wtp.debug( 

1427 "parenthesized number " 

1428 "but the referenced sense has no " 

1429 "gloss: {}".format(par), 

1430 sortid="linkages/665", 

1431 ) 

1432 elif sense: 1432 ↛ 1435line 1432 didn't jump to line 1435 because the condition on line 1432 was always true

1433 sense += "; " + gl 

1434 else: 

1435 sense = gl 

1436 else: 

1437 wxr.wtp.debug( 

1438 "parenthesized number but there is " 

1439 "no sense with such index: {}".format(par), 

1440 sortid="linkages/674", 

1441 ) 

1442 else: 

1443 if alt: 1443 ↛ 1444line 1443 didn't jump to line 1444 because the condition on line 1443 was never true

1444 alt += "; " + par 

1445 else: 

1446 alt = par 

1447 

1448 # Handle certain special cases, unless we are parsing 

1449 # script characters. 

1450 if not script_chars: 

1451 # Ignore all linkages with certain prefixes, suffixes, or parts 

1452 # (this is done after removing certain prefixes and suffixes) 

1453 if re.search(linkage_ignore_re, item1): 

1454 continue # Ignore linkages with certain prefixes 

1455 

1456 # Remove certain prefixes from linkages 

1457 m = re.match(linkage_remove_prefixes_re, item1) 

1458 if m: 

1459 prefix = item1[: m.end()] 

1460 item1 = item1[m.end() :] 

1461 if prefix in linkage_remove_prefixes_tags: 

1462 if qualifier: 

1463 qualifier += ", " + linkage_remove_prefixes_tags[prefix] 

1464 else: 

1465 qualifier = linkage_remove_prefixes_tags[prefix] 

1466 # Recheck ignored linkages 

1467 if re.search(linkage_ignore_re, item1): 

1468 continue 

1469 

1470 # Remove certain suffixes from linkages 

1471 m = re.search(linkage_remove_suffixes_re, item1) 

1472 if m: 

1473 item1 = item1[: m.start()] 

1474 

1475 # Parse linkages with "value = english" syntax (e.g., 

1476 # väittää/Finnish) 

1477 idx = item1.find(" = ") 

1478 if idx >= 0: 

1479 eng = item1[idx + 3 :] 

1480 if classify_desc(eng, no_unknown_starts=True) == "english": 

1481 english = eng 

1482 item1 = item1[:idx] 

1483 else: 

1484 # Some places seem to use it reversed 

1485 # "english = value" 

1486 eng = item1[:idx] 

1487 if classify_desc(eng, no_unknown_starts=True) == "english": 

1488 english = eng 

1489 item1 = item1[idx + 3 :] 

1490 

1491 # Parse linkages with "value - english" syntax (e.g., 

1492 # man/Faroese) 

1493 m = re.search(r" [-‐‑‒–—―] ", item1) 

1494 if m and "(" not in item1: 

1495 suffix = item1[m.end() :] 

1496 cls = classify_desc(suffix, no_unknown_starts=True) 

1497 if cls == "english": 

1498 # This case intentionally ignores old values from english 

1499 # (otherwise taxonomic lists fail) 

1500 english = suffix 

1501 item1 = item1[: m.start()] 

1502 elif cls == "tags": 

1503 if qualifier: 1503 ↛ 1504line 1503 didn't jump to line 1504 because the condition on line 1503 was never true

1504 qualifier += ", " + suffix 

1505 else: 

1506 qualifier = suffix 

1507 item1 = item1[: m.start()] 

1508 

1509 # Parse certain tags at the end of the linked term (unless 

1510 # we are in a letters list) 

1511 item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1) 

1512 if q: 

1513 if qualifier: 1513 ↛ 1514line 1513 didn't jump to line 1514 because the condition on line 1513 was never true

1514 qualifier += ", " + ", ".join(q) 

1515 else: 

1516 qualifier = ", ".join(q) 

1517 

1518 m = re.search(linkage_truncate_re, item1) 

1519 if m: 1519 ↛ 1521line 1519 didn't jump to line 1521 because the condition on line 1519 was never true

1520 # suffix = item1[m.start():] # Currently ignored 

1521 item1 = item1[: m.start()] 

1522 if not item1: 

1523 continue # Ignore empty link targets 

1524 if item1 == word: 

1525 continue # Ignore self-links 

1526 

def add(w: str, r: Optional[str]) -> None:
    """Record ``w`` (with optional romanization ``r``) as one linkage entry.

    The entry is assembled from the enclosing scope's current ``qualifier``,
    ``sense``, ``ruby``, ``english``, ``taxonomic``, ``alt`` and ``urls`` and
    is appended to ``data[field]`` unless an equal entry is already present.
    A word that consists of several solidus-separated alternatives is split
    and every alternative is added through a recursive call.
    """
    assert isinstance(w, str)
    assert r is None or isinstance(r, str)

    # Work on per-call copies.  Rebinding the enclosing ``alt`` and
    # ``taxonomic`` from here would leak into later add() calls made for the
    # same item (e.g. only the first of several alternatives would get the
    # "extinct" tag, or the roman -> alt move would apply to the first only).
    alt_form = alt
    taxon = taxonomic

    # We remove "*" from the beginning of reconstruction linkages.
    # Such linkages should only occur in reconstruction senses, so
    # this should not cause ambiguity.
    if is_reconstruction and w.startswith("*"):
        w = w[1:]

    # Check if the word contains the Fullwidth Solidus (U+FF0F), and if
    # so, split by it and treat the results as alternative linkages.
    # (This is very commonly used for alternative written forms in
    # Chinese compounds and other linkages.)  However, if the word
    # contains a comma, then we won't split as this is used when we have
    # a different number of romanizations than written forms, and don't
    # know which is which.
    if (
        (not w or "," not in w)
        and (not r or "," not in r)
        and not wxr.wtp.page_exists(w)
    ):
        # The delimiter is written as an escape so it cannot be confused
        # with (or normalized into) the ASCII solidus handled below.
        lst = w.split("\uff0f") if len(w) > 1 else [w]
        if len(lst) == 1:
            lst = w.split(" / ")
        # A bare ASCII "/" is only treated as a separator in longer words.
        if len(lst) == 1 and len(lst[0]) >= 6:
            lst = w.split("/")
        if len(lst) > 1:
            # Treat each alternative as separate linkage
            for alternative in lst:
                add(alternative, r)
            return None

    # Heuristically remove "." at the end of most linkages
    # (some linkage lists end in a period, but we also have
    # abbreviations that end with a period that should be kept)
    if (
        w.endswith(".")
        and not wxr.wtp.page_exists(w)
        and (
            wxr.wtp.page_exists(w[:-1])
            or (len(w) >= 5 and "." not in w[:-1])
        )
    ):
        w = w[:-1]

    # If we have roman but not alt and the word is ASCII,
    # move roman to alt.
    if r and not alt_form and w.isascii():
        alt_form = r
        r = None

    # Build the linkage entry
    dt: LinkageData = {}
    if qualifier:
        parse_sense_qualifier(wxr, qualifier, dt)
    if sense:
        dt["sense"] = sense.strip()
    if r:
        dt["roman"] = r.strip()
    if ruby:
        dt["ruby"] = ruby
    if english:
        dt["english"] = english.strip()  # DEPRECATED for "translation"
        dt["translation"] = english.strip()
    if taxon:
        if re.match(r"×[A-Z]", taxon):
            data_append(dt, "tags", "extinct")
            taxon = taxon[1:]
        dt["taxonomic"] = taxon
    if re.match(r"×[A-Z]", w):
        data_append(dt, "tags", "extinct")
        w = w[1:]  # Remove × before dead species names
    if alt_form and re.match(r"×[A-Z]", alt_form):
        data_append(dt, "tags", "extinct")
        alt_form = alt_form[1:]  # Remove × before dead species names
    if alt_form and alt_form.strip() != w:
        dt["alt"] = alt_form.strip()
    if urls:
        dt["urls"] = [
            url.strip() for url in urls if url and isinstance(url, str)
        ]
    dt["word"] = w

    # Only store the entry if an equal one has not been stored already.
    existing = data.get(field, ())  # type: ignore[attr-defined]
    if not any(dt == old for old in existing):
        data_append(data, field, dt)

1618 

1619 # Handle exceptional linkage splits and other linkage 

1620 # conversions (including expanding to variant forms) 

1621 if item1 in linkage_split_exceptions: 1621 ↛ 1622line 1621 didn't jump to line 1622 because the condition on line 1621 was never true

1622 for item2 in linkage_split_exceptions[item1]: 

1623 add(item2, roman) 

1624 continue 

1625 

1626 # Various templates for letters in scripts use spaces as 

1627 # separators and also have multiple characters without 

1628 # spaces consecutively. 

1629 v = sense or qualifier 

1630 # print("lang={} v={} script_chars={} item1={!r}" 

1631 # .format(wxr.wtp.section, v, script_chars, item1)) 

1632 if v and script_chars: 

1633 if ( 

1634 len(item1.split()) > 1 

1635 or len(list(re.finditer(unicode_dc_re, item1))) == 2 

1636 or (len(subitems) > 10 and v in ("Hiragana", "Katakana")) 

1637 ): 

1638 if v == qualifier: 

1639 # if sense: 

1640 # sense += "; " + qualifier 

1641 # else: 

1642 # sense = qualifier 

1643 qualifier = None 

1644 if re.search(r" (letters|digits|script)$", v): 

1645 qualifier = v # Also parse as qualifier 

1646 elif re.search( 1646 ↛ 1653line 1646 didn't jump to line 1653 because the condition on line 1646 was always true

1647 r"Variations of letter |" 

1648 r"Letters using |" 

1649 r"Letters of the ", 

1650 v, 

1651 ): 

1652 qualifier = "letter" 

1653 parts = item1.split(". ") 

1654 extra: Sequence[str] = () 

1655 if len(parts) > 1: 1655 ↛ 1656line 1655 didn't jump to line 1656 because the condition on line 1655 was never true

1656 extra = parts[1:] 

1657 item1 = parts[0] 

1658 # Handle multi-character names for chars in language's 

1659 # alphabet, e.g., "Ny ny" in P/Hungarian. 

1660 if ( 

1661 len(subitems) > 20 

1662 and len(item1.split()) == 2 

1663 and all(len(x) <= 3 for x in item1.split()) 

1664 ): 

1665 parts = list( 

1666 m.group(0) 

1667 for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1) 

1668 if not m.group(0).isspace() 

1669 and m.group(0) not in ("(", ")") 

1670 ) 

1671 else: 

1672 parts = list( 

1673 m.group(0) 

1674 for m in re.finditer(r".[\u0300-\u036f]?", item1) 

1675 if not m.group(0).isspace() 

1676 and m.group(0) not in ("(", ")") 

1677 ) 

1678 for e in extra: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the loop on line 1678 never started

1679 idx = e.find(":") 

1680 if idx >= 0: 

1681 e = e[idx + 1 :].strip() 

1682 if e.endswith("."): 

1683 e = e[:-1] 

1684 parts.extend(e.split()) 

1685 

1686 # XXX this is not correct - see P/Vietnamese 

1687 # While some sequences have multiple consecutive 

1688 # characters, others use pairs and some have 

1689 # 2/3 character names, e.g., "Ng ng". 

1690 

1691 rparts: Optional[list[Optional[str]]] = None 

1692 if roman: 1692 ↛ 1693line 1692 didn't jump to line 1693 because the condition on line 1692 was never true

1693 rparts = list( 

1694 m.group(0) 

1695 for m in re.finditer(r".[\u0300-\u036f]", roman) 

1696 if not m.group(0).isspace() 

1697 ) 

1698 if len(rparts) != len(parts): 

1699 rparts = None 

1700 if not rparts: 1700 ↛ 1703line 1700 didn't jump to line 1703 because the condition on line 1700 was always true

1701 rparts = [None] * len(parts) 

1702 

1703 for w, r in zip(parts, rparts): 

1704 add(w, r) 

1705 continue 

1706 

1707 add(item1, roman) 

1708 return None