Coverage for src/wiktextract/extractor/en/linkages.py: 83%

517 statements  

coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

# Code related to parsing linkages (synonyms, hypernyms, related terms, etc.)
#
# Copyright (c) 2019-2021 Tatu Ylonen.  See file LICENSE and https://ylonen.org

import re
import unicodedata
from typing import Optional, Sequence

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import clean_node
from ...tags import linkage_beginning_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    head_final_bantu_langs,
    head_final_bantu_re,
    head_final_numeric_langs,
    head_final_other_langs,
    head_final_other_re,
    head_final_re,
    parse_head_final_tags,
    parse_sense_qualifier,
)
from .type_utils import FormData, LinkageData, WordData

# Linkage will be ignored if it matches this regexp before splitting
linkage_pre_split_ignore_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in [
            "For more variations, see ",
            "Signal flag:",
            "Semaphore:",
        ]
    )
    + r")"
)
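
# For illustration (hypothetical inputs, not from Wiktionary): the pattern is
# anchored at the start, so an item like "Signal flag: Alfa" would be dropped
# wholesale, while "alpha, beta" would pass through to normal splitting.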

# Linkage will be ignored if it has one of these prefixes
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# Linkage will be ignored if it has any of these suffixes
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# Linkage will be ignored if it is one of these (with full match)
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# Linkage will be ignored if it matches this regexp
linkage_ignore_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in linkage_ignore_whole)
    + r")$|^("
    + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
    + r")|("
    + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
    + r")$"
)
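
# Sketch of the combined behavior (hypothetical inputs): "etc." hits the
# whole-string branch, "Appendix:Gestures" the prefix branch, and
# "see Wikipedia" the suffix branch; all three items would be ignored.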

# These prefixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
linkage_remove_prefixes_re = re.compile(
    r"^("
    + r"|".join(
        re.escape(x)
        for x in [
            ":",
            "see Thesaurus:",
            "See Thesaurus:",
            "see also Thesaurus:",
            "See also Thesaurus:",
            "see also ",
            "See also ",
            "see ",
            "See ",
            "from ",
            "abbreviation of ",
            "ISO 639-1 code ",
            "ISO 639-3 code ",
            "Thesaurus:",
        ]
    )
    + ")"
)

# When removing a prefix from a linkage, this dictionary can be used to map
# the removed prefix to a space-separated list of tags to add
linkage_remove_prefixes_tags = {
    "abbreviation of ": "abbreviation",
}
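
# E.g. (illustrative, hypothetical linkage): "abbreviation of DIY" is first
# trimmed to "DIY" by the regexp above, and this mapping then adds the tag
# "abbreviation" to the resulting linkage.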

# These suffixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
    r"\s*[-–] Pre-reform orthography.*)"
    r"$"
)
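
# E.g. (hypothetical), "Vulpes vulpes on Wikispecies" would be truncated to
# "Vulpes vulpes" before further parsing.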

# Ignore linkage parenthesized sections that contain one of these strings
linkage_paren_ignore_contains_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in [
            "from Etymology",
            "used as",
            "usage notes",
        ]
    )
    + ")([, ]|$)"
)

taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
for k, v in list(taxonomic_ending_map.items()):
    taxonomic_ending_map[v] = v  # Also add singular -> singular
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+({})$".format(
        "|".join(re.escape(x) for x in taxonomic_ending_map)
    )
)
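
# Illustrative (hypothetical) list item: "Abies, Picea - genera" ends in a
# recognized plural level, so the singular "genus" is attached to every
# linkage split from the list.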

# Exceptional splits for linkages.  This can be used to fix particular
# linkages that are not handled correctly by the default code.  This can
# also be used to create automatic aliases, e.g., for mapping "..." and "…"
# to both.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

# Truncate linkage word if it matches any of these strings
linkage_truncate_re = re.compile(
    "|".join(
        re.escape(x)
        for x in [
            " and its derived terms",
            " UTF-16 0x214C",
        ]
    )
)

# Regexp for identifying special linkages containing lists of letters,
# digits, or characters
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Punctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)

# Matches a Unicode character together with any combining diacritics (even
# if represented as separate characters)
unicode_dc_re = re.compile(
    r"\w[{}]|.".format(
        "".join(
            chr(x)
            for x in range(0, 0x110000)
            if unicodedata.category(chr(x)) == "Mn"
        )
    )
)
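
# Sketch (hypothetical): for "e" followed by combining acute U+0301, this
# pattern yields one match covering both codepoints, where a plain "." would
# separate the letter from its diacritic.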


def parse_linkage_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    field: str,
    item: str,
    sense: Optional[str],
    ruby: list,
    pos_datas: list,
    is_reconstruction: bool,
    urls: Optional[list[str]] = None,
    links: Optional[list[str]] = None,
) -> Optional[str]:
    """Parses a linkage item once it has been converted to a string.  This
    may add one or more linkages to ``data`` under ``field``.  This
    returns None or a string that contains tags that should be applied
    to additional linkages (commonly used in tables for Asian characters)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)  # Main word (derived from page title)
    assert isinstance(data, dict)  # Parsed linkages are stored here under field
    assert isinstance(field, str)  # The field under which to store linkage
    assert isinstance(item, str)  # The string to parse
    assert sense is None or isinstance(sense, str)
    assert isinstance(ruby, list)  # Captured ruby (hiragana/katakana) or ""
    assert isinstance(pos_datas, list)  # List of senses (containing "glosses")
    assert urls is None or isinstance(urls, list)  # Captured urls
    assert is_reconstruction in (True, False)

    item = item.replace("()", "")
    item = re.sub(r"\s+", " ", item)
    item = item.strip()

    base_roman = None
    base_alt = None
    base_english = None
    script_chars = False
    base_qualifier = None
    lang = wxr.wtp.section

    # If ``sense`` can be parsed as tags, treat it as tags instead
    if sense:
        cls = classify_desc(sense, no_unknown_starts=True)
        if cls == "tags":
            base_qualifier = sense
            sense = None

    # Check if this item is a stand-alone sense (or tag) specifier
    # for following items (e.g., commonly in a table, see 滿)
    m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
    if m:
        return m.group(1)
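
    # E.g. (hypothetical), a table cell containing just "(archaic):" makes
    # the match above succeed; "archaic" is returned so the caller can apply
    # it to the items that follow.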

    # Check for pre-split ignored linkages using the appropriate regexp
    if re.search(linkage_pre_split_ignore_re, item):
        return None

    # print("  LINKAGE ITEM: {}: {} (sense {})"
    #       .format(field, item, sense))

    # Replace occurrences of ~ in the item by the page title
    safetitle = wxr.wtp.title.replace("\\", "\\\\")  # type: ignore[union-attr]
    item = item.replace(" ~ ", " " + safetitle + " ")
    item = re.sub(r"^~ ", safetitle + " ", item)
    item = re.sub(r" ~$", " " + safetitle, item)
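
    # E.g. (hypothetical), on the page "run" an item "~ out" becomes
    # "run out" after these substitutions.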

    # Many taxonomic terms contain hyponym lists that end with the
    # kind of the hyponym (a taxonomic level in plural).  Recognize
    # such and add the term in singular to all linkages in the list.
    m = re.search(taxonomic_ending_re, item)
    if m:
        base_english = taxonomic_ending_map[m.group(1)]
        item = item[: m.start()]

    # Some Korean and Japanese words use a "word (romanized): english"
    # pattern.  Sometimes the parenthesized part contains comma-separated
    # alt and roman.
    m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
    if m:
        rom = m.group(2)
        eng = m.group(3)
        rest = m.group(1)
        if (
            classify_desc(rest, no_unknown_starts=True) == "other"
            and classify_desc(eng, no_unknown_starts=True) == "english"
        ):
            item = rest
            base_roman = rom
            lst = base_roman.split(", ")
            if (
                len(lst) == 2
                and classify_desc(lst[0], no_unknown_starts=True) == "other"
            ):
                base_alt = lst[0]
                base_roman = lst[1]
            if base_english:
                base_english += "; " + eng
            else:
                base_english = eng
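
    # Illustrative (hypothetical) item: "사람 (saram): person" would typically
    # leave item = "사람" with base_roman = "saram" and base_english = "person".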

    # Many words have tags or similar descriptions in the beginning
    # followed by a colon and one or more linkages (e.g.,
    # panetella/Finnish)
    m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
        r"^([a-zA-Z][-'a-zA-Z0-9 ]*(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
        item,
    )
    if m:
        desc = m.group(1)
        rest = m.group(len(m.groups()))
        # Check for certain comma-separated tags combined
        # with English text at the beginning or end of a
        # comma-separated parenthesized list
        lst = split_at_comma_semi(desc, skipped=links)
        while len(lst) > 1:
            # Check for tags at the beginning
            cls = classify_desc(lst[0], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[0]
                else:
                    base_qualifier = lst[0]
                lst = lst[1:]
                continue
            # Check for tags at the end
            cls = classify_desc(lst[-1], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[-1]
                else:
                    base_qualifier = lst[-1]
                lst = lst[:-1]
                continue
            break
        desc = ", ".join(lst)

        # Sometimes we have e.g. "chemistry (slang)" where both parts are
        # tags (see "stink").  Handle that case by removing parentheses if
        # the value is still tags.  The part with parentheses could be on
        # either side of the colon.
        if "(" in desc:
            x = desc.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                desc = x
        elif "(" in rest:
            x = rest.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                rest = desc
                desc = x

        # See if the prefix should trigger special handling for script
        # characters, letters, digits, etc.
        if re.search(script_chars_re, desc):
            script_chars = True

        # Try to determine which side is description and which is
        # the linked term (both orders are widely used in Wiktionary)
        cls = classify_desc(desc, no_unknown_starts=True)
        cls2 = classify_desc(rest, no_unknown_starts=True)
        # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
        #       .format(desc, cls, rest, cls2))

        e1 = wxr.wtp.page_exists(desc)
        e2 = wxr.wtp.page_exists(rest)
        if cls != "tags":
            if (
                cls2 == "tags"
                or (e1 and not e2)
                or (
                    e1
                    and e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
                or (
                    not e1
                    and not e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
            ):
                desc, rest = rest, desc  # Looks like swapped syntax
                cls = cls2
        if re.search(linkage_paren_ignore_contains_re, desc):
            desc = ""
        # print("linkage colon prefix desc={!r} rest={!r} cls={}"
        #       .format(desc, rest, cls))

        # Handle the prefix according to its type
        if cls == "tags":
            if base_qualifier:
                base_qualifier += ", " + desc
            else:
                base_qualifier = desc
            item = rest
        elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
            if base_english:
                base_english += "; " + desc
            else:
                base_english = desc
            item = rest
        elif cls in ("english", "taxonomic"):
            if sense:
                sense += "; " + desc
            else:
                sense = desc
            item = rest
        elif desc.isdigit():
            idx = int(desc) - 1
            if idx >= 0 and idx < len(pos_datas):
                d = pos_datas[idx]
                gl = "; ".join(d.get("glosses", ()))
                if not gl:
                    wxr.wtp.debug(
                        "parenthesized numeric linkage prefix, "
                        "but the referenced sense has no gloss: "
                        "{}".format(desc),
                        sortid="linkages/355",
                    )
                elif sense:
                    sense += "; " + gl
                else:
                    sense = gl
                item = rest
            else:
                wxr.wtp.debug(
                    "parenthesized numeric linkage prefix, "
                    "but there is no sense with such index: {}".format(desc),
                    sortid="linkages/365",
                )
                item = rest
        else:
            wxr.wtp.debug(
                "unrecognized linkage prefix: {} desc={} rest={} "
                "cls={} cls2={} e1={} e2={}".format(
                    item, desc, rest, cls, cls2, e1, e2
                ),
                sortid="linkages/371",
            )
            item = rest

    base_sense = sense

    # Check for certain plural tag forms at end of items list, and apply
    # them to all items if found
    m = re.search(
        r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
        r"characters|symbols|tetragrams|letter names|names|"
        r"female names|male names|proper nouns|contractions|"
        r"nonstandard spellings|verbs|prepositions|postpositions|"
        r"interjections|Abbreviations|abbreviations|variants|"
        r"ordinals|nouns|phrases|adjectives|adverbs|"
        r"augmentatives|pejoratives|compound words|numerals|"
        r"Tally marks|surnames|modern nonstandard spellings)$",
        item,
    )
    if m:
        suffix = m.group(1)
        if base_qualifier:
            base_qualifier += ", " + suffix
        else:
            base_qualifier = suffix
        item = item[: m.start()]

    # Certain linkage items have space-separated values.  These are
    # generated by, e.g., certain templates
    if base_sense and base_sense.endswith(" paper sizes"):
        base_qualifier = None
        item = ", ".join(item.split())
    # XXX isn't this now handled by the generic digits/letters/etc code?
    # elif base_qualifier in ("Arabic digits",):
    #     item = ", ".join(item.split())

    item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item)  # Now empty superscript
    item = item.strip()
    if not item:
        return None

    # Kludge: if the item contains ")/" (with possibly spaces in between),
    # replace it by a comma so it gets split.
    item = re.sub(r"\)\s*/", "), ", item)

    # The item may contain multiple comma-separated linkages
    if base_roman:
        subitems = [item]
    else:
        # Split at commas.  Also, in most cases split by " or ", but this
        # is complicated - "or" may end certain words (e.g., "logical or")
        # and it may separate head-final tags (e.g. "foo f or m").  Also,
        # some words have parenthesized parts in between, e.g.,
        # wife/English/Translations/Yiddish:
        #   "ווײַב‎ n (vayb) or f, פֿרוי‎ f (froy)"
        subitems = []
        for item1 in split_at_comma_semi(item, skipped=links):
            if " or " not in item1:
                subitems.append(item1)
                continue
            # Item1 contains " or "
            item2 = re.sub(r"\s*\([^)]*\)", "", item1)
            item2 = re.sub(r"\s+", " ", item2)
            if (
                (
                    lang not in head_final_bantu_langs
                    or not re.search(head_final_bantu_re, item2)
                )
                and (
                    lang not in head_final_other_langs
                    or not re.search(head_final_other_re, item2)
                )
                and (
                    not re.search(head_final_re, item2)
                    or (
                        item2[-1].isdigit()
                        and lang not in head_final_numeric_langs
                    )
                )
                and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
                and all(
                    wxr.wtp.title not in x.split(" or ")
                    for x in split_at_comma_semi(item2, skipped=links)
                    if " or " in x
                )
            ):
                # We can split this item.  Split the non-cleaned version
                # that still has any intervening parenthesized parts.
                subitems.extend(
                    split_at_comma_semi(item1, extra=[" or "], skipped=links)
                )
            else:
                subitems.append(item1)
    if len(subitems) > 1:  # Would be merged from multiple subitems
        ruby = []  # XXX what is the purpose of this?
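
    # Sketch of the resulting split (hypothetical): "hot, cold or warm"
    # becomes ["hot", "cold", "warm"], while something like "foo f or m"
    # stays whole because " or " there separates head-final gender tags.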

    for item1 in subitems:
        if len(subitems) > 1 and item1 in ("...", "…"):
            # Some lists have ellipsis in the middle - don't generate
            # linkages for the ellipsis
            continue
        item1 = item1.strip()
        qualifier = base_qualifier
        sense = base_sense
        parts = []
        roman = base_roman  # Usually None
        alt = base_alt  # Usually None
        taxonomic = None
        english = base_english

        # Some words have derived terms with parenthesized quoted English
        # descriptions, which can sometimes essentially be tags.
        # Some words (bleki/Esperanto...) can have parentheses inside
        # the quotes, so let's make this regex even more unreadable.
        m = re.search(r"\s*\(“([^”]+)”\)", item1)
        if m:
            t = m.group(1)
            item1 = (item1[: m.start()] + item1[m.end() :]).strip()
            cls = classify_desc(t)
            if cls == "tags":
                if qualifier:
                    qualifier += ", " + t
                else:
                    qualifier = t
            else:
                english = t

        # Some Korean words use a "word (alt, roman, “english”)" pattern
        # (see 滿/Korean)
        m = re.match(
            r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
            r'[“”"]([^”“"]+)[“”"]\)$',
            item1,
        )
        if (
            m
            and classify_desc(m.group(1), no_unknown_starts=True) == "other"
            and classify_desc(m.group(2), no_unknown_starts=True) == "other"
        ):
            alt = m.group(2)
            roman = m.group(3)
            english = m.group(4)
            item1 = m.group(1)

        words = item1.split(" ")
        if (
            len(words) > 1
            and words[0] in linkage_beginning_tags
            and words[0] != wxr.wtp.title
        ):
            t = linkage_beginning_tags[words[0]]
            item1 = " ".join(words[1:])
            if qualifier:
                qualifier += ", " + t
            else:
                qualifier = t

        # Extract quoted English translations (there are also other
        # kinds of English translations)
        def english_repl(m: re.Match) -> str:
            nonlocal english
            nonlocal qualifier
            v = m.group(1).strip()
            # If v is "tags: sense", handle the tags
            m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
            if m1 is not None:
                desc, rest = m1.groups()
                if classify_desc(desc, no_unknown_starts=True) == "tags":
                    if qualifier:
                        qualifier += ", " + desc
                    else:
                        qualifier = desc
                    v = rest
            if english:
                english += "; " + v
            else:
                english = v
            return ""

        item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()
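
        # E.g. (hypothetical), item1 = 'perro "dog"' leaves item1 = "perro"
        # with english = "dog"; a quoted "slang: mutt" would additionally
        # move "slang" into the qualifier.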

        # There could be multiple parenthesized parts, and
        # sometimes both at the beginning and at the end.
        # And sometimes even in the middle, as in e.g.
        # wife/English/Translations/Yiddish
        while not script_chars and (
            not sense or not re.search(script_chars_re, sense)
        ):
            par = None
            nonfirst_par = False
            if par is None:
                # Try to find a parenthesized part from the beginning.
                m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
                if m:
                    par = m.group(1)
                    item1 = item1[m.end() :]
                else:
                    # Try to find a parenthesized part at the end or from the
                    # middle.
                    m = re.search(
                        r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)(\.$)?",
                        item1,
                    )
                    if m:
                        par = m.group(1)
                        item1 = item1[: m.start()] + item1[m.end() :]
                        nonfirst_par = True
            if not par:
                break
            if re.search(linkage_paren_ignore_contains_re, par):
                continue  # Skip these linkage descriptors
            par = par.strip()
            # Handle tags from beginning of par.  We also handle "other"
            # here as Korean entries often have Hanja form in the
            # beginning of parenthesis, before romanization.  Similar
            # for many Japanese entries.
            while par:
                idx = par.find(",")
                if idx <= 0:
                    break
                cls = classify_desc(par[:idx], no_unknown_starts=True)
                if cls == "other" and not alt:
                    alt = par[:idx]
                elif cls == "taxonomic":
                    taxonomic = par[:idx]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + par[:idx]
                    else:
                        qualifier = par[:idx]
                else:
                    break
                par = par[idx + 1 :].strip()

            # Check for certain comma-separated tags combined
            # with English text at the beginning or end of a
            # comma-separated parenthesized list
            lst = par.split(",") if len(par) > 1 else [par]
            lst = list(x.strip() for x in lst if x.strip())
            while len(lst) > 1:
                cls = classify_desc(lst[0], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[0]
                    else:
                        qualifier = lst[0]
                    lst = lst[1:]
                    continue
                cls = classify_desc(lst[-1], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[-1]
                    else:
                        qualifier = lst[-1]
                    lst = lst[:-1]
                    continue
                break
            par = ", ".join(lst)

            # Handle remaining types
            if not par:
                continue
            if re.search(script_chars_re, par):
                script_chars = True
                if classify_desc(par, no_unknown_starts=True) == "tags":
                    if base_qualifier:
                        base_qualifier += "; " + par
                    else:
                        base_qualifier = par
                    if qualifier:
                        qualifier += "; " + par
                    else:
                        qualifier = par
                else:
                    if base_sense:
                        base_sense += "; " + par
                    else:
                        base_sense = par
                    if sense:
                        sense += "; " + par
                    else:
                        sense = par
            elif par.endswith(" letter names"):
                if base_qualifier:
                    base_qualifier += "; " + par
                else:
                    base_qualifier = par
                if qualifier:
                    qualifier += "; " + par
                else:
                    qualifier = par
            else:
                cls = classify_desc(par)
                # print("classify_desc: {!r} -> {}".format(par, cls))
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + par
                    else:
                        qualifier = par
                elif cls == "english":
                    if nonfirst_par:
                        if english:
                            english += "; " + par
                        else:
                            english = par
                    else:
                        if sense:
                            sense += "; " + par
                        else:
                            sense = par
                elif cls == "romanization":
                    roman = par
                elif cls == "taxonomic":
                    taxonomic = par
                elif par.isdigit():
                    idx = int(par) - 1
                    if idx >= 0 and idx < len(pos_datas):
                        d = pos_datas[idx]
                        gl = "; ".join(d.get("glosses", ()))
                        if not gl:
                            wxr.wtp.debug(
                                "parenthesized number "
                                "but the referenced sense has no "
                                "gloss: {}".format(par),
                                sortid="linkages/665",
                            )
                        elif sense:
                            sense += "; " + gl
                        else:
                            sense = gl
                    else:
                        wxr.wtp.debug(
                            "parenthesized number but there is "
                            "no sense with such index: {}".format(par),
                            sortid="linkages/674",
                        )
                else:
                    if alt:
                        alt += "; " + par
                    else:
                        alt = par

        # Handle certain special cases, unless we are parsing
        # script characters.
        if not script_chars:
            # Ignore all linkages with certain prefixes, suffixes, or parts
            # (this is done after removing certain prefixes and suffixes)
            if re.search(linkage_ignore_re, item1):
                continue  # Ignore linkages with certain prefixes

            # Remove certain prefixes from linkages
            m = re.match(linkage_remove_prefixes_re, item1)
            if m:
                prefix = item1[: m.end()]
                item1 = item1[m.end() :]
                if prefix in linkage_remove_prefixes_tags:
                    if qualifier:
                        qualifier += ", " + linkage_remove_prefixes_tags[prefix]
                    else:
                        qualifier = linkage_remove_prefixes_tags[prefix]
                # Recheck ignored linkages
                if re.search(linkage_ignore_re, item1):
                    continue

            # Remove certain suffixes from linkages
            m = re.search(linkage_remove_suffixes_re, item1)
            if m:
                item1 = item1[: m.start()]

            # Parse linkages with "value = english" syntax (e.g.,
            # väittää/Finnish)
            idx = item1.find(" = ")
            if idx >= 0:
                eng = item1[idx + 3 :]
                if classify_desc(eng, no_unknown_starts=True) == "english":
                    english = eng
                    item1 = item1[:idx]
                else:
                    # Some places seem to use it reversed
                    # "english = value"
                    eng = item1[:idx]
                    if classify_desc(eng, no_unknown_starts=True) == "english":
                        english = eng
                        item1 = item1[idx + 3 :]
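
            # E.g., "väittää = to claim" (per the comment above) yields
            # english = "to claim" and item1 = "väittää"; the reversed
            # "english = value" order is handled symmetrically.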

            # Parse linkages with "value - english" syntax (e.g.,
            # man/Faroese)
            m = re.search(r" [-‐‑‒–—―] ", item1)
            if m and "(" not in item1:
                suffix = item1[m.end() :]
                cls = classify_desc(suffix, no_unknown_starts=True)
                if cls == "english":
                    # This case intentionally ignores old values from english
                    # (otherwise taxonomic lists fail)
                    english = suffix
                    item1 = item1[: m.start()]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + suffix
                    else:
                        qualifier = suffix
                    item1 = item1[: m.start()]

            # Parse certain tags at the end of the linked term (unless
            # we are in a letters list)
            item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
            if q:
                if qualifier:
                    qualifier += ", " + ", ".join(q)
                else:
                    qualifier = ", ".join(q)

            m = re.search(linkage_truncate_re, item1)
            if m:
                # suffix = item1[m.start():]  # Currently ignored
                item1 = item1[: m.start()]
            if not item1:
                continue  # Ignore empty link targets
            if item1 == word:
                continue  # Ignore self-links

        def add(w: str, r: Optional[str]) -> None:
            assert isinstance(w, str)
            assert r is None or isinstance(r, str)
            nonlocal alt
            nonlocal taxonomic

            # We remove "*" from the beginning of reconstruction linkages.
            # Such linkages should only occur in reconstruction senses, so
            # this should not cause ambiguity.
            if is_reconstruction and w.startswith("*"):
                w = w[1:]

            # Check if the word contains the Fullwidth Solidus, and if
            # so, split by it and treat the results as alternative
            # linkages.  (This is very commonly used for alternative
            # written forms in Chinese compounds and other linkages.)
            # However, if the word contains a comma, then we won't
            # split as this is used when we have a different number
            # of romanizations than written forms, and don't know
            # which is which.
            if (
                (not w or "," not in w)
                and (not r or "," not in r)
                and not wxr.wtp.page_exists(w)
            ):
                lst = w.split("/") if len(w) > 1 else [w]
                if len(lst) == 1:
                    lst = w.split(" / ")
                if len(lst) == 1 and len(lst[0]) >= 6:
                    lst = w.split("/")
                if len(lst) > 1:
                    # Treat each alternative as separate linkage
                    for w in lst:
                        add(w, r)
                    return None

            # Heuristically remove "." at the end of most linkages
            # (some linkage lists end in a period, but we also have
            # abbreviations that end with a period that should be kept)
            if (
                w.endswith(".")
                and not wxr.wtp.page_exists(w)
                and (
                    wxr.wtp.page_exists(w[:-1])
                    or (len(w) >= 5)
                    and "." not in w[:-1]
                )
            ):
                w = w[:-1]

            # If we have roman but not alt and the word is ASCII,
            # move roman to alt.
            if r and not alt and w.isascii():
                alt = r
                r = None
            # Add the linkage
            dt: LinkageData = {}
            if qualifier:
                parse_sense_qualifier(wxr, qualifier, dt)
            if sense:
                dt["sense"] = sense.strip()
            if r:
                dt["roman"] = r.strip()
            if ruby:
                dt["ruby"] = ruby
            if english:
                dt["english"] = english.strip()
            if taxonomic:
                if re.match(r"×[A-Z]", taxonomic):
                    data_append(dt, "tags", "extinct")
                    taxonomic = taxonomic[1:]
                dt["taxonomic"] = taxonomic
            if re.match(r"×[A-Z]", w):
                data_append(dt, "tags", "extinct")
                w = w[1:]  # Remove × before dead species names
            if alt and re.match(r"×[A-Z]", alt):
                data_append(dt, "tags", "extinct")
                alt = alt[1:]  # Remove × before dead species names
            if alt and alt.strip() != w:
                dt["alt"] = alt.strip()
            if urls:
                dt["urls"] = [
                    url.strip() for url in urls if url and isinstance(url, str)
                ]
            dt["word"] = w
            for old in data.get(field, ()):  # type: ignore[attr-defined]
                if dt == old:
                    break
            else:
                data_append(data, field, dt)
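
            # Note: the for/else above uses Python's loop-else idiom; the
            # else branch runs only when no identical linkage already
            # exists under data[field], so exact duplicates are skipped.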

        # Handle exceptional linkage splits and other linkage
        # conversions (including expanding to variant forms)
        if item1 in linkage_split_exceptions:
            for item2 in linkage_split_exceptions[item1]:
                add(item2, roman)
            continue

        # Various templates for letters in scripts use spaces as
        # separators and also have multiple characters without
        # spaces consecutively.
        v = sense or qualifier
        # print("lang={} v={} script_chars={} item1={!r}"
        #       .format(wxr.wtp.section, v, script_chars, item1))
        if v and script_chars:
            if (
                len(item1.split()) > 1
                or len(list(re.finditer(unicode_dc_re, item1))) == 2
                or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
            ):
                if v == qualifier:
                    # if sense:
                    #     sense += "; " + qualifier
                    # else:
                    #     sense = qualifier
                    qualifier = None
                if re.search(r" (letters|digits|script)$", v):
                    qualifier = v  # Also parse as qualifier
                elif re.search(
                    r"Variations of letter |"
                    r"Letters using |"
                    r"Letters of the ",
                    v,
                ):
                    qualifier = "letter"
                parts = item1.split(". ")
                extra: Sequence[str] = ()
                if len(parts) > 1:
                    extra = parts[1:]
                    item1 = parts[0]
                # Handle multi-character names for chars in language's
                # alphabet, e.g., "Ny ny" in P/Hungarian.
                if (
                    len(subitems) > 20
                    and len(item1.split()) == 2
                    and all(len(x) <= 3 for x in item1.split())
                ):
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                else:
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]?", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                for e in extra:
                    idx = e.find(":")
                    if idx >= 0:
                        e = e[idx + 1 :].strip()
                    if e.endswith("."):
                        e = e[:-1]
                    parts.extend(e.split())

                # XXX this is not correct - see P/Vietnamese
                # While some sequences have multiple consecutive
                # characters, others use pairs and some have
                # 2/3 character names, e.g., "Ng ng".

                rparts: Optional[list[Optional[str]]] = None
                if roman:
                    rparts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]", roman)
                        if not m.group(0).isspace()
                    )
                    if len(rparts) != len(parts):
                        rparts = None
                if not rparts:
                    rparts = [None] * len(parts)

                for w, r in zip(parts, rparts):
                    add(w, r)
                continue

        add(item1, roman)
    return None
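
# Sketch of the wikitext shape handled below (hypothetical entry): an
# "Alternative forms" section whose list items contain link templates or
# plain wikilinks, e.g.
#   * {{alter|en|colour||UK}}
#   * [[colour]]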

def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if isinstance(node, TemplateNode) and node.template_name in [
                    "l",
                    "link",
                    "L",
                    "alt",
                    "alter",
                ]:
                    extract_l_template(wxr, word_entry, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word != "":
                        form: FormData = {"form": word, "tags": ["alternative"]}
                        data_append(word_entry, "forms", form)


def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
    forms: list[FormData] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form: FormData = {"form": word, "tags": ["alternative"]}
                forms.append(form)
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            roman = clean_node(wxr, None, span_tag)
            if roman != "":
                forms[-1]["roman"] = roman
        elif "label-content" in span_class and len(forms) > 0:
            tag_text = clean_node(wxr, None, span_tag)
            if classify_desc(tag_text) == "tags":
                tagsets1, _ = decode_tags(tag_text)
                tags: list[str] = []
                for ts in tagsets1:
                    tags.extend(ts)
                for form in forms:
                    form["tags"].extend(tags)
    data_extend(word_entry, "forms", forms)
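
# Sketch of the expanded HTML walked above (hypothetical, for {{l|ru|слово}}):
#   <span class="Cyrl" lang="ru">слово</span> <span lang="ru-Latn">slovo</span>
# The span matching the language code becomes the form; a following
# "-Latn" span supplies its romanization.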