Coverage for src/wiktextract/extractor/en/linkages.py: 81%

478 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1# Code related to parsing linkages (synonyms, hypernyms, related terms, etc) 

2# 

3# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org 

4 

5import re 

6import unicodedata 

7from typing import Optional, Sequence 

8 

9from ...datautils import data_append, split_at_comma_semi 

10from ...tags import linkage_beginning_tags 

11from ...wxr_context import WiktextractContext 

12from .form_descriptions import ( 

13 classify_desc, 

14 head_final_bantu_langs, 

15 head_final_bantu_re, 

16 head_final_numeric_langs, 

17 head_final_other_langs, 

18 head_final_other_re, 

19 head_final_re, 

20 parse_head_final_tags, 

21 parse_sense_qualifier, 

22) 

23from .type_utils import LinkageData, WordData 

24 

# A linkage item is ignored as a whole, before it is split into sub-items,
# when it begins with one of these literal strings.  Each string is passed
# through re.escape(), so it matches literally; the pattern is anchored at
# the start of the item only (there is no trailing "$").
linkage_pre_split_ignore_re = re.compile(
    r"^("
    + "|".join(
        re.escape(prefix)
        for prefix in (
            "For more variations, see ",
            "Signal flag:",
            "Semaphore:",
        )
    )
    + r")"
)

38 

# Linkage will be ignored if it starts with one of these prefixes
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# Linkage will be ignored if it ends with any of these suffixes
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# Linkage will be ignored if it is exactly one of these (full match)
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# Single regexp combining the three lists above.  Every entry is escaped and
# therefore matched literally.  The three alternatives are, in order:
#   group 1 - whole-string match   (anchored with ^ and $)
#   group 2 - prefix match         (anchored with ^)
#   group 3 - suffix match         (anchored with $)
linkage_ignore_re = re.compile(
    r"^({})$|^({})|({})$".format(
        "|".join(map(re.escape, linkage_ignore_whole)),
        "|".join(map(re.escape, linkage_ignore_prefixes)),
        "|".join(map(re.escape, linkage_ignore_suffixes)),
    )
)

86 

# These prefixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
#
# The entries are escaped (matched literally) and tried in the order listed,
# so a longer prefix must come before any shorter prefix it starts with
# (e.g. "see also " before "see ").
linkage_remove_prefixes_re = re.compile(
    r"^("
    + r"|".join(
        re.escape(prefix)
        for prefix in (
            ":",
            "see Thesaurus:",
            "See Thesaurus:",
            "see also Thesaurus:",
            "See also Thesaurus:",
            "see also ",
            "See also ",
            "see ",
            "See ",
            "from ",
            "abbreviation of ",
            "ISO 639-1 code ",
            "ISO 639-3 code ",
            "Thesaurus:",
        )
    )
    + ")"
)

112 

# When removing a prefix from a linkage, this dictionary can be used to map
# the removed prefix (exactly as matched, including its trailing space) to a
# space-separated list of tags to add.
linkage_remove_prefixes_tags = {
    "abbreviation of ": "abbreviation",
}

118 

# These suffixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
#
# Two kinds of suffix are recognised, both anchored at the end of the string:
#   - " on Wikispecies", " on Wikimedia Commons", " on <Xxx> Wiktionary" or
#     " on <Xxx> Wikipedia", optionally followed by a period;
#   - a hyphen or en dash followed by " Pre-reform orthography" and anything
#     after it.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
    r"\s*[-–] Pre-reform orthography.*)"
    r"$"
)

127 

# Ignore linkage parenthesized sections that contain one of these strings.
# The string must start at a word boundary and be followed by a comma, a
# space, or the end of the text.
linkage_paren_ignore_contains_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(phrase)
        for phrase in (
            "from Etymology",
            "used as",
            "usage notes",
        )
    )
    + ")([, ]|$)"
)

141 

# Maps a taxonomic rank name, as it appears at the end of a hyponym list
# (normally in plural), to its singular form.
taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
# Also add singular -> singular, so a list ending in the singular form is
# recognised too.  The comprehension is fully evaluated before update() runs,
# so the dict is never mutated while it is being iterated, and no loop
# variables are left behind in the module namespace.
taxonomic_ending_map.update(
    {singular: singular for singular in taxonomic_ending_map.values()}
)

# Matches "<whitespace><dash><whitespace><rank>" at the very end of an item;
# group 1 is the rank name, usable as a key into taxonomic_ending_map.
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+({})$".format(
        "|".join(re.escape(rank) for rank in taxonomic_ending_map)
    )
)

165 

# Exceptional splits for linkages.  This can be used to fix particular
# linkages that are not handled correctly by the default code.  This can also
# be used to create automatic aliases, e.g., for mapping "..." and "…" to
# both.  Keys are the complete linkage text; values are the list of linkage
# words to generate in its place.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

174 

# Truncate the linkage word at the first occurrence of any of these strings
# (matched literally, anywhere in the word).
linkage_truncate_re = re.compile(
    "|".join(
        re.escape(marker)
        for marker in (
            " and its derived terms",
            " UTF-16 0x214C",
        )
    )
)

185 

# Regexp for identifying special linkages containing lists of letters,
# digits, or characters.  It matches when the description
#   - contains one of the listed keywords immediately before a ";" or the
#     end of the text, or
#   - has "Letters using ", "Letters of the " or "Variations of letter " at
#     the start of the text or right after "; ", or
#   - is exactly "Hiragana" or "Katakana".
#
# "Puctuation" is a historical misspelling in this pattern; it is kept so
# that nothing that matched before stops matching, and the correct spelling
# "Punctuation" is accepted alongside it.
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Punctuation|Puctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)

198 

# Matches one "visible" character: either a word character immediately
# followed by ONE combining mark (Unicode category "Mn", nonspacing mark),
# or, failing that, any single character other than a newline.  A second or
# later combining mark on the same base character is matched on its own by
# the "." alternative.
#
# The character class is built once at import time by scanning every code
# point.  No escaping is applied to the class contents.
unicode_dc_re = re.compile(
    r"\w[{}]|.".format(
        "".join(
            chr(code_point)
            for code_point in range(0, 0x110000)
            if unicodedata.category(chr(code_point)) == "Mn"
        )
    )
)

210 

211 

212def parse_linkage_item_text( 

213 wxr: WiktextractContext, 

214 word: str, 

215 data: WordData, 

216 field: str, 

217 item: str, 

218 sense: Optional[str], 

219 ruby: list, 

220 pos_datas: list, 

221 is_reconstruction: bool, 

222 urls: Optional[list[str]] = None, 

223 links: Optional[list[str]] = None, 

224) -> Optional[str]: 

225 """Parses a linkage item once it has been converted to a string. This 

226 may add one or more linkages to ``data`` under ``field``. This 

227 returns None or a string that contains tags that should be applied 

228 to additional linkages (commonly used in tables for Asian characters).""" 

229 assert isinstance(wxr, WiktextractContext) 

230 assert isinstance(word, str) # Main word (derived from page title) 

231 assert isinstance(data, dict) # Parsed linkages are stored here under field 

232 assert isinstance(field, str) # The field under which to store linkage 

233 assert isinstance(item, str) # The string to parse 

234 assert sense is None or isinstance(sense, str) 

235 assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or "" 

236 assert isinstance(pos_datas, list) # List of senses (containing "glosses") 

237 assert urls is None or isinstance(urls, list) # Captured urls 

238 assert is_reconstruction in (True, False) 

239 

240 item = item.replace("()", "") 

241 item = re.sub(r"\s+", " ", item) 

242 item = item.strip() 

243 

244 base_roman = None 

245 base_alt = None 

246 base_english = None 

247 script_chars = False 

248 base_qualifier = None 

249 lang = wxr.wtp.section 

250 

251 # If ``sense`` can be parsed as tags, treat it as tags instead 

252 if sense: 

253 cls = classify_desc(sense, no_unknown_starts=True) 

254 if cls == "tags": 

255 base_qualifier = sense 

256 sense = None 

257 

258 # Check if this item is a stand-alone sense (or tag) specifier 

259 # for following items (e.g., commonly in a table, see 滿) 

260 m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item) 

261 if m: 

262 return m.group(1) 

263 

264 # Check for pre-split ignored linkages using the appropriate regexp 

265 if re.search(linkage_pre_split_ignore_re, item): 

266 return None 

267 

268 # print(" LINKAGE ITEM: {}: {} (sense {})" 

269 # .format(field, item, sense)) 

270 

271 # Replace occurrences of ~ in the item by the page title 

272 safetitle = wxr.wtp.title.replace("\\", "\\\\") # type: ignore[union-attr] 

273 item = item.replace(" ~ ", " " + safetitle + " ") 

274 item = re.sub(r"^~ ", safetitle + " ", item) 

275 item = re.sub(r" ~$", " " + safetitle, item) 

276 

277 # Many taxonomic terms contain hyponym lists that end with the 

278 # kind of the hyponym (a taxonomic level in plural). Recognize 

279 # such and add the term in singular to all linkages in the list. 

280 m = re.search(taxonomic_ending_re, item) 

281 if m: 

282 base_english = taxonomic_ending_map[m.group(1)] 

283 item = item[: m.start()] 

284 

285 # Some Korean and Japanese words use "word (romanized): english" pattern 

286 # Sometimes the parenthesized part contains comma-separated alt and roman. 

287 m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item) 

288 if m: 

289 rom = m.group(2) 

290 eng = m.group(3) 

291 rest = m.group(1) 

292 if ( 

293 classify_desc(rest, no_unknown_starts=True) == "other" 

294 and classify_desc(eng, no_unknown_starts=True) == "english" 

295 ): 

296 item = rest 

297 base_roman = rom 

298 lst = base_roman.split(", ") 

299 if ( 

300 len(lst) == 2 

301 and classify_desc(lst[0], no_unknown_starts=True) == "other" 

302 ): 

303 base_alt = lst[0] 

304 base_roman = lst[1] 

305 if base_english: 

306 base_english += "; " + eng 

307 else: 

308 base_english = eng 

309 

310 # Many words have tags or similar descriptions in the beginning 

311 # followed by a colon and one or more linkages (e.g., 

312 # panetella/Finnish) 

313 m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match( 

314 r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$", 

315 item, 

316 ) 

317 if m: 

318 desc = m.group(1) 

319 rest = m.group(len(m.groups())) 

320 # Check for certain comma-separated tags combined 

321 # with English text at the beginning or end of a 

322 # comma-separated parenthesized list 

323 lst = split_at_comma_semi(desc, skipped=links) 

324 while len(lst) > 1: 

325 # Check for tags at the beginning 

326 cls = classify_desc(lst[0], no_unknown_starts=True) 

327 if cls == "tags": 

328 if base_qualifier: 

329 base_qualifier += ", " + lst[0] 

330 else: 

331 base_qualifier = lst[0] 

332 lst = lst[1:] 

333 continue 

334 # Check for tags at the end 

335 cls = classify_desc(lst[-1], no_unknown_starts=True) 

336 if cls == "tags": 

337 if base_qualifier: 

338 base_qualifier += ", " + lst[-1] 

339 else: 

340 base_qualifier = lst[-1] 

341 lst = lst[:-1] 

342 continue 

343 break 

344 desc = ", ".join(lst) 

345 

346 # Sometimes we have e.g. "chemistry (slang)" with are 

347 # both tags (see "stink"). Handle that case by 

348 # removing parentheses if the value is still tags. The part with 

349 # parentheses could be on either side of the colon. 

350 if "(" in desc: 

351 x = desc.replace("(", ",").replace(")", ",") 

352 if classify_desc(x, no_unknown_starts=True) == "tags": 

353 desc = x 

354 elif "(" in rest: 

355 x = rest.replace("(", ",").replace(")", ",") 

356 if classify_desc(x, no_unknown_starts=True) == "tags": 

357 rest = desc 

358 desc = x 

359 

360 # See if the prefix should trigger special handling for script 

361 # character, letter, digit, etc. handling 

362 if re.search(script_chars_re, desc): 

363 script_chars = True 

364 

365 # Try to determine which side is description and which is 

366 # the linked term (both orders are widely used in Wiktionary) 

367 cls = classify_desc(desc, no_unknown_starts=True) 

368 cls2 = classify_desc(rest, no_unknown_starts=True) 

369 # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}" 

370 # .format(desc, cls, rest, cls2)) 

371 

372 e1 = wxr.wtp.page_exists(desc) 

373 e2 = wxr.wtp.page_exists(rest) 

374 if cls != "tags": 

375 if ( 

376 cls2 == "tags" 

377 or (e1 and not e1) 

378 or ( 

379 e1 

380 and e2 

381 and cls2 == "english" 

382 and cls in ("other", "romanization") 

383 ) 

384 or ( 

385 not e1 

386 and not e2 

387 and cls2 == "english" 

388 and cls in ("other", "romanization") 

389 ) 

390 ): 

391 desc, rest = rest, desc # Looks like swapped syntax 

392 cls = cls2 

393 if re.search(linkage_paren_ignore_contains_re, desc): 393 ↛ 394line 393 didn't jump to line 394 because the condition on line 393 was never true

394 desc = "" 

395 # print("linkage colon prefix desc={!r} rest={!r} cls={}" 

396 # .format(desc, rest, cls)) 

397 

398 # Handle the prefix according to its type 

399 if cls == "tags": 

400 if base_qualifier: 

401 base_qualifier += ", " + desc 

402 else: 

403 base_qualifier = desc 

404 item = rest 

405 elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"): 

406 if base_english: 406 ↛ 407line 406 didn't jump to line 407 because the condition on line 406 was never true

407 base_english += "; " + base_english 

408 else: 

409 base_english = desc 

410 item = rest 

411 elif cls in ("english", "taxonomic"): 

412 if sense: 412 ↛ 413line 412 didn't jump to line 413 because the condition on line 412 was never true

413 sense += "; " + desc 

414 else: 

415 sense = desc 

416 item = rest 

417 elif desc.isdigit(): 

418 idx = int(desc) - 1 

419 if idx >= 0 and idx < len(pos_datas): 

420 d = pos_datas[idx] 

421 gl = "; ".join(d.get("glosses", ())) 

422 if not gl: 422 ↛ 423line 422 didn't jump to line 423 because the condition on line 422 was never true

423 wxr.wtp.debug( 

424 "parenthesized numeric linkage prefix, " 

425 "but the referenced sense has no gloss: " 

426 "{}".format(desc), 

427 sortid="linkages/355", 

428 ) 

429 elif sense: 

430 sense += "; " + gl 

431 else: 

432 sense = gl 

433 item = rest 

434 else: 

435 wxr.wtp.debug( 

436 "parenthesized numeric linkage prefix, " 

437 "but there is no sense with such index: {}".format(desc), 

438 sortid="linkages/365", 

439 ) 

440 item = rest 

441 else: 

442 wxr.wtp.debug( 

443 "unrecognized linkage prefix: {} desc={} rest={} " 

444 "cls={} cls2={} e1={} e2={}".format( 

445 item, desc, rest, cls, cls2, e1, e2 

446 ), 

447 sortid="linkages/371", 

448 ) 

449 item = rest 

450 

451 base_sense = sense 

452 

453 # Check for certain plural tag forms at end of items list, and apply 

454 # them to all items if found 

455 m = re.search( 

456 r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|" 

457 r"characters|symbols|tetragrams|letter names|names|" 

458 r"female names|male names|proper nouns|contractions|" 

459 r"nonstandard spellings|verbs|prepositions|postpositions|" 

460 r"interjections|Abbreviations|abbreviations|variants|" 

461 r"ordinals|nouns|phrases|adjectives|adverbs|" 

462 r"augmentatives|pejoratives|compound words|numerals|" 

463 r"Tally marks|surnames|modern nonstandard spellings)$", 

464 item, 

465 ) 

466 if m: 

467 suffix = m.group(1) 

468 if base_qualifier: 

469 base_qualifier += ", " + suffix 

470 else: 

471 base_qualifier = suffix 

472 item = item[: m.start()] 

473 

474 # Certain linkage items have space-separated valus. These are 

475 # generated by, e.g., certain templates 

476 if base_sense and base_sense.endswith(" paper sizes"): 

477 base_qualifier = None 

478 item = ", ".join(item.split()) 

479 # XXX isn't this now handled by the generic digits/letters/etc code? 

480 # elif base_qualifier in ("Arabic digits",): 

481 # item = ", ".join(item.split()) 

482 

483 item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item) # Now empty superscript 

484 item = item.strip() 

485 if not item: 485 ↛ 486line 485 didn't jump to line 486 because the condition on line 485 was never true

486 return None 

487 

488 # Kludge: if the item contains ")/" (with possibly spaces in between), 

489 # replace it by a comma so it gets split. 

490 item = re.sub(r"\)\s*/", "), ", item) 

491 

492 # The item may contain multiple comma-separated linkages 

493 if base_roman: 

494 subitems = [item] 

495 else: 

496 # Split at commas. Also, in most cases split by " or ", but this 

497 # is complicated - "or" may end certain words (e.g., "logical or") 

498 # and it may separate head-final tags (e.g. "foo f or m"). Also, 

499 # some words have parenthesizxed parts in between, e.g., 

500 # wife/English/Translations/Yiddish: 

501 # "ווײַב‎ n (vayb) or f, פֿרוי‎ f (froy)" 

502 subitems = [] 

503 for item1 in split_at_comma_semi(item, skipped=links): 

504 if " or " not in item1: 

505 subitems.append(item1) 

506 continue 

507 # Item1 contains " or " 

508 item2 = re.sub(r"\s*\([^)]*\)", "", item1) 

509 item2 = re.sub(r"\s+", " ", item2) 

510 if ( 

511 ( 

512 lang not in head_final_bantu_langs 

513 or not re.search(head_final_bantu_re, item2) 

514 ) 

515 and ( 

516 lang not in head_final_other_langs 

517 or not re.search(head_final_other_re, item2) 

518 ) 

519 and ( 

520 not re.search(head_final_re, item2) 

521 or ( 

522 item2[-1].isdigit() 

523 and lang not in head_final_numeric_langs 

524 ) 

525 ) 

526 and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE") 

527 and all( 

528 wxr.wtp.title not in x.split(" or ") 

529 for x in split_at_comma_semi(item2, skipped=links) 

530 if " or " in x 

531 ) 

532 ): 

533 # We can split this item. Split the non-cleaned version 

534 # that still has any intervening parenthesized parts. 

535 subitems.extend( 

536 split_at_comma_semi(item1, extra=[" or "], skipped=links) 

537 ) 

538 else: 

539 subitems.append(item1) 

540 if len(subitems) > 1: # Would be merged from multiple subitems 

541 ruby = [] # XXX what is the purpose of this? 

542 for item1 in subitems: 

543 if len(subitems) > 1 and item1 in ("...", "…"): 

544 # Some lists have ellipsis in the middle - don't generate 

545 # linkages for the ellipsis 

546 continue 

547 item1 = item1.strip() 

548 qualifier = base_qualifier 

549 sense = base_sense 

550 parts = [] 

551 roman = base_roman # Usually None 

552 alt = base_alt # Usually None 

553 taxonomic = None 

554 english = base_english 

555 

556 # Some words have derived terms with parenthesized quoted English 

557 # descriptions, which can sometimes essentially be tags 

558 # Some word (bleki/Esperanto...) can have parentheses inside 

559 # the quotes, so let's make this regex even more unreadable. 

560 m = re.search(r"\s*\(“([^”]+)”\)", item1) 

561 if m: 561 ↛ 562line 561 didn't jump to line 562 because the condition on line 561 was never true

562 t = m.group(1) 

563 item1 = (item1[: m.start()] + item1[m.end() :]).strip() 

564 cls = classify_desc(t) 

565 if cls == "tags": 

566 if qualifier: 

567 qualifier += ", " + t 

568 else: 

569 qualifier = t 

570 else: 

571 english = t 

572 

573 # Some Korean words use "word (alt, oman, “english”) pattern 

574 # See 滿/Korean 

575 m = re.match( 

576 r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), " 

577 r'[“”"]([^”“"]+)[“”"]\)$', 

578 item1, 

579 ) 

580 if ( 

581 m 

582 and classify_desc(m.group(1), no_unknown_starts=True) == "other" 

583 and classify_desc(m.group(2), no_unknown_starts=True) == "other" 

584 ): 

585 alt = m.group(2) 

586 roman = m.group(3) 

587 english = m.group(4) 

588 item1 = m.group(1) 

589 

590 words = item1.split(" ") 

591 if ( 

592 len(words) > 1 

593 and words[0] in linkage_beginning_tags 

594 and words[0] != wxr.wtp.title 

595 ): 

596 t = linkage_beginning_tags[words[0]] 

597 item1 = " ".join(words[1:]) 

598 if qualifier: 598 ↛ 599line 598 didn't jump to line 599 because the condition on line 598 was never true

599 qualifier += ", " + t 

600 else: 

601 qualifier = t 

602 

603 # Extract quoted English translations (there are also other 

604 # kinds of English translations) 

605 def english_repl(m: re.Match) -> str: 

606 nonlocal english 

607 nonlocal qualifier 

608 v = m.group(1).strip() 

609 # If v is "tags: sense", handle the tags 

610 m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v) 

611 if m1 is not None: 611 ↛ 612line 611 didn't jump to line 612 because the condition on line 611 was never true

612 desc, rest = m1.groups() 

613 if classify_desc(desc, no_unknown_starts=True) == "tags": 

614 if qualifier: 

615 qualifier += ", " + desc 

616 else: 

617 qualifier = desc 

618 v = rest 

619 if english: 619 ↛ 620line 619 didn't jump to line 620 because the condition on line 619 was never true

620 english += "; " + v 

621 else: 

622 english = v 

623 return "" 

624 

625 item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip() 

626 

627 # There could be multiple parenthesized parts, and 

628 # sometimes both at the beginning and at the end. 

629 # And sometimes even in the middle, as in e.g. 

630 # wife/English/Translations/Yiddish 

631 while not script_chars and ( 

632 not sense or not re.search(script_chars_re, sense) 

633 ): 

634 par = None 

635 nonfirst_par = False 

636 if par is None: 636 ↛ 653line 636 didn't jump to line 653 because the condition on line 636 was always true

637 # Try to find a parenthesized part from the beginning. 

638 m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1) 

639 if m: 

640 par = m.group(1) 

641 item1 = item1[m.end() :] 

642 else: 

643 # Try to find a parenthesized part at the end or from the 

644 # middle. 

645 m = re.search( 

646 r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?", 

647 item1, 

648 ) 

649 if m: 

650 par = m.group(1) 

651 item1 = item1[: m.start()] + item1[m.end() :] 

652 nonfirst_par = True 

653 if not par: 

654 break 

655 if re.search(linkage_paren_ignore_contains_re, par): 

656 continue # Skip these linkage descriptors 

657 par = par.strip() 

658 # Handle tags from beginning of par. We also handle "other" 

659 # here as Korean entries often have Hanja form in the 

660 # beginning of parenthesis, before romanization. Similar 

661 # for many Japanese entries. 

662 while par: 662 ↛ 683line 662 didn't jump to line 683 because the condition on line 662 was always true

663 idx = par.find(",") 

664 if idx <= 0: 

665 break 

666 cls = classify_desc(par[:idx], no_unknown_starts=True) 

667 if cls == "other" and not alt: 667 ↛ 668line 667 didn't jump to line 668 because the condition on line 667 was never true

668 alt = par[:idx] 

669 elif cls == "taxonomic": 669 ↛ 670line 669 didn't jump to line 670 because the condition on line 669 was never true

670 taxonomic = par[:idx] 

671 elif cls == "tags": 

672 if qualifier: 

673 qualifier += ", " + par[:idx] 

674 else: 

675 qualifier = par[:idx] 

676 else: 

677 break 

678 par = par[idx + 1 :].strip() 

679 

680 # Check for certain comma-separated tags combined 

681 # with English text at the beginning or end of a 

682 # comma-separated parenthesized list 

683 lst = par.split(",") if len(par) > 1 else [par] 

684 lst = list(x.strip() for x in lst if x.strip()) 

685 while len(lst) > 1: 

686 cls = classify_desc(lst[0], no_unknown_starts=True) 

687 if cls == "tags": 687 ↛ 688line 687 didn't jump to line 688 because the condition on line 687 was never true

688 if qualifier: 

689 qualifier += ", " + lst[0] 

690 else: 

691 qualifier = lst[0] 

692 lst = lst[1:] 

693 continue 

694 cls = classify_desc(lst[-1], no_unknown_starts=True) 

695 if cls == "tags": 

696 if qualifier: 

697 qualifier += ", " + lst[-1] 

698 else: 

699 qualifier = lst[-1] 

700 lst = lst[:-1] 

701 continue 

702 break 

703 par = ", ".join(lst) 

704 

705 # Handle remaining types 

706 if not par: 706 ↛ 707line 706 didn't jump to line 707 because the condition on line 706 was never true

707 continue 

708 if re.search(script_chars_re, par): 

709 script_chars = True 

710 if classify_desc(par, no_unknown_starts=True) == "tags": 710 ↛ 720line 710 didn't jump to line 720 because the condition on line 710 was always true

711 if base_qualifier: 711 ↛ 712line 711 didn't jump to line 712 because the condition on line 711 was never true

712 base_qualifier += "; " + par 

713 else: 

714 base_qualifier = par 

715 if qualifier: 715 ↛ 716line 715 didn't jump to line 716 because the condition on line 715 was never true

716 qualifier += "; " + par 

717 else: 

718 qualifier = par 

719 else: 

720 if base_sense: 

721 base_sense += "; " + par 

722 else: 

723 base_sense = par 

724 if sense: 

725 sense += "; " + par 

726 else: 

727 sense = par 

728 elif par.endswith(" letter names"): 728 ↛ 729line 728 didn't jump to line 729 because the condition on line 728 was never true

729 if base_qualifier: 

730 base_qualifier += "; " + par 

731 else: 

732 base_qualifier = par 

733 if qualifier: 

734 qualifier += "; " + par 

735 else: 

736 qualifier = par 

737 else: 

738 cls = classify_desc(par) 

739 # print("classify_desc: {!r} -> {}".format(par, cls)) 

740 if cls == "tags": 

741 if qualifier: 741 ↛ 742line 741 didn't jump to line 742 because the condition on line 741 was never true

742 qualifier += ", " + par 

743 else: 

744 qualifier = par 

745 elif cls == "english": 

746 if nonfirst_par: 

747 if english: 

748 english += "; " + par 

749 else: 

750 english = par 

751 else: 

752 if sense: 752 ↛ 753line 752 didn't jump to line 753 because the condition on line 752 was never true

753 sense += "; " + par 

754 else: 

755 sense = par 

756 elif cls == "romanization": 

757 roman = par 

758 elif cls == "taxonomic": 

759 taxonomic = par 

760 elif par.isdigit(): 760 ↛ 761line 760 didn't jump to line 761 because the condition on line 760 was never true

761 idx = int(par) - 1 

762 if idx >= 0 and idx < len(pos_datas): 

763 d = pos_datas[idx] 

764 gl = "; ".join(d.get("glosses", ())) 

765 if not gl: 

766 wxr.wtp.debug( 

767 "parenthesized number " 

768 "but the referenced sense has no " 

769 "gloss: {}".format(par), 

770 sortid="linkages/665", 

771 ) 

772 elif sense: 

773 sense += "; " + gl 

774 else: 

775 sense = gl 

776 else: 

777 wxr.wtp.debug( 

778 "parenthesized number but there is " 

779 "no sense with such index: {}".format(par), 

780 sortid="linkages/674", 

781 ) 

782 else: 

783 if alt: 783 ↛ 784line 783 didn't jump to line 784 because the condition on line 783 was never true

784 alt += "; " + par 

785 else: 

786 alt = par 

787 

788 # Handle certain special cases, unless we are parsing 

789 # script characters. 

790 if not script_chars: 

791 # Ignore all linkages with certain prefixes, suffixes, or parts 

792 # (this is done after removing certain prefixes and suffixes) 

793 if re.search(linkage_ignore_re, item1): 

794 continue # Ignore linkages with certain prefixes 

795 

796 # Remove certain prefixes from linkages 

797 m = re.match(linkage_remove_prefixes_re, item1) 

798 if m: 

799 prefix = item1[: m.end()] 

800 item1 = item1[m.end() :] 

801 if prefix in linkage_remove_prefixes_tags: 

802 if qualifier: 

803 qualifier += ", " + linkage_remove_prefixes_tags[prefix] 

804 else: 

805 qualifier = linkage_remove_prefixes_tags[prefix] 

806 # Recheck ignored linkages 

807 if re.search(linkage_ignore_re, item1): 807 ↛ 808line 807 didn't jump to line 808 because the condition on line 807 was never true

808 continue 

809 

810 # Remove certain suffixes from linkages 

811 m = re.search(linkage_remove_suffixes_re, item1) 

812 if m: 

813 item1 = item1[: m.start()] 

814 

815 # Parse linkages with "value = english" syntax (e.g., 

816 # väittää/Finnish) 

817 idx = item1.find(" = ") 

818 if idx >= 0: 

819 eng = item1[idx + 3 :] 

820 if classify_desc(eng, no_unknown_starts=True) == "english": 

821 english = eng 

822 item1 = item1[:idx] 

823 else: 

824 # Some places seem to use it reversed 

825 # "english = value" 

826 eng = item1[:idx] 

827 if classify_desc(eng, no_unknown_starts=True) == "english": 827 ↛ 833line 827 didn't jump to line 833 because the condition on line 827 was always true

828 english = eng 

829 item1 = item1[idx + 3 :] 

830 

831 # Parse linkages with "value - english" syntax (e.g., 

832 # man/Faroese) 

833 m = re.search(r" [-‐‑‒–—―] ", item1) 

834 if m and "(" not in item1: 

835 suffix = item1[m.end() :] 

836 cls = classify_desc(suffix, no_unknown_starts=True) 

837 if cls == "english": 

838 # This case intentionally ignores old values from english 

839 # (otherwise taxonomic lists fail) 

840 english = suffix 

841 item1 = item1[: m.start()] 

842 elif cls == "tags": 842 ↛ 843line 842 didn't jump to line 843 because the condition on line 842 was never true

843 if qualifier: 

844 qualifier += ", " + suffix 

845 else: 

846 qualifier = suffix 

847 item1 = item1[: m.start()] 

848 

849 # Parse certain tags at the end of the linked term (unless 

850 # we are in a letters list) 

851 item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1) 

852 if q: 

853 if qualifier: 853 ↛ 854line 853 didn't jump to line 854 because the condition on line 853 was never true

854 qualifier += ", " + ", ".join(q) 

855 else: 

856 qualifier = ", ".join(q) 

857 

858 m = re.search(linkage_truncate_re, item1) 

859 if m: 859 ↛ 861line 859 didn't jump to line 861 because the condition on line 859 was never true

860 # suffix = item1[m.start():] # Currently ignored 

861 item1 = item1[: m.start()] 

862 if not item1: 

863 continue # Ignore empty link targets 

864 if item1 == word: 

865 continue # Ignore self-links 

866 

867 def add(w: str, r: Optional[str]) -> None: 

868 assert isinstance(w, str) 

869 assert r is None or isinstance(r, str) 

870 nonlocal alt 

871 nonlocal taxonomic 

872 

873 # We remove "*" from the beginning of reconstruction linkages. 

874 # Such linkages should only occur in reconstruction senses, so 

875 # this should not cause ambiguity. 

876 if is_reconstruction and w.startswith("*"): 

877 w = w[1:] 

878 

879 # Check if the word contains the Fullwith Solidus, and if 

880 # so, split by it and treat the the results as alternative 

881 # linkages. (This is very commonly used for alternative 

882 # written forms in Chinese compounds and other linkages.) 

883 # However, if the word contains a comma, then we wont't 

884 # split as this is used when we have a different number 

885 # of romanizations than written forms, and don't know 

886 # which is which. 

887 if ( 

888 (not w or "," not in w) 

889 and (not r or "," not in r) 

890 and not wxr.wtp.page_exists(w) 

891 ): 

892 lst = w.split("/") if len(w) > 1 else [w] 

893 if len(lst) == 1: 

894 lst = w.split(" / ") 

895 if len(lst) == 1 and len(lst[0]) >= 6: 

896 lst = w.split("/") 

897 if len(lst) > 1: 

898 # Treat each alternative as separate linkage 

899 for w in lst: 

900 add(w, r) 

901 return None 

902 

903 # Heuristically remove "." at the end of most linkages 

904 # (some linkage lists end in a period, but we also have 

905 # abbreviations that end with a period that should be kept) 

906 if ( 906 ↛ 915line 906 didn't jump to line 915 because the condition on line 906 was never true

907 w.endswith(".") 

908 and not wxr.wtp.page_exists(w) 

909 and ( 

910 wxr.wtp.page_exists(w[:-1]) 

911 or (len(w) >= 5) 

912 and "." not in w[:-1] 

913 ) 

914 ): 

915 w = w[:-1] 

916 

917 # If we have roman but not alt and the word is ASCII, 

918 # move roman to alt. 

919 if r and not alt and w.isascii(): 

920 alt = r 

921 r = None 

922 # Add the linkage 

923 dt: LinkageData = {} 

924 if qualifier: 

925 parse_sense_qualifier(wxr, qualifier, dt) 

926 if sense: 

927 dt["sense"] = sense.strip() 

928 if r: 

929 dt["roman"] = r.strip() 

930 if ruby: 

931 dt["ruby"] = ruby 

932 if english: 

933 dt["english"] = english.strip() 

934 if taxonomic: 

935 if re.match(r"×[A-Z]", taxonomic): 

936 data_append(dt, "tags", "extinct") 

937 taxonomic = taxonomic[1:] 

938 dt["taxonomic"] = taxonomic 

939 if re.match(r"×[A-Z]", w): 

940 data_append(dt, "tags", "extinct") 

941 w = w[1:] # Remove × before dead species names 

942 if alt and re.match(r"×[A-Z]", alt): 

943 data_append(dt, "tags", "extinct") 

944 alt = alt[1:] # Remove × before dead species names 

945 if alt and alt.strip() != w: 

946 dt["alt"] = alt.strip() 

947 if urls: 947 ↛ 948line 947 didn't jump to line 948 because the condition on line 947 was never true

948 dt["urls"] = [ 

949 url.strip() for url in urls if url and isinstance(url, str) 

950 ] 

951 dt["word"] = w 

952 for old in data.get(field, ()): # type: ignore[attr-defined] 

953 if dt == old: 

954 break 

955 else: 

956 data_append(data, field, dt) 

957 

958 # Handle exceptional linkage splits and other linkage 

959 # conversions (including expanding to variant forms) 

960 if item1 in linkage_split_exceptions: 960 ↛ 961line 960 didn't jump to line 961 because the condition on line 960 was never true

961 for item2 in linkage_split_exceptions[item1]: 

962 add(item2, roman) 

963 continue 

964 

965 # Various templates for letters in scripts use spaces as 

966 # separators and also have multiple characters without 

967 # spaces consecutively. 

968 v = sense or qualifier 

969 # print("lang={} v={} script_chars={} item1={!r}" 

970 # .format(wxr.wtp.section, v, script_chars, item1)) 

971 if v and script_chars: 

972 if ( 

973 len(item1.split()) > 1 

974 or len(list(re.finditer(unicode_dc_re, item1))) == 2 

975 or (len(subitems) > 10 and v in ("Hiragana", "Katakana")) 

976 ): 

977 if v == qualifier: 

978 # if sense: 

979 # sense += "; " + qualifier 

980 # else: 

981 # sense = qualifier 

982 qualifier = None 

983 if re.search(r" (letters|digits|script)$", v): 

984 qualifier = v # Also parse as qualifier 

985 elif re.search( 985 ↛ 992line 985 didn't jump to line 992 because the condition on line 985 was always true

986 r"Variations of letter |" 

987 r"Letters using |" 

988 r"Letters of the ", 

989 v, 

990 ): 

991 qualifier = "letter" 

992 parts = item1.split(". ") 

993 extra: Sequence[str] = () 

994 if len(parts) > 1: 994 ↛ 995line 994 didn't jump to line 995 because the condition on line 994 was never true

995 extra = parts[1:] 

996 item1 = parts[0] 

997 # Handle multi-character names for chars in language's 

998 # alphabet, e.g., "Ny ny" in P/Hungarian. 

999 if ( 

1000 len(subitems) > 20 

1001 and len(item1.split()) == 2 

1002 and all(len(x) <= 3 for x in item1.split()) 

1003 ): 

1004 parts = list( 

1005 m.group(0) 

1006 for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1) 

1007 if not m.group(0).isspace() 

1008 and m.group(0) not in ("(", ")") 

1009 ) 

1010 else: 

1011 parts = list( 

1012 m.group(0) 

1013 for m in re.finditer(r".[\u0300-\u036f]?", item1) 

1014 if not m.group(0).isspace() 

1015 and m.group(0) not in ("(", ")") 

1016 ) 

1017 for e in extra: 1017 ↛ 1018line 1017 didn't jump to line 1018 because the loop on line 1017 never started

1018 idx = e.find(":") 

1019 if idx >= 0: 

1020 e = e[idx + 1 :].strip() 

1021 if e.endswith("."): 

1022 e = e[:-1] 

1023 parts.extend(e.split()) 

1024 

1025 # XXX this is not correct - see P/Vietnamese 

1026 # While some sequences have multiple consecutive 

1027 # characters, others use pairs and some have 

1028 # 2/3 character names, e.g., "Ng ng". 

1029 

1030 rparts: Optional[list[Optional[str]]] = None 

1031 if roman: 1031 ↛ 1032line 1031 didn't jump to line 1032 because the condition on line 1031 was never true

1032 rparts = list( 

1033 m.group(0) 

1034 for m in re.finditer(r".[\u0300-\u036f]", roman) 

1035 if not m.group(0).isspace() 

1036 ) 

1037 if len(rparts) != len(parts): 

1038 rparts = None 

1039 if not rparts: 1039 ↛ 1042line 1039 didn't jump to line 1042 because the condition on line 1039 was always true

1040 rparts = [None] * len(parts) 

1041 

1042 for w, r in zip(parts, rparts): 

1043 add(w, r) 

1044 continue 

1045 

1046 add(item1, roman) 

1047 return None