Coverage for src/wiktextract/extractor/en/linkages.py: 83% (517 statements)
coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

# Code related to parsing linkages (synonyms, hypernyms, related terms, etc.)
#
# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org

import re
import unicodedata
from typing import Optional, Sequence

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import clean_node
from ...tags import linkage_beginning_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    head_final_bantu_langs,
    head_final_bantu_re,
    head_final_numeric_langs,
    head_final_other_langs,
    head_final_other_re,
    head_final_re,
    parse_head_final_tags,
    parse_sense_qualifier,
)
from .type_utils import FormData, LinkageData, WordData

# A linkage will be ignored if it matches this regexp before splitting
linkage_pre_split_ignore_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in [
            "For more variations, see ",
            "Signal flag:",
            "Semaphore:",
        ]
    )
    + r")"
)

# A linkage will be ignored if it has one of these prefixes
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# A linkage will be ignored if it has any of these suffixes
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# A linkage will be ignored if it is exactly one of these (full match)
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# A linkage will be ignored if it matches this regexp
linkage_ignore_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in linkage_ignore_whole)
    + r")$|^("
    + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
    + r")|("
    + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
    + r")$"
)
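# For example, an item that is exactly "etc." is dropped as a whole, one
# starting with "Appendix:" is dropped by prefix, and one ending in
# " Wikipedia" is dropped by suffix.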

# These prefixes will be removed from linkages, leaving the rest. This is
# considered separately for each linkage in a list.
linkage_remove_prefixes_re = re.compile(
    r"^("
    + r"|".join(
        re.escape(x)
        for x in [
            ":",
            "see Thesaurus:",
            "See Thesaurus:",
            "see also Thesaurus:",
            "See also Thesaurus:",
            "see also ",
            "See also ",
            "see ",
            "See ",
            "from ",
            "abbreviation of ",
            "ISO 639-1 code ",
            "ISO 639-3 code ",
            "Thesaurus:",
        ]
    )
    + ")"
)

# When removing a prefix from a linkage, this dictionary can be used to map
# the removed prefix to a space-separated list of tags to add
linkage_remove_prefixes_tags = {
    "abbreviation of ": "abbreviation",
}
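# For example, "abbreviation of NYC" is stored as the linkage "NYC" with the
# tag "abbreviation", while prefixes such as "see Thesaurus:" are simply
# stripped without adding tags.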

# These suffixes will be removed from linkages, leaving the rest. This is
# considered separately for each linkage in a list.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
    r"\s*[-–] Pre-reform orthography.*)"
    r"$"
)
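# For example, "Panthera leo on Wikispecies" is trimmed to "Panthera leo".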

# Ignore linkage parenthesized sections that contain one of these strings
linkage_paren_ignore_contains_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in [
            "from Etymology",
            "used as",
            "usage notes",
        ]
    )
    + ")([, ]|$)"
)
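# For example, a parenthesized part like "(from Etymology 2)" is discarded
# instead of being parsed as tags, a romanization, or a sense.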

taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
for k, v in list(taxonomic_ending_map.items()):
    taxonomic_ending_map[v] = v  # Also add singular -> singular
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+({})$".format(
        "|".join(re.escape(x) for x in taxonomic_ending_map)
    )
)
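# For example, a hyponym list ending in "... – genera" has the ending removed
# and each term in the list receives the English description "genus".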

# Exceptional splits for linkages. This can be used to fix particular linkages
# that are not handled correctly by the default code. This can also be used
# to create automatic aliases, e.g., for mapping "..." and "…" to both.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

# Truncate linkage word if it matches any of these strings
linkage_truncate_re = re.compile(
    "|".join(
        re.escape(x)
        for x in [
            " and its derived terms",
            " UTF-16 0x214C",
        ]
    )
)

# Regexp for identifying special linkages containing lists of letters, digits,
# or characters
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Puctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)
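# A sense or qualifier matching this (e.g. "Latin script letters" or
# "Hiragana") makes the item be treated as a list of characters and split
# accordingly further below.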

# Matches a Unicode character together with a following combining diacritic
# (even though they are separate code points)
unicode_dc_re = re.compile(
    r"\w[{}]|.".format(
        "".join(
            chr(x)
            for x in range(0, 0x110000)
            if unicodedata.category(chr(x)) == "Mn"
        )
    )
)
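# For example, "á" encoded as "a" + combining acute accent (U+0301) matches
# as one unit, so counting matches approximates counting visible characters.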


def parse_linkage_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    field: str,
    item: str,
    sense: Optional[str],
    ruby: list,
    pos_datas: list,
    is_reconstruction: bool,
    urls: Optional[list[str]] = None,
    links: Optional[list[str]] = None,
) -> Optional[str]:
    """Parses a linkage item once it has been converted to a string. This
    may add one or more linkages to ``data`` under ``field``. This
    returns None or a string that contains tags that should be applied
    to additional linkages (commonly used in tables for Asian characters)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)  # Main word (derived from page title)
    assert isinstance(data, dict)  # Parsed linkages are stored here under field
    assert isinstance(field, str)  # The field under which to store linkage
    assert isinstance(item, str)  # The string to parse
    assert sense is None or isinstance(sense, str)
    assert isinstance(ruby, list)  # Captured ruby (hiragana/katakana) or ""
    assert isinstance(pos_datas, list)  # List of senses (containing "glosses")
    assert urls is None or isinstance(urls, list)  # Captured urls
    assert is_reconstruction in (True, False)

    item = item.replace("()", "")
    item = re.sub(r"\s+", " ", item)
    item = item.strip()

    base_roman = None
    base_alt = None
    base_english = None
    script_chars = False
    base_qualifier = None
    lang = wxr.wtp.section

    # If ``sense`` can be parsed as tags, treat it as tags instead
    if sense:
        cls = classify_desc(sense, no_unknown_starts=True)
        if cls == "tags":
            base_qualifier = sense
            sense = None

    # Check if this item is a stand-alone sense (or tag) specifier
    # for following items (e.g., commonly in a table, see 滿)
    m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
    if m:
        return m.group(1)
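    # For example, a table cell containing just "(archaic):" returns
    # "archaic", which the caller then applies to the following items.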

    # Check for pre-split ignored linkages using the appropriate regexp
    if re.search(linkage_pre_split_ignore_re, item):
        return None

    # print("  LINKAGE ITEM: {}: {} (sense {})"
    #       .format(field, item, sense))

    # Replace occurrences of ~ in the item by the page title
    safetitle = wxr.wtp.title.replace("\\", "\\\\")  # type: ignore[union-attr]
    item = item.replace(" ~ ", " " + safetitle + " ")
    item = re.sub(r"^~ ", safetitle + " ", item)
    item = re.sub(r" ~$", " " + safetitle, item)

    # Many taxonomic terms contain hyponym lists that end with the
    # kind of the hyponym (a taxonomic level in plural). Recognize
    # such and add the term in singular to all linkages in the list.
    m = re.search(taxonomic_ending_re, item)
    if m:
        base_english = taxonomic_ending_map[m.group(1)]
        item = item[: m.start()]

    # Some Korean and Japanese words use "word (romanized): english" pattern.
    # Sometimes the parenthesized part contains comma-separated alt and roman.
    m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
    if m:
        rom = m.group(2)
        eng = m.group(3)
        rest = m.group(1)
        if (
            classify_desc(rest, no_unknown_starts=True) == "other"
            and classify_desc(eng, no_unknown_starts=True) == "english"
        ):
            item = rest
            base_roman = rom
            lst = base_roman.split(", ")
            if (
                len(lst) == 2
                and classify_desc(lst[0], no_unknown_starts=True) == "other"
            ):
                base_alt = lst[0]
                base_roman = lst[1]
            if base_english:
                base_english += "; " + eng
            else:
                base_english = eng
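    # For example, "사랑 (sarang): love" would yield item "사랑" with
    # base_roman "sarang" and base_english "love", assuming classify_desc
    # labels the Hangul "other" and "love" "english"; a parenthesized
    # "alt, roman" pair would additionally be split into base_alt and
    # base_roman.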

    # Many words have tags or similar descriptions in the beginning
    # followed by a colon and one or more linkages (e.g.,
    # panetella/Finnish)
    m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
        r"^([a-zA-Z][-'a-zA-Z0-9 ]*(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
        item,
    )
    if m:
        desc = m.group(1)
        rest = m.group(len(m.groups()))
        # Check for certain comma-separated tags combined
        # with English text at the beginning or end of a
        # comma-separated parenthesized list
        lst = split_at_comma_semi(desc, skipped=links)
        while len(lst) > 1:
            # Check for tags at the beginning
            cls = classify_desc(lst[0], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[0]
                else:
                    base_qualifier = lst[0]
                lst = lst[1:]
                continue
            # Check for tags at the end
            cls = classify_desc(lst[-1], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[-1]
                else:
                    base_qualifier = lst[-1]
                lst = lst[:-1]
                continue
            break
        desc = ", ".join(lst)

        # Sometimes we have e.g. "chemistry (slang)" where both parts are
        # tags (see "stink"). Handle that case by removing parentheses if
        # the value is still tags. The part with parentheses could be on
        # either side of the colon.
        if "(" in desc:
            x = desc.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                desc = x
        elif "(" in rest:
            x = rest.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                rest = desc
                desc = x

        # See if the prefix should trigger special handling for script
        # character, letter, digit, etc. handling
        if re.search(script_chars_re, desc):
            script_chars = True

        # Try to determine which side is description and which is
        # the linked term (both orders are widely used in Wiktionary)
        cls = classify_desc(desc, no_unknown_starts=True)
        cls2 = classify_desc(rest, no_unknown_starts=True)
        # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
        #       .format(desc, cls, rest, cls2))

        e1 = wxr.wtp.page_exists(desc)
        e2 = wxr.wtp.page_exists(rest)
        if cls != "tags":
            if (
                cls2 == "tags"
                or (e1 and not e2)  # desc exists as a page but rest does not
                or (
                    e1
                    and e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
                or (
                    not e1
                    and not e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
            ):
                desc, rest = rest, desc  # Looks like swapped syntax
                cls = cls2
        if re.search(linkage_paren_ignore_contains_re, desc):
            desc = ""
        # print("linkage colon prefix desc={!r} rest={!r} cls={}"
        #       .format(desc, rest, cls))

        # Handle the prefix according to its type
        if cls == "tags":
            if base_qualifier:
                base_qualifier += ", " + desc
            else:
                base_qualifier = desc
            item = rest
        elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
            if base_english:
                base_english += "; " + desc
            else:
                base_english = desc
            item = rest
        elif cls in ("english", "taxonomic"):
            if sense:
                sense += "; " + desc
            else:
                sense = desc
            item = rest
        elif desc.isdigit():
            idx = int(desc) - 1
            if idx >= 0 and idx < len(pos_datas):
                d = pos_datas[idx]
                gl = "; ".join(d.get("glosses", ()))
                if not gl:
                    wxr.wtp.debug(
                        "parenthesized numeric linkage prefix, "
                        "but the referenced sense has no gloss: "
                        "{}".format(desc),
                        sortid="linkages/355",
                    )
                elif sense:
                    sense += "; " + gl
                else:
                    sense = gl
                item = rest
            else:
                wxr.wtp.debug(
                    "parenthesized numeric linkage prefix, "
                    "but there is no sense with such index: {}".format(desc),
                    sortid="linkages/365",
                )
                item = rest
        else:
            wxr.wtp.debug(
                "unrecognized linkage prefix: {} desc={} rest={} "
                "cls={} cls2={} e1={} e2={}".format(
                    item, desc, rest, cls, cls2, e1, e2
                ),
                sortid="linkages/371",
            )
            item = rest

    base_sense = sense

    # Check for certain plural tag forms at end of items list, and apply
    # them to all items if found
    m = re.search(
        r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
        r"characters|symbols|tetragrams|letter names|names|"
        r"female names|male names|proper nouns|contractions|"
        r"nonstandard spellings|verbs|prepositions|postpositions|"
        r"interjections|Abbreviations|abbreviations|variants|"
        r"ordinals|nouns|phrases|adjectives|adverbs|"
        r"augmentatives|pejoratives|compound words|numerals|"
        r"Tally marks|surnames|modern nonstandard spellings)$",
        item,
    )
    if m:
        suffix = m.group(1)
        if base_qualifier:
            base_qualifier += ", " + suffix
        else:
            base_qualifier = suffix
        item = item[: m.start()]

    # Certain linkage items have space-separated values. These are
    # generated by, e.g., certain templates
    if base_sense and base_sense.endswith(" paper sizes"):
        base_qualifier = None
        item = ", ".join(item.split())
    # XXX isn't this now handled by the generic digits/letters/etc code?
    # elif base_qualifier in ("Arabic digits",):
    #     item = ", ".join(item.split())

    item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item)  # Now empty superscript
    item = item.strip()
    if not item:
        return None

    # Kludge: if the item contains ")/" (with possibly spaces in between),
    # replace it by a comma so it gets split.
    item = re.sub(r"\)\s*/", "), ", item)
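    # For example, "foo (bar)/ baz" becomes "foo (bar), baz", which the
    # comma splitting below then separates into two linkages.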

    # The item may contain multiple comma-separated linkages
    if base_roman:
        subitems = [item]
    else:
        # Split at commas. Also, in most cases split by " or ", but this
        # is complicated - "or" may end certain words (e.g., "logical or")
        # and it may separate head-final tags (e.g. "foo f or m"). Also,
        # some words have parenthesized parts in between, e.g.,
        # wife/English/Translations/Yiddish:
        #   "ווײַב n (vayb) or f, פֿרוי f (froy)"
        subitems = []
        for item1 in split_at_comma_semi(item, skipped=links):
            if " or " not in item1:
                subitems.append(item1)
                continue
            # Item1 contains " or "
            item2 = re.sub(r"\s*\([^)]*\)", "", item1)
            item2 = re.sub(r"\s+", " ", item2)
            if (
                (
                    lang not in head_final_bantu_langs
                    or not re.search(head_final_bantu_re, item2)
                )
                and (
                    lang not in head_final_other_langs
                    or not re.search(head_final_other_re, item2)
                )
                and (
                    not re.search(head_final_re, item2)
                    or (
                        item2[-1].isdigit()
                        and lang not in head_final_numeric_langs
                    )
                )
                and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
                and all(
                    wxr.wtp.title not in x.split(" or ")
                    for x in split_at_comma_semi(item2, skipped=links)
                    if " or " in x
                )
            ):
                # We can split this item. Split the non-cleaned version
                # that still has any intervening parenthesized parts.
                subitems.extend(
                    split_at_comma_semi(item1, extra=[" or "], skipped=links)
                )
            else:
                subitems.append(item1)
    if len(subitems) > 1:  # Would be merged from multiple subitems
        ruby = []  # XXX what is the purpose of this?
    for item1 in subitems:
        if len(subitems) > 1 and item1 in ("...", "…"):
            # Some lists have ellipsis in the middle - don't generate
            # linkages for the ellipsis
            continue
        item1 = item1.strip()
        qualifier = base_qualifier
        sense = base_sense
        parts = []
        roman = base_roman  # Usually None
        alt = base_alt  # Usually None
        taxonomic = None
        english = base_english

        # Some words have derived terms with parenthesized quoted English
        # descriptions, which can sometimes essentially be tags.
        # Some words (bleki/Esperanto...) can have parentheses inside
        # the quotes, so let's make this regex even more unreadable.
        m = re.search(r"\s*\(“([^”]+)”\)", item1)
        if m:
            t = m.group(1)
            item1 = (item1[: m.start()] + item1[m.end() :]).strip()
            cls = classify_desc(t)
            if cls == "tags":
                if qualifier:
                    qualifier += ", " + t
                else:
                    qualifier = t
            else:
                english = t

        # Some Korean words use the "word (alt, roman, “english”)" pattern.
        # See 滿/Korean
        m = re.match(
            r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
            r'[“”"]([^”“"]+)[“”"]\)$',
            item1,
        )
        if (
            m
            and classify_desc(m.group(1), no_unknown_starts=True) == "other"
            and classify_desc(m.group(2), no_unknown_starts=True) == "other"
        ):
            alt = m.group(2)
            roman = m.group(3)
            english = m.group(4)
            item1 = m.group(1)

        words = item1.split(" ")
        if (
            len(words) > 1
            and words[0] in linkage_beginning_tags
            and words[0] != wxr.wtp.title
        ):
            t = linkage_beginning_tags[words[0]]
            item1 = " ".join(words[1:])
            if qualifier:
                qualifier += ", " + t
            else:
                qualifier = t

        # Extract quoted English translations (there are also other
        # kinds of English translations)
        def english_repl(m: re.Match) -> str:
            nonlocal english
            nonlocal qualifier
            v = m.group(1).strip()
            # If v is "tags: sense", handle the tags
            m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
            if m1 is not None:
                desc, rest = m1.groups()
                if classify_desc(desc, no_unknown_starts=True) == "tags":
                    if qualifier:
                        qualifier += ", " + desc
                    else:
                        qualifier = desc
                    v = rest
            if english:
                english += "; " + v
            else:
                english = v
            return ""

        item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()

        # There could be multiple parenthesized parts, and
        # sometimes both at the beginning and at the end.
        # And sometimes even in the middle, as in e.g.
        # wife/English/Translations/Yiddish
        while not script_chars and (
            not sense or not re.search(script_chars_re, sense)
        ):
            par = None
            nonfirst_par = False
            if par is None:
                # Try to find a parenthesized part from the beginning.
                m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
                if m:
                    par = m.group(1)
                    item1 = item1[m.end() :]
                else:
                    # Try to find a parenthesized part at the end or from
                    # the middle.
                    m = re.search(
                        r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)(\.$)?",
                        item1,
                    )
                    if m:
                        par = m.group(1)
                        item1 = item1[: m.start()] + item1[m.end() :]
                        nonfirst_par = True
            if not par:
                break
            if re.search(linkage_paren_ignore_contains_re, par):
                continue  # Skip these linkage descriptors
            par = par.strip()
            # Handle tags from beginning of par. We also handle "other"
            # here as Korean entries often have Hanja form in the
            # beginning of parenthesis, before romanization. Similar
            # for many Japanese entries.
            while par:
                idx = par.find(",")
                if idx <= 0:
                    break
                cls = classify_desc(par[:idx], no_unknown_starts=True)
                if cls == "other" and not alt:
                    alt = par[:idx]
                elif cls == "taxonomic":
                    taxonomic = par[:idx]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + par[:idx]
                    else:
                        qualifier = par[:idx]
                else:
                    break
                par = par[idx + 1 :].strip()

            # Check for certain comma-separated tags combined
            # with English text at the beginning or end of a
            # comma-separated parenthesized list
            lst = par.split(",") if len(par) > 1 else [par]
            lst = list(x.strip() for x in lst if x.strip())
            while len(lst) > 1:
                cls = classify_desc(lst[0], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[0]
                    else:
                        qualifier = lst[0]
                    lst = lst[1:]
                    continue
                cls = classify_desc(lst[-1], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[-1]
                    else:
                        qualifier = lst[-1]
                    lst = lst[:-1]
                    continue
                break
            par = ", ".join(lst)

            # Handle remaining types
            if not par:
                continue
            if re.search(script_chars_re, par):
                script_chars = True
                if classify_desc(par, no_unknown_starts=True) == "tags":
                    if base_qualifier:
                        base_qualifier += "; " + par
                    else:
                        base_qualifier = par
                    if qualifier:
                        qualifier += "; " + par
                    else:
                        qualifier = par
                else:
                    if base_sense:
                        base_sense += "; " + par
                    else:
                        base_sense = par
                    if sense:
                        sense += "; " + par
                    else:
                        sense = par
            elif par.endswith(" letter names"):
                if base_qualifier:
                    base_qualifier += "; " + par
                else:
                    base_qualifier = par
                if qualifier:
                    qualifier += "; " + par
                else:
                    qualifier = par
            else:
                cls = classify_desc(par)
                # print("classify_desc: {!r} -> {}".format(par, cls))
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + par
                    else:
                        qualifier = par
                elif cls == "english":
                    if nonfirst_par:
                        if english:
                            english += "; " + par
                        else:
                            english = par
                    else:
                        if sense:
                            sense += "; " + par
                        else:
                            sense = par
                elif cls == "romanization":
                    roman = par
                elif cls == "taxonomic":
                    taxonomic = par
                elif par.isdigit():
                    idx = int(par) - 1
                    if idx >= 0 and idx < len(pos_datas):
                        d = pos_datas[idx]
                        gl = "; ".join(d.get("glosses", ()))
                        if not gl:
                            wxr.wtp.debug(
                                "parenthesized number "
                                "but the referenced sense has no "
                                "gloss: {}".format(par),
                                sortid="linkages/665",
                            )
                        elif sense:
                            sense += "; " + gl
                        else:
                            sense = gl
                    else:
                        wxr.wtp.debug(
                            "parenthesized number but there is "
                            "no sense with such index: {}".format(par),
                            sortid="linkages/674",
                        )
                else:
                    if alt:
                        alt += "; " + par
                    else:
                        alt = par

        # Handle certain special cases, unless we are parsing
        # script characters.
        if not script_chars:
            # Ignore all linkages with certain prefixes, suffixes, or parts
            # (this is done after removing certain prefixes and suffixes)
            if re.search(linkage_ignore_re, item1):
                continue  # Ignore linkages with certain prefixes

            # Remove certain prefixes from linkages
            m = re.match(linkage_remove_prefixes_re, item1)
            if m:
                prefix = item1[: m.end()]
                item1 = item1[m.end() :]
                if prefix in linkage_remove_prefixes_tags:
                    if qualifier:
                        qualifier += ", " + linkage_remove_prefixes_tags[prefix]
                    else:
                        qualifier = linkage_remove_prefixes_tags[prefix]
                # Recheck ignored linkages
                if re.search(linkage_ignore_re, item1):
                    continue

            # Remove certain suffixes from linkages
            m = re.search(linkage_remove_suffixes_re, item1)
            if m:
                item1 = item1[: m.start()]

            # Parse linkages with "value = english" syntax (e.g.,
            # väittää/Finnish)
            idx = item1.find(" = ")
            if idx >= 0:
                eng = item1[idx + 3 :]
                if classify_desc(eng, no_unknown_starts=True) == "english":
                    english = eng
                    item1 = item1[:idx]
                else:
                    # Some places seem to use it reversed:
                    # "english = value"
                    eng = item1[:idx]
                    if classify_desc(eng, no_unknown_starts=True) == "english":
                        english = eng
                        item1 = item1[idx + 3 :]

            # Parse linkages with "value - english" syntax (e.g.,
            # man/Faroese)
            m = re.search(r" [-‐‑‒–—―] ", item1)
            if m and "(" not in item1:
                suffix = item1[m.end() :]
                cls = classify_desc(suffix, no_unknown_starts=True)
                if cls == "english":
                    # This case intentionally ignores old values from english
                    # (otherwise taxonomic lists fail)
                    english = suffix
                    item1 = item1[: m.start()]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + suffix
                    else:
                        qualifier = suffix
                    item1 = item1[: m.start()]

        # Parse certain tags at the end of the linked term (unless
        # we are in a letters list)
        item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
        if q:
            if qualifier:
                qualifier += ", " + ", ".join(q)
            else:
                qualifier = ", ".join(q)

        m = re.search(linkage_truncate_re, item1)
        if m:
            # suffix = item1[m.start():]  # Currently ignored
            item1 = item1[: m.start()]
        if not item1:
            continue  # Ignore empty link targets
        if item1 == word:
            continue  # Ignore self-links

        def add(w: str, r: Optional[str]) -> None:
            assert isinstance(w, str)
            assert r is None or isinstance(r, str)
            nonlocal alt
            nonlocal taxonomic

            # We remove "*" from the beginning of reconstruction linkages.
            # Such linkages should only occur in reconstruction senses, so
            # this should not cause ambiguity.
            if is_reconstruction and w.startswith("*"):
                w = w[1:]

            # Check if the word contains the Fullwidth Solidus, and if
            # so, split by it and treat the results as alternative
            # linkages. (This is very commonly used for alternative
            # written forms in Chinese compounds and other linkages.)
            # However, if the word contains a comma, then we won't
            # split, as this is used when we have a different number
            # of romanizations than written forms, and don't know
            # which is which.
            if (
                (not w or "," not in w)
                and (not r or "," not in r)
                and not wxr.wtp.page_exists(w)
            ):
                lst = w.split("／") if len(w) > 1 else [w]
                if len(lst) == 1:
                    lst = w.split(" / ")
                if len(lst) == 1 and len(lst[0]) >= 6:
                    lst = w.split("/")
                if len(lst) > 1:
                    # Treat each alternative as a separate linkage
                    for w in lst:
                        add(w, r)
                    return None

            # Heuristically remove "." at the end of most linkages
            # (some linkage lists end in a period, but we also have
            # abbreviations that end with a period that should be kept)
            if (
                w.endswith(".")
                and not wxr.wtp.page_exists(w)
                and (
                    wxr.wtp.page_exists(w[:-1])
                    or (len(w) >= 5)
                    and "." not in w[:-1]
                )
            ):
                w = w[:-1]

            # If we have roman but not alt and the word is ASCII,
            # move roman to alt.
            if r and not alt and w.isascii():
                alt = r
                r = None
            # Add the linkage
            dt: LinkageData = {}
            if qualifier:
                parse_sense_qualifier(wxr, qualifier, dt)
            if sense:
                dt["sense"] = sense.strip()
            if r:
                dt["roman"] = r.strip()
            if ruby:
                dt["ruby"] = ruby
            if english:
                dt["english"] = english.strip()
            if taxonomic:
                if re.match(r"×[A-Z]", taxonomic):
                    data_append(dt, "tags", "extinct")
                    taxonomic = taxonomic[1:]
                dt["taxonomic"] = taxonomic
            if re.match(r"×[A-Z]", w):
                data_append(dt, "tags", "extinct")
                w = w[1:]  # Remove × before dead species names
            if alt and re.match(r"×[A-Z]", alt):
                data_append(dt, "tags", "extinct")
                alt = alt[1:]  # Remove × before dead species names
            if alt and alt.strip() != w:
                dt["alt"] = alt.strip()
            if urls:
                dt["urls"] = [
                    url.strip() for url in urls if url and isinstance(url, str)
                ]
            dt["word"] = w
            for old in data.get(field, ()):  # type: ignore[attr-defined]
                if dt == old:
                    break
            else:
                data_append(data, field, dt)

        # Handle exceptional linkage splits and other linkage
        # conversions (including expanding to variant forms)
        if item1 in linkage_split_exceptions:
            for item2 in linkage_split_exceptions[item1]:
                add(item2, roman)
            continue

        # Various templates for letters in scripts use spaces as
        # separators and also have multiple characters without
        # spaces consecutively.
        v = sense or qualifier
        # print("lang={} v={} script_chars={} item1={!r}"
        #       .format(wxr.wtp.section, v, script_chars, item1))
        if v and script_chars:
            if (
                len(item1.split()) > 1
                or len(list(re.finditer(unicode_dc_re, item1))) == 2
                or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
            ):
                if v == qualifier:
                    # if sense:
                    #     sense += "; " + qualifier
                    # else:
                    #     sense = qualifier
                    qualifier = None
                if re.search(r" (letters|digits|script)$", v):
                    qualifier = v  # Also parse as qualifier
                elif re.search(
                    r"Variations of letter |"
                    r"Letters using |"
                    r"Letters of the ",
                    v,
                ):
                    qualifier = "letter"
                parts = item1.split(". ")
                extra: Sequence[str] = ()
                if len(parts) > 1:
                    extra = parts[1:]
                    item1 = parts[0]
                # Handle multi-character names for chars in language's
                # alphabet, e.g., "Ny ny" in P/Hungarian.
                if (
                    len(subitems) > 20
                    and len(item1.split()) == 2
                    and all(len(x) <= 3 for x in item1.split())
                ):
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                else:
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]?", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                for e in extra:
                    idx = e.find(":")
                    if idx >= 0:
                        e = e[idx + 1 :].strip()
                    if e.endswith("."):
                        e = e[:-1]
                    parts.extend(e.split())

                # XXX this is not correct - see P/Vietnamese
                # While some sequences have multiple consecutive
                # characters, others use pairs and some have
                # 2/3 character names, e.g., "Ng ng".

                rparts: Optional[list[Optional[str]]] = None
                if roman:
                    rparts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]", roman)
                        if not m.group(0).isspace()
                    )
                    if len(rparts) != len(parts):
                        rparts = None
                if not rparts:
                    rparts = [None] * len(parts)

                for w, r in zip(parts, rparts):
                    add(w, r)
                continue

        add(item1, roman)
    return None
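
# A rough usage sketch (hypothetical arguments; in practice wxr, pos_datas,
# etc. come from the page being parsed):
#
#     data: WordData = {}
#     ret = parse_linkage_item_text(
#         wxr, "cat", data, "synonyms",
#         "(archaic): grimalkin, mouser", None, [], [], False,
#     )
#     # data["synonyms"] would then contain entries such as
#     # {"word": "grimalkin", ...} and {"word": "mouser", ...}; the exact
#     # tags/sense fields depend on how classify_desc labels "archaic".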


def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
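    # Collects alternative forms from this section: list items that are
    # link/alt templates are handled by extract_l_template(), while plain
    # wikilinks (e.g. [[colour]]) become forms tagged "alternative".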
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if isinstance(node, TemplateNode) and node.template_name in [
                    "l",
                    "link",
                    "L",
                    "alt",
                    "alter",
                ]:
                    extract_l_template(wxr, word_entry, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word != "":
                        form: FormData = {"form": word, "tags": ["alternative"]}
                        data_append(word_entry, "forms", form)


def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
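    # The template is expanded to HTML and scanned span by span: a <span>
    # whose lang attribute equals the template's language code is a form, a
    # following "xx-Latn" span is that form's romanization, and a
    # "label-content" span that decodes as tags adds those tags to all
    # collected forms.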
    forms: list[FormData] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form: FormData = {"form": word, "tags": ["alternative"]}
                forms.append(form)
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            roman = clean_node(wxr, None, span_tag)
            if roman != "":
                forms[-1]["roman"] = roman
        elif "label-content" in span_class and len(forms) > 0:
            tag_text = clean_node(wxr, None, span_tag)
            if classify_desc(tag_text) == "tags":
                tagsets1, _ = decode_tags(tag_text)
                tags: list[str] = []
                for ts in tagsets1:
                    tags.extend(ts)
                for form in forms:
                    form["tags"].extend(tags)
    data_extend(word_entry, "forms", forms)