Coverage for src/wiktextract/extractor/en/linkages.py: 81%
478 statements
coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1# Code related to parsing linkages (synonyms, hypernyms, related terms, etc.)
2#
3# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import re
6import unicodedata
7from typing import Optional, Sequence
9from ...datautils import data_append, split_at_comma_semi
10from ...tags import linkage_beginning_tags
11from ...wxr_context import WiktextractContext
12from .form_descriptions import (
13 classify_desc,
14 head_final_bantu_langs,
15 head_final_bantu_re,
16 head_final_numeric_langs,
17 head_final_other_langs,
18 head_final_other_re,
19 head_final_re,
20 parse_head_final_tags,
21 parse_sense_qualifier,
22)
23from .type_utils import LinkageData, WordData
25# Linkage will be ignored if it matches this regexp before splitting
26linkage_pre_split_ignore_re = re.compile(
27 r"^("
28 + "|".join(
29 re.escape(x)
30 for x in [
31 "For more variations, see ",
32 "Signal flag:",
33 "Semaphore:",
34 ]
35 )
36 + r")"
37)
39# Linkage will be ignored if it has one of these prefixes
40linkage_ignore_prefixes = [
41 "Historical and regional synonyms of ",
42 "edit data",
43 "or these other third-person pronouns",
44 "introduced in Unicode ",
45 "Entries in the ",
46 "Wikipedia article ",
47 "Wiktionary's coverage of ",
48 "Ethnologue entry for ",
49 "Any of Thesaurus:",
50 "See contents of Category:",
51 "See also Thesaurus:",
52 "See also Appendix:",
53 "As SMS messaging ",
54 "For the reversed question mark used in some right-to-left-scripts",
55 "such as ",
56 "Appendix:",
57 "Category:",
58 ":Category:",
59]
61# Linkage will be ignored if it has any of these suffixes
62linkage_ignore_suffixes = [
63 " Wikipedia",
64 " Wikipedia.",
65 " edition of Wiktionary",
66]
68# Linkage will be ignored if it is one of these (with full match)
69linkage_ignore_whole = [
70 "etc.",
71 "other derived terms:",
72 "Formal terms",
73 "informal and slang terms",
74]
76# Linkage will be ignored if it matches this regexp
77linkage_ignore_re = re.compile(
78 r"^("
79 + "|".join(re.escape(x) for x in linkage_ignore_whole)
80 + r")$|^("
81 + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
82 + r")|("
83 + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
84 + r")$"
85)
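# Quick illustration of the combined ignore pattern (inputs are
# illustrative, not from any particular page):
#
#     linkage_ignore_re.search("etc.")                  # whole-string match -> ignored
#     linkage_ignore_re.search("Appendix:Gestures")     # ignored prefix
#     linkage_ignore_re.search("article at Wikipedia")  # ignored suffix
#     linkage_ignore_re.search("happy")                 # None -> kept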
87# These prefixes will be removed from linkages, leaving the rest. This is
88# considered separately for each linkage in a list.
89linkage_remove_prefixes_re = re.compile(
90 r"^("
91 + r"|".join(
92 re.escape(x)
93 for x in [
94 ":",
95 "see Thesaurus:",
96 "See Thesaurus:",
97 "see also Thesaurus:",
98 "See also Thesaurus:",
99 "see also ",
100 "See also ",
101 "see ",
102 "See ",
103 "from ",
104 "abbreviation of ",
105 "ISO 639-1 code ",
106 "ISO 639-3 code ",
107 "Thesaurus:",
108 ]
109 )
110 + ")"
111)
113# When removing prefix from linkage, this dictionary can be used to map
114# the removed prefix to a space-separated list of tags to add
115linkage_remove_prefixes_tags = {
116 "abbreviation of ": "abbreviation",
117}
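# Sketch of how the two structures above interact (hypothetical item
# "abbreviation of NYC"): the regexp match consumes "abbreviation of ",
# the remainder becomes the linkage word, and the removed prefix maps
# to an extra tag:
#
#     m = linkage_remove_prefixes_re.match("abbreviation of NYC")
#     rest = "abbreviation of NYC"[m.end():]                 # -> "NYC"
#     linkage_remove_prefixes_tags.get("abbreviation of ")   # -> "abbreviation"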
119# These suffixes will be removed from linkages, leaving the rest. This is
120# considered separately for each linkage in a list.
121linkage_remove_suffixes_re = re.compile(
122 r"(\s+on (Wikispecies|Wikimedia Commons|"
123 r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
124 r"\s*[-–] Pre-reform orthography.*)"
125 r"$"
126)
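# For example (illustrative strings):
#
#     re.sub(linkage_remove_suffixes_re, "", "vulpes on Wikispecies")      # -> "vulpes"
#     re.sub(linkage_remove_suffixes_re, "", "dog on English Wikipedia.")  # -> "dog"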
128# Ignore linkage parenthesized sections that contain one of these strings
129linkage_paren_ignore_contains_re = re.compile(
130 r"\b("
131 + "|".join(
132 re.escape(x)
133 for x in [
134 "from Etymology",
135 "used as",
136 "usage notes",
137 ]
138 )
139 + ")([, ]|$)"
140)
142taxonomic_ending_map = {
143 "superkingdoms": "superkingdom",
144 "kingdoms": "kingdom",
145 "subkingdoms": "subkingdom",
146 "infrakingdoms": "infrakingdom",
147 "phylums": "phylum",
148 "subphylums": "subphylum",
149 "infraphylums": "infraphylum",
150 "superclasses": "superclass",
151 "classes": "class",
152 "orders": "order",
153 "suborders": "suborder",
154 "families": "family",
155 "subfamilies": "subfamily",
156 "genera": "genus",
157}
158for k, v in list(taxonomic_ending_map.items()):
159 taxonomic_ending_map[v] = v # Also add singular -> singular
160taxonomic_ending_re = re.compile(
161 r"\s+[-‐‑‒–—]\s+({})$".format(
162 "|".join(re.escape(x) for x in taxonomic_ending_map)
163 )
164)
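# Illustrative effect (hypothetical hyponym list): for an item like
# "Panthera, Felis - genera", taxonomic_ending_re matches " - genera",
# the suffix is stripped, and taxonomic_ending_map["genera"] -> "genus"
# is attached as the English gloss of every linkage in the list.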
166# Exceptional splits for linkages. This can be used to fix particular linkages
167# that are not handled correctly by the default code. This can also be used
168# to create automatic aliases, e.g., for mapping "..." and "…" to both.
169linkage_split_exceptions = {
170 "∛ ∜": ["∛", "∜"],
171 "...": ["...", "…"],
172 "…": ["...", "…"],
173}
175# Truncate linkage word if it matches any of these strings
176linkage_truncate_re = re.compile(
177 "|".join(
178 re.escape(x)
179 for x in [
180 " and its derived terms",
181 " UTF-16 0x214C",
182 ]
183 )
184)
186# Regexp for identifying special linkages containing lists of letters, digits,
187# or characters
188script_chars_re = re.compile(
189 r"(script letters| script| letters|"
190 r"Dialectological|Puctuation|Symbols|"
191 r"Guillemets|Single guillemets|"
192 r" tetragrams|"
193 r" digits)(;|$)|"
194 r"(^|; )(Letters using |Letters of the |"
195 r"Variations of letter )|"
196 r"^(Hiragana|Katakana)$"
197)
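# E.g., a sense/qualifier such as "Arabic digits" (suffix match) or
# "Variations of letter A" (prefix match) turns on the special
# letter/digit list handling further below.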
199# Matches a Unicode character including any combining diacritics (even if
200# they are separate characters)
201unicode_dc_re = re.compile(
202 r"\w[{}]|.".format(
203 "".join(
204 chr(x)
205 for x in range(0, 0x110000)
206 if unicodedata.category(chr(x)) == "Mn"
207 )
208 )
209)
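# For example, in "a\u0301bc" (an "a" followed by a combining acute
# accent), finditer() yields "a\u0301", "b", "c": the base letter and
# its combining mark stay together as a single token.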
212def parse_linkage_item_text(
213 wxr: WiktextractContext,
214 word: str,
215 data: WordData,
216 field: str,
217 item: str,
218 sense: Optional[str],
219 ruby: list,
220 pos_datas: list,
221 is_reconstruction: bool,
222 urls: Optional[list[str]] = None,
223 links: Optional[list[str]] = None,
224) -> Optional[str]:
225 """Parses a linkage item once it has been converted to a string. This
226 may add one or more linkages to ``data`` under ``field``. This
227 returns None or a string that contains tags that should be applied
228 to additional linkages (commonly used in tables for Asian characters)."""
229 assert isinstance(wxr, WiktextractContext)
230 assert isinstance(word, str) # Main word (derived from page title)
231 assert isinstance(data, dict) # Parsed linkages are stored here under field
232 assert isinstance(field, str) # The field under which to store linkage
233 assert isinstance(item, str) # The string to parse
234 assert sense is None or isinstance(sense, str)
235 assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or ""
236 assert isinstance(pos_datas, list) # List of senses (containing "glosses")
237 assert urls is None or isinstance(urls, list) # Captured urls
238 assert is_reconstruction in (True, False)
240 item = item.replace("()", "")
241 item = re.sub(r"\s+", " ", item)
242 item = item.strip()
244 base_roman = None
245 base_alt = None
246 base_english = None
247 script_chars = False
248 base_qualifier = None
249 lang = wxr.wtp.section
251 # If ``sense`` can be parsed as tags, treat it as tags instead
252 if sense:
253 cls = classify_desc(sense, no_unknown_starts=True)
254 if cls == "tags":
255 base_qualifier = sense
256 sense = None
258 # Check if this item is a stand-alone sense (or tag) specifier
259 # for following items (e.g., commonly in a table, see 滿)
260 m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
261 if m:
262 return m.group(1)
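# E.g., an item that is just "(to eat):" returns "to eat", which the
# caller then applies to the items that follow it in the table.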
264 # Check for pre-split ignored linkages using the appropriate regexp
265 if re.search(linkage_pre_split_ignore_re, item):
266 return None
268 # print(" LINKAGE ITEM: {}: {} (sense {})"
269 # .format(field, item, sense))
271 # Replace occurrences of ~ in the item by the page title
272 safetitle = wxr.wtp.title.replace("\\", "\\\\") # type: ignore[union-attr]
273 item = item.replace(" ~ ", " " + safetitle + " ")
274 item = re.sub(r"^~ ", safetitle + " ", item)
275 item = re.sub(r" ~$", " " + safetitle, item)
277 # Many taxonomic terms contain hyponym lists that end with the
278 # kind of the hyponym (a taxonomic level in plural). Recognize
279 # such and add the term in singular to all linkages in the list.
280 m = re.search(taxonomic_ending_re, item)
281 if m:
282 base_english = taxonomic_ending_map[m.group(1)]
283 item = item[: m.start()]
285 # Some Korean and Japanese words use a "word (romanized): english" pattern.
286 # Sometimes the parenthesized part contains comma-separated alt and roman.
287 m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
288 if m:
289 rom = m.group(2)
290 eng = m.group(3)
291 rest = m.group(1)
292 if (
293 classify_desc(rest, no_unknown_starts=True) == "other"
294 and classify_desc(eng, no_unknown_starts=True) == "english"
295 ):
296 item = rest
297 base_roman = rom
298 lst = base_roman.split(", ")
299 if (
300 len(lst) == 2
301 and classify_desc(lst[0], no_unknown_starts=True) == "other"
302 ):
303 base_alt = lst[0]
304 base_roman = lst[1]
305 if base_english:
306 base_english += "; " + eng
307 else:
308 base_english = eng
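# Illustrative match for the pattern above (assuming classify_desc
# labels the Hangul as "other" and the gloss as "english"): an item
# like "사랑 (sarang): love" leaves item = "사랑" with base_roman =
# "sarang" and base_english = "love"; a parenthesized "(alt, roman)"
# pair is split into base_alt and base_roman.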
310 # Many words have tags or similar descriptions at the beginning,
311 # followed by a colon and one or more linkages (e.g.,
312 # panetella/Finnish)
313 m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
314 r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
315 item,
316 )
317 if m:
318 desc = m.group(1)
319 rest = m.group(len(m.groups()))
320 # Check for certain comma-separated tags combined
321 # with English text at the beginning or end of a
322 # comma-separated parenthesized list
323 lst = split_at_comma_semi(desc, skipped=links)
324 while len(lst) > 1:
325 # Check for tags at the beginning
326 cls = classify_desc(lst[0], no_unknown_starts=True)
327 if cls == "tags":
328 if base_qualifier:
329 base_qualifier += ", " + lst[0]
330 else:
331 base_qualifier = lst[0]
332 lst = lst[1:]
333 continue
334 # Check for tags at the end
335 cls = classify_desc(lst[-1], no_unknown_starts=True)
336 if cls == "tags":
337 if base_qualifier:
338 base_qualifier += ", " + lst[-1]
339 else:
340 base_qualifier = lst[-1]
341 lst = lst[:-1]
342 continue
343 break
344 desc = ", ".join(lst)
346 # Sometimes we have e.g. "chemistry (slang)" where both parts
347 # are tags (see "stink"). Handle that case by
348 # removing parentheses if the value is still tags. The part with
349 # parentheses could be on either side of the colon.
350 if "(" in desc:
351 x = desc.replace("(", ",").replace(")", ",")
352 if classify_desc(x, no_unknown_starts=True) == "tags":
353 desc = x
354 elif "(" in rest:
355 x = rest.replace("(", ",").replace(")", ",")
356 if classify_desc(x, no_unknown_starts=True) == "tags":
357 rest = desc
358 desc = x
360 # See if the prefix should trigger the special handling for script
361 # characters, letters, digits, etc.
362 if re.search(script_chars_re, desc):
363 script_chars = True
365 # Try to determine which side is description and which is
366 # the linked term (both orders are widely used in Wiktionary)
367 cls = classify_desc(desc, no_unknown_starts=True)
368 cls2 = classify_desc(rest, no_unknown_starts=True)
369 # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
370 # .format(desc, cls, rest, cls2))
372 e1 = wxr.wtp.page_exists(desc)
373 e2 = wxr.wtp.page_exists(rest)
374 if cls != "tags":
375 if (
376 cls2 == "tags"
377 or (e1 and not e2)
378 or (
379 e1
380 and e2
381 and cls2 == "english"
382 and cls in ("other", "romanization")
383 )
384 or (
385 not e1
386 and not e2
387 and cls2 == "english"
388 and cls in ("other", "romanization")
389 )
390 ):
391 desc, rest = rest, desc # Looks like swapped syntax
392 cls = cls2
393 if re.search(linkage_paren_ignore_contains_re, desc):  # ↛ 394: condition never true
394 desc = ""
395 # print("linkage colon prefix desc={!r} rest={!r} cls={}"
396 # .format(desc, rest, cls))
398 # Handle the prefix according to its type
399 if cls == "tags":
400 if base_qualifier:
401 base_qualifier += ", " + desc
402 else:
403 base_qualifier = desc
404 item = rest
405 elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
406 if base_english:  # ↛ 407: condition never true
407 base_english += "; " + desc
408 else:
409 base_english = desc
410 item = rest
411 elif cls in ("english", "taxonomic"):
412 if sense:  # ↛ 413: condition never true
413 sense += "; " + desc
414 else:
415 sense = desc
416 item = rest
417 elif desc.isdigit():
418 idx = int(desc) - 1
419 if idx >= 0 and idx < len(pos_datas):
420 d = pos_datas[idx]
421 gl = "; ".join(d.get("glosses", ()))
422 if not gl:  # ↛ 423: condition never true
423 wxr.wtp.debug(
424 "parenthesized numeric linkage prefix, "
425 "but the referenced sense has no gloss: "
426 "{}".format(desc),
427 sortid="linkages/355",
428 )
429 elif sense:
430 sense += "; " + gl
431 else:
432 sense = gl
433 item = rest
434 else:
435 wxr.wtp.debug(
436 "parenthesized numeric linkage prefix, "
437 "but there is no sense with such index: {}".format(desc),
438 sortid="linkages/365",
439 )
440 item = rest
441 else:
442 wxr.wtp.debug(
443 "unrecognized linkage prefix: {} desc={} rest={} "
444 "cls={} cls2={} e1={} e2={}".format(
445 item, desc, rest, cls, cls2, e1, e2
446 ),
447 sortid="linkages/371",
448 )
449 item = rest
451 base_sense = sense
453 # Check for certain plural tag forms at end of items list, and apply
454 # them to all items if found
455 m = re.search(
456 r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
457 r"characters|symbols|tetragrams|letter names|names|"
458 r"female names|male names|proper nouns|contractions|"
459 r"nonstandard spellings|verbs|prepositions|postpositions|"
460 r"interjections|Abbreviations|abbreviations|variants|"
461 r"ordinals|nouns|phrases|adjectives|adverbs|"
462 r"augmentatives|pejoratives|compound words|numerals|"
463 r"Tally marks|surnames|modern nonstandard spellings)$",
464 item,
465 )
466 if m:
467 suffix = m.group(1)
468 if base_qualifier:
469 base_qualifier += ", " + suffix
470 else:
471 base_qualifier = suffix
472 item = item[: m.start()]
474 # Certain linkage items have space-separated values. These are
475 # generated by, e.g., certain templates
476 if base_sense and base_sense.endswith(" paper sizes"):
477 base_qualifier = None
478 item = ", ".join(item.split())
479 # XXX isn't this now handled by the generic digits/letters/etc code?
480 # elif base_qualifier in ("Arabic digits",):
481 # item = ", ".join(item.split())
483 item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item) # Now empty superscript
484 item = item.strip()
485 if not item:  # ↛ 486: condition never true
486 return None
488 # Kludge: if the item contains ")/" (with possibly spaces in between),
489 # replace it by a comma so it gets split.
490 item = re.sub(r"\)\s*/", "), ", item)
492 # The item may contain multiple comma-separated linkages
493 if base_roman:
494 subitems = [item]
495 else:
496 # Split at commas. Also, in most cases split by " or ", but this
497 # is complicated - "or" may end certain words (e.g., "logical or")
498 # and it may separate head-final tags (e.g. "foo f or m"). Also,
499 # some words have parenthesized parts in between, e.g.,
500 # wife/English/Translations/Yiddish:
501 # "ווײַב n (vayb) or f, פֿרוי f (froy)"
502 subitems = []
503 for item1 in split_at_comma_semi(item, skipped=links):
504 if " or " not in item1:
505 subitems.append(item1)
506 continue
507 # Item1 contains " or "
508 item2 = re.sub(r"\s*\([^)]*\)", "", item1)
509 item2 = re.sub(r"\s+", " ", item2)
510 if (
511 (
512 lang not in head_final_bantu_langs
513 or not re.search(head_final_bantu_re, item2)
514 )
515 and (
516 lang not in head_final_other_langs
517 or not re.search(head_final_other_re, item2)
518 )
519 and (
520 not re.search(head_final_re, item2)
521 or (
522 item2[-1].isdigit()
523 and lang not in head_final_numeric_langs
524 )
525 )
526 and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
527 and all(
528 wxr.wtp.title not in x.split(" or ")
529 for x in split_at_comma_semi(item2, skipped=links)
530 if " or " in x
531 )
532 ):
533 # We can split this item. Split the non-cleaned version
534 # that still has any intervening parenthesized parts.
535 subitems.extend(
536 split_at_comma_semi(item1, extra=[" or "], skipped=links)
537 )
538 else:
539 subitems.append(item1)
540 if len(subitems) > 1: # Would be merged from multiple subitems
541 ruby = [] # XXX what is the purpose of this?
542 for item1 in subitems:
543 if len(subitems) > 1 and item1 in ("...", "…"):
544 # Some lists have ellipsis in the middle - don't generate
545 # linkages for the ellipsis
546 continue
547 item1 = item1.strip()
548 qualifier = base_qualifier
549 sense = base_sense
550 parts = []
551 roman = base_roman # Usually None
552 alt = base_alt # Usually None
553 taxonomic = None
554 english = base_english
556 # Some words have derived terms with parenthesized quoted English
557 # descriptions, which can sometimes essentially be tags
558 # Some words (bleki/Esperanto...) can have parentheses inside
559 # the quotes, so let's make this regex even more unreadable.
560 m = re.search(r"\s*\(“([^”]+)”\)", item1)
561 if m:  # ↛ 562: condition never true
562 t = m.group(1)
563 item1 = (item1[: m.start()] + item1[m.end() :]).strip()
564 cls = classify_desc(t)
565 if cls == "tags":
566 if qualifier:
567 qualifier += ", " + t
568 else:
569 qualifier = t
570 else:
571 english = t
573 # Some Korean words use a "word (alt, roman, “english”)" pattern;
574 # see 滿/Korean.
575 m = re.match(
576 r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
577 r'[“”"]([^”“"]+)[“”"]\)$',
578 item1,
579 )
580 if (
581 m
582 and classify_desc(m.group(1), no_unknown_starts=True) == "other"
583 and classify_desc(m.group(2), no_unknown_starts=True) == "other"
584 ):
585 alt = m.group(2)
586 roman = m.group(3)
587 english = m.group(4)
588 item1 = m.group(1)
590 words = item1.split(" ")
591 if (
592 len(words) > 1
593 and words[0] in linkage_beginning_tags
594 and words[0] != wxr.wtp.title
595 ):
596 t = linkage_beginning_tags[words[0]]
597 item1 = " ".join(words[1:])
598 if qualifier:  # ↛ 599: condition never true
599 qualifier += ", " + t
600 else:
601 qualifier = t
603 # Extract quoted English translations (there are also other
604 # kinds of English translations)
605 def english_repl(m: re.Match) -> str:
606 nonlocal english
607 nonlocal qualifier
608 v = m.group(1).strip()
609 # If v is "tags: sense", handle the tags
610 m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
611 if m1 is not None:  # ↛ 612: condition never true
612 desc, rest = m1.groups()
613 if classify_desc(desc, no_unknown_starts=True) == "tags":
614 if qualifier:
615 qualifier += ", " + desc
616 else:
617 qualifier = desc
618 v = rest
619 if english:  # ↛ 620: condition never true
620 english += "; " + v
621 else:
622 english = v
623 return ""
625 item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()
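# E.g. (illustrative): item1 = 'perro “dog”' leaves item1 = "perro"
# with english = "dog"; a quoted "informal: dog" would first peel off
# "informal" as a tag and keep "dog" as the English gloss.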
627 # There could be multiple parenthesized parts, and
628 # sometimes both at the beginning and at the end.
629 # And sometimes even in the middle, as in e.g.
630 # wife/English/Translations/Yiddish
631 while not script_chars and (
632 not sense or not re.search(script_chars_re, sense)
633 ):
634 par = None
635 nonfirst_par = False
636 if par is None:  # ↛ 653: condition always true
637 # Try to find a parenthesized part from the beginning.
638 m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
639 if m:
640 par = m.group(1)
641 item1 = item1[m.end() :]
642 else:
643 # Try to find a parenthesized part at the end or from the
644 # middle.
645 m = re.search(
646 r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?",
647 item1,
648 )
649 if m:
650 par = m.group(1)
651 item1 = item1[: m.start()] + item1[m.end() :]
652 nonfirst_par = True
653 if not par:
654 break
655 if re.search(linkage_paren_ignore_contains_re, par):
656 continue # Skip these linkage descriptors
657 par = par.strip()
658 # Handle tags from beginning of par. We also handle "other"
659 # here as Korean entries often have the Hanja form at the
660 # beginning of the parenthesis, before romanization. Similarly
661 # for many Japanese entries.
662 while par:  # ↛ 683: condition always true
663 idx = par.find(",")
664 if idx <= 0:
665 break
666 cls = classify_desc(par[:idx], no_unknown_starts=True)
667 if cls == "other" and not alt: 667 ↛ 668line 667 didn't jump to line 668 because the condition on line 667 was never true
668 alt = par[:idx]
669 elif cls == "taxonomic": 669 ↛ 670line 669 didn't jump to line 670 because the condition on line 669 was never true
670 taxonomic = par[:idx]
671 elif cls == "tags":
672 if qualifier:
673 qualifier += ", " + par[:idx]
674 else:
675 qualifier = par[:idx]
676 else:
677 break
678 par = par[idx + 1 :].strip()
680 # Check for certain comma-separated tags combined
681 # with English text at the beginning or end of a
682 # comma-separated parenthesized list
683 lst = par.split(",") if len(par) > 1 else [par]
684 lst = list(x.strip() for x in lst if x.strip())
685 while len(lst) > 1:
686 cls = classify_desc(lst[0], no_unknown_starts=True)
687 if cls == "tags": 687 ↛ 688line 687 didn't jump to line 688 because the condition on line 687 was never true
688 if qualifier:
689 qualifier += ", " + lst[0]
690 else:
691 qualifier = lst[0]
692 lst = lst[1:]
693 continue
694 cls = classify_desc(lst[-1], no_unknown_starts=True)
695 if cls == "tags":
696 if qualifier:
697 qualifier += ", " + lst[-1]
698 else:
699 qualifier = lst[-1]
700 lst = lst[:-1]
701 continue
702 break
703 par = ", ".join(lst)
705 # Handle remaining types
706 if not par:  # ↛ 707: condition never true
707 continue
708 if re.search(script_chars_re, par):
709 script_chars = True
710 if classify_desc(par, no_unknown_starts=True) == "tags":  # ↛ 720: condition always true
711 if base_qualifier:  # ↛ 712: condition never true
712 base_qualifier += "; " + par
713 else:
714 base_qualifier = par
715 if qualifier:  # ↛ 716: condition never true
716 qualifier += "; " + par
717 else:
718 qualifier = par
719 else:
720 if base_sense:
721 base_sense += "; " + par
722 else:
723 base_sense = par
724 if sense:
725 sense += "; " + par
726 else:
727 sense = par
728 elif par.endswith(" letter names"): 728 ↛ 729line 728 didn't jump to line 729 because the condition on line 728 was never true
729 if base_qualifier:
730 base_qualifier += "; " + par
731 else:
732 base_qualifier = par
733 if qualifier:
734 qualifier += "; " + par
735 else:
736 qualifier = par
737 else:
738 cls = classify_desc(par)
739 # print("classify_desc: {!r} -> {}".format(par, cls))
740 if cls == "tags":
741 if qualifier:  # ↛ 742: condition never true
742 qualifier += ", " + par
743 else:
744 qualifier = par
745 elif cls == "english":
746 if nonfirst_par:
747 if english:
748 english += "; " + par
749 else:
750 english = par
751 else:
752 if sense:  # ↛ 753: condition never true
753 sense += "; " + par
754 else:
755 sense = par
756 elif cls == "romanization":
757 roman = par
758 elif cls == "taxonomic":
759 taxonomic = par
760 elif par.isdigit():  # ↛ 761: condition never true
761 idx = int(par) - 1
762 if idx >= 0 and idx < len(pos_datas):
763 d = pos_datas[idx]
764 gl = "; ".join(d.get("glosses", ()))
765 if not gl:
766 wxr.wtp.debug(
767 "parenthesized number "
768 "but the referenced sense has no "
769 "gloss: {}".format(par),
770 sortid="linkages/665",
771 )
772 elif sense:
773 sense += "; " + gl
774 else:
775 sense = gl
776 else:
777 wxr.wtp.debug(
778 "parenthesized number but there is "
779 "no sense with such index: {}".format(par),
780 sortid="linkages/674",
781 )
782 else:
783 if alt:  # ↛ 784: condition never true
784 alt += "; " + par
785 else:
786 alt = par
788 # Handle certain special cases, unless we are parsing
789 # script characters.
790 if not script_chars:
791 # Ignore all linkages with certain prefixes, suffixes, or parts
792 # (this is done after removing certain prefixes and suffixes)
793 if re.search(linkage_ignore_re, item1):
794 continue # Ignore linkages with certain prefixes
796 # Remove certain prefixes from linkages
797 m = re.match(linkage_remove_prefixes_re, item1)
798 if m:
799 prefix = item1[: m.end()]
800 item1 = item1[m.end() :]
801 if prefix in linkage_remove_prefixes_tags:
802 if qualifier:
803 qualifier += ", " + linkage_remove_prefixes_tags[prefix]
804 else:
805 qualifier = linkage_remove_prefixes_tags[prefix]
806 # Recheck ignored linkages
807 if re.search(linkage_ignore_re, item1):  # ↛ 808: condition never true
808 continue
810 # Remove certain suffixes from linkages
811 m = re.search(linkage_remove_suffixes_re, item1)
812 if m:
813 item1 = item1[: m.start()]
815 # Parse linkages with "value = english" syntax (e.g.,
816 # väittää/Finnish)
817 idx = item1.find(" = ")
818 if idx >= 0:
819 eng = item1[idx + 3 :]
820 if classify_desc(eng, no_unknown_starts=True) == "english":
821 english = eng
822 item1 = item1[:idx]
823 else:
824 # Some places seem to use it reversed
825 # "english = value"
826 eng = item1[:idx]
827 if classify_desc(eng, no_unknown_starts=True) == "english":  # ↛ 833: condition always true
828 english = eng
829 item1 = item1[idx + 3 :]
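# E.g. (illustrative): "haukkua = to bark" yields english = "to bark"
# and item1 = "haukkua"; the reversed order "to bark = haukkua" is
# caught by the second branch above.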
831 # Parse linkages with "value - english" syntax (e.g.,
832 # man/Faroese)
833 m = re.search(r" [-‐‑‒–—―] ", item1)
834 if m and "(" not in item1:
835 suffix = item1[m.end() :]
836 cls = classify_desc(suffix, no_unknown_starts=True)
837 if cls == "english":
838 # This case intentionally ignores old values from english
839 # (otherwise taxonomic lists fail)
840 english = suffix
841 item1 = item1[: m.start()]
842 elif cls == "tags": 842 ↛ 843line 842 didn't jump to line 843 because the condition on line 842 was never true
843 if qualifier:
844 qualifier += ", " + suffix
845 else:
846 qualifier = suffix
847 item1 = item1[: m.start()]
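# E.g. (illustrative): "maður - man" yields english = "man" and
# item1 = "maður", the man/Faroese style referenced above.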
849 # Parse certain tags at the end of the linked term (unless
850 # we are in a letters list)
851 item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
852 if q:
853 if qualifier:  # ↛ 854: condition never true
854 qualifier += ", " + ", ".join(q)
855 else:
856 qualifier = ", ".join(q)
858 m = re.search(linkage_truncate_re, item1)
859 if m:  # ↛ 861: condition never true
860 # suffix = item1[m.start():] # Currently ignored
861 item1 = item1[: m.start()]
862 if not item1:
863 continue # Ignore empty link targets
864 if item1 == word:
865 continue # Ignore self-links
867 def add(w: str, r: Optional[str]) -> None:
868 assert isinstance(w, str)
869 assert r is None or isinstance(r, str)
870 nonlocal alt
871 nonlocal taxonomic
873 # We remove "*" from the beginning of reconstruction linkages.
874 # Such linkages should only occur in reconstruction senses, so
875 # this should not cause ambiguity.
876 if is_reconstruction and w.startswith("*"):
877 w = w[1:]
879 # Check if the word contains the Fullwidth Solidus, and if
880 # so, split by it and treat the results as alternative
881 # linkages. (This is very commonly used for alternative
882 # written forms in Chinese compounds and other linkages.)
883 # However, if the word contains a comma, then we won't
884 # split as this is used when we have a different number
885 # of romanizations than written forms, and don't know
886 # which is which.
887 if (
888 (not w or "," not in w)
889 and (not r or "," not in r)
890 and not wxr.wtp.page_exists(w)
891 ):
892 lst = w.split("/") if len(w) > 1 else [w]
893 if len(lst) == 1:
894 lst = w.split(" / ")
895 if len(lst) == 1 and len(lst[0]) >= 6:
896 lst = w.split("/")
897 if len(lst) > 1:
898 # Treat each alternative as separate linkage
899 for w in lst:
900 add(w, r)
901 return None
903 # Heuristically remove "." at the end of most linkages
904 # (some linkage lists end in a period, but we also have
905 # abbreviations that end with a period that should be kept)
906 if (  # ↛ 915: condition never true
907 w.endswith(".")
908 and not wxr.wtp.page_exists(w)
909 and (
910 wxr.wtp.page_exists(w[:-1])
911 or (len(w) >= 5)
912 and "." not in w[:-1]
913 )
914 ):
915 w = w[:-1]
917 # If we have roman but not alt and the word is ASCII,
918 # move roman to alt.
919 if r and not alt and w.isascii():
920 alt = r
921 r = None
922 # Add the linkage
923 dt: LinkageData = {}
924 if qualifier:
925 parse_sense_qualifier(wxr, qualifier, dt)
926 if sense:
927 dt["sense"] = sense.strip()
928 if r:
929 dt["roman"] = r.strip()
930 if ruby:
931 dt["ruby"] = ruby
932 if english:
933 dt["english"] = english.strip()
934 if taxonomic:
935 if re.match(r"×[A-Z]", taxonomic):
936 data_append(dt, "tags", "extinct")
937 taxonomic = taxonomic[1:]
938 dt["taxonomic"] = taxonomic
939 if re.match(r"×[A-Z]", w):
940 data_append(dt, "tags", "extinct")
941 w = w[1:] # Remove × before dead species names
942 if alt and re.match(r"×[A-Z]", alt):
943 data_append(dt, "tags", "extinct")
944 alt = alt[1:] # Remove × before dead species names
945 if alt and alt.strip() != w:
946 dt["alt"] = alt.strip()
947 if urls:  # ↛ 948: condition never true
948 dt["urls"] = [
949 url.strip() for url in urls if url and isinstance(url, str)
950 ]
951 dt["word"] = w
952 for old in data.get(field, ()): # type: ignore[attr-defined]
953 if dt == old:
954 break
955 else:
956 data_append(data, field, dt)
958 # Handle exceptional linkage splits and other linkage
959 # conversions (including expanding to variant forms)
960 if item1 in linkage_split_exceptions:  # ↛ 961: condition never true
961 for item2 in linkage_split_exceptions[item1]:
962 add(item2, roman)
963 continue
965 # Various templates for letters in scripts use spaces as
966 # separators, and may also run several characters together
967 # without intervening spaces.
968 v = sense or qualifier
969 # print("lang={} v={} script_chars={} item1={!r}"
970 # .format(wxr.wtp.section, v, script_chars, item1))
971 if v and script_chars:
972 if (
973 len(item1.split()) > 1
974 or len(list(re.finditer(unicode_dc_re, item1))) == 2
975 or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
976 ):
977 if v == qualifier:
978 # if sense:
979 # sense += "; " + qualifier
980 # else:
981 # sense = qualifier
982 qualifier = None
983 if re.search(r" (letters|digits|script)$", v):
984 qualifier = v # Also parse as qualifier
985 elif re.search(  # ↛ 992: condition always true
986 r"Variations of letter |"
987 r"Letters using |"
988 r"Letters of the ",
989 v,
990 ):
991 qualifier = "letter"
992 parts = item1.split(". ")
993 extra: Sequence[str] = ()
994 if len(parts) > 1:  # ↛ 995: condition never true
995 extra = parts[1:]
996 item1 = parts[0]
997 # Handle multi-character names for chars in language's
998 # alphabet, e.g., "Ny ny" in P/Hungarian.
999 if (
1000 len(subitems) > 20
1001 and len(item1.split()) == 2
1002 and all(len(x) <= 3 for x in item1.split())
1003 ):
1004 parts = list(
1005 m.group(0)
1006 for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
1007 if not m.group(0).isspace()
1008 and m.group(0) not in ("(", ")")
1009 )
1010 else:
1011 parts = list(
1012 m.group(0)
1013 for m in re.finditer(r".[\u0300-\u036f]?", item1)
1014 if not m.group(0).isspace()
1015 and m.group(0) not in ("(", ")")
1016 )
1017 for e in extra:  # ↛ 1018: loop never started
1018 idx = e.find(":")
1019 if idx >= 0:
1020 e = e[idx + 1 :].strip()
1021 if e.endswith("."):
1022 e = e[:-1]
1023 parts.extend(e.split())
1025 # XXX this is not correct - see P/Vietnamese
1026 # While some sequences have multiple consecutive
1027 # characters, others use pairs and some have
1028 # 2/3 character names, e.g., "Ng ng".
1030 rparts: Optional[list[Optional[str]]] = None
1031 if roman:  # ↛ 1032: condition never true
1032 rparts = list(
1033 m.group(0)
1034 for m in re.finditer(r".[\u0300-\u036f]", roman)
1035 if not m.group(0).isspace()
1036 )
1037 if len(rparts) != len(parts):
1038 rparts = None
1039 if not rparts:  # ↛ 1042: condition always true
1040 rparts = [None] * len(parts)
1042 for w, r in zip(parts, rparts):
1043 add(w, r)
1044 continue
1046 add(item1, roman)
1047 return None