Coverage for src/wiktextract/extractor/en/linkages.py: 83% (517 statements)
coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

# Code related to parsing linkages (synonyms, hypernyms, related terms, etc.)
#
# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org

import re
import unicodedata
from typing import Optional, Sequence

from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode

from ...datautils import data_append, data_extend, split_at_comma_semi
from ...page import clean_node
from ...tags import linkage_beginning_tags
from ...wxr_context import WiktextractContext
from .form_descriptions import (
    classify_desc,
    decode_tags,
    head_final_bantu_langs,
    head_final_bantu_re,
    head_final_numeric_langs,
    head_final_other_langs,
    head_final_other_re,
    head_final_re,
    parse_head_final_tags,
    parse_sense_qualifier,
)
from .type_utils import FormData, LinkageData, WordData

# A linkage will be ignored if it matches this regexp before splitting
linkage_pre_split_ignore_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in [
            "For more variations, see ",
            "Signal flag:",
            "Semaphore:",
        ]
    )
    + r")"
)

# A linkage will be ignored if it has one of these prefixes
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# A linkage will be ignored if it has any of these suffixes
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# A linkage will be ignored if it is exactly one of these (full match)
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# A linkage will be ignored if it matches this regexp
linkage_ignore_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in linkage_ignore_whole)
    + r")$|^("
    + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
    + r")|("
    + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
    + r")$"
)
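# For example, an item that is exactly "etc." is dropped as a whole, one
# starting with "Appendix:" is dropped by prefix, and one ending in
# " Wikipedia" is dropped by suffix.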

# These prefixes will be removed from linkages, leaving the rest. This is
# considered separately for each linkage in a list.
linkage_remove_prefixes_re = re.compile(
    r"^("
    + r"|".join(
        re.escape(x)
        for x in [
            ":",
            "see Thesaurus:",
            "See Thesaurus:",
            "see also Thesaurus:",
            "See also Thesaurus:",
            "see also ",
            "See also ",
            "see ",
            "See ",
            "from ",
            "abbreviation of ",
            "ISO 639-1 code ",
            "ISO 639-3 code ",
            "Thesaurus:",
        ]
    )
    + ")"
)

# When removing a prefix from a linkage, this dictionary can be used to map
# the removed prefix to a space-separated list of tags to add
linkage_remove_prefixes_tags = {
    "abbreviation of ": "abbreviation",
}
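# For example, "abbreviation of NYC" is stored as the linkage "NYC" with the
# tag "abbreviation", while prefixes such as "see Thesaurus:" are simply
# stripped without adding tags.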

# These suffixes will be removed from linkages, leaving the rest. This is
# considered separately for each linkage in a list.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
    r"\s*[-–] Pre-reform orthography.*)"
    r"$"
)
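# For example, "Panthera leo on Wikispecies" is trimmed to "Panthera leo".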

# Ignore linkage parenthesized sections that contain one of these strings
linkage_paren_ignore_contains_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in [
            "from Etymology",
            "used as",
            "usage notes",
        ]
    )
    + ")([, ]|$)"
)
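# For example, a parenthesized part like "(from Etymology 2)" is discarded
# instead of being parsed as tags, a romanization, or a sense.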

taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
for k, v in list(taxonomic_ending_map.items()):
    taxonomic_ending_map[v] = v  # Also add singular -> singular
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+({})$".format(
        "|".join(re.escape(x) for x in taxonomic_ending_map)
    )
)
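# For example, a hyponym list ending in "... – genera" has the ending removed
# and each term in the list receives the English description "genus".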

# Exceptional splits for linkages. This can be used to fix particular linkages
# that are not handled correctly by the default code. This can also be used
# to create automatic aliases, e.g., for mapping "..." and "…" to both.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

# Truncate linkage word if it matches any of these strings
linkage_truncate_re = re.compile(
    "|".join(
        re.escape(x)
        for x in [
            " and its derived terms",
            " UTF-16 0x214C",
        ]
    )
)

# Regexp for identifying special linkages containing lists of letters, digits,
# or characters
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Puctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)
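# A sense or qualifier matching this (e.g. "Latin script letters" or
# "Hiragana") makes the item be treated as a list of characters and split
# accordingly further below.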

# Matches a Unicode character together with a following combining diacritic
# (even though they are separate code points)
unicode_dc_re = re.compile(
    r"\w[{}]|.".format(
        "".join(
            chr(x)
            for x in range(0, 0x110000)
            if unicodedata.category(chr(x)) == "Mn"
        )
    )
)
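# For example, "á" encoded as "a" + combining acute accent (U+0301) matches
# as one unit, so counting matches approximates counting visible characters.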


def parse_linkage_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    field: str,
    item: str,
    sense: Optional[str],
    ruby: list,
    pos_datas: list,
    is_reconstruction: bool,
    urls: Optional[list[str]] = None,
    links: Optional[list[str]] = None,
) -> Optional[str]:
    """Parses a linkage item once it has been converted to a string. This
    may add one or more linkages to ``data`` under ``field``. This
    returns None or a string that contains tags that should be applied
    to additional linkages (commonly used in tables for Asian characters)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)  # Main word (derived from page title)
    assert isinstance(data, dict)  # Parsed linkages are stored here under field
    assert isinstance(field, str)  # The field under which to store linkage
    assert isinstance(item, str)  # The string to parse
    assert sense is None or isinstance(sense, str)
    assert isinstance(ruby, list)  # Captured ruby (hiragana/katakana) or ""
    assert isinstance(pos_datas, list)  # List of senses (containing "glosses")
    assert urls is None or isinstance(urls, list)  # Captured urls
    assert is_reconstruction in (True, False)

    item = item.replace("()", "")
    item = re.sub(r"\s+", " ", item)
    item = item.strip()

    base_roman = None
    base_alt = None
    base_english = None
    script_chars = False
    base_qualifier = None
    lang = wxr.wtp.section

    # If ``sense`` can be parsed as tags, treat it as tags instead
    if sense:
        cls = classify_desc(sense, no_unknown_starts=True)
        if cls == "tags":
            base_qualifier = sense
            sense = None

    # Check if this item is a stand-alone sense (or tag) specifier
    # for following items (e.g., commonly in a table, see 滿)
    m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
    if m:
        return m.group(1)
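    # For example, a table cell containing just "(archaic):" returns
    # "archaic", which the caller then applies to the following items.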

    # Check for pre-split ignored linkages using the appropriate regexp
    if re.search(linkage_pre_split_ignore_re, item):
        return None

    # print("  LINKAGE ITEM: {}: {} (sense {})"
    #       .format(field, item, sense))

    # Replace occurrences of ~ in the item by the page title
    safetitle = wxr.wtp.title.replace("\\", "\\\\")  # type: ignore[union-attr]
    item = item.replace(" ~ ", " " + safetitle + " ")
    item = re.sub(r"^~ ", safetitle + " ", item)
    item = re.sub(r" ~$", " " + safetitle, item)

    # Many taxonomic terms contain hyponym lists that end with the
    # kind of the hyponym (a taxonomic level in plural). Recognize
    # such and add the term in singular to all linkages in the list.
    m = re.search(taxonomic_ending_re, item)
    if m:
        base_english = taxonomic_ending_map[m.group(1)]
        item = item[: m.start()]

    # Some Korean and Japanese words use "word (romanized): english" pattern.
    # Sometimes the parenthesized part contains comma-separated alt and roman.
    m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
    if m:
        rom = m.group(2)
        eng = m.group(3)
        rest = m.group(1)
        if (
            classify_desc(rest, no_unknown_starts=True) == "other"
            and classify_desc(eng, no_unknown_starts=True) == "english"
        ):
            item = rest
            base_roman = rom
            lst = base_roman.split(", ")
            if (
                len(lst) == 2
                and classify_desc(lst[0], no_unknown_starts=True) == "other"
            ):
                base_alt = lst[0]
                base_roman = lst[1]
            if base_english:
                base_english += "; " + eng
            else:
                base_english = eng
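    # For example, "사랑 (sarang): love" would yield item "사랑" with
    # base_roman "sarang" and base_english "love", assuming classify_desc
    # labels the Hangul "other" and "love" "english"; a parenthesized
    # "alt, roman" pair would additionally be split into base_alt and
    # base_roman.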

    # Many words have tags or similar descriptions in the beginning
    # followed by a colon and one or more linkages (e.g.,
    # panetella/Finnish)
    m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
        r"^([a-zA-Z][-'a-zA-Z0-9 ]*(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
        item,
    )
    if m:
        desc = m.group(1)
        rest = m.group(len(m.groups()))
        # Check for certain comma-separated tags combined
        # with English text at the beginning or end of a
        # comma-separated parenthesized list
        lst = split_at_comma_semi(desc, skipped=links)
        while len(lst) > 1:
            # Check for tags at the beginning
            cls = classify_desc(lst[0], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[0]
                else:
                    base_qualifier = lst[0]
                lst = lst[1:]
                continue
            # Check for tags at the end
            cls = classify_desc(lst[-1], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[-1]
                else:
                    base_qualifier = lst[-1]
                lst = lst[:-1]
                continue
            break
        desc = ", ".join(lst)

        # Sometimes we have e.g. "chemistry (slang)" where both parts are
        # tags (see "stink"). Handle that case by removing parentheses if
        # the value is still tags. The part with parentheses could be on
        # either side of the colon.
        if "(" in desc:
            x = desc.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                desc = x
        elif "(" in rest:
            x = rest.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                rest = desc
                desc = x

        # See if the prefix should trigger special handling for script
        # character, letter, digit, etc. handling
        if re.search(script_chars_re, desc):
            script_chars = True

        # Try to determine which side is description and which is
        # the linked term (both orders are widely used in Wiktionary)
        cls = classify_desc(desc, no_unknown_starts=True)
        cls2 = classify_desc(rest, no_unknown_starts=True)
        # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
        #       .format(desc, cls, rest, cls2))

        e1 = wxr.wtp.page_exists(desc)
        e2 = wxr.wtp.page_exists(rest)
        if cls != "tags":
            if (
                cls2 == "tags"
                or (e1 and not e2)  # desc exists as a page but rest does not
                or (
                    e1
                    and e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
                or (
                    not e1
                    and not e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
            ):
                desc, rest = rest, desc  # Looks like swapped syntax
                cls = cls2
        if re.search(linkage_paren_ignore_contains_re, desc):
            desc = ""
        # print("linkage colon prefix desc={!r} rest={!r} cls={}"
        #       .format(desc, rest, cls))

        # Handle the prefix according to its type
        if cls == "tags":
            if base_qualifier:
                base_qualifier += ", " + desc
            else:
                base_qualifier = desc
            item = rest
        elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
            if base_english:
                base_english += "; " + desc
            else:
                base_english = desc
            item = rest
        elif cls in ("english", "taxonomic"):
            if sense:
                sense += "; " + desc
            else:
                sense = desc
            item = rest
        elif desc.isdigit():
            idx = int(desc) - 1
            if idx >= 0 and idx < len(pos_datas):
                d = pos_datas[idx]
                gl = "; ".join(d.get("glosses", ()))
                if not gl:
                    wxr.wtp.debug(
                        "parenthesized numeric linkage prefix, "
                        "but the referenced sense has no gloss: "
                        "{}".format(desc),
                        sortid="linkages/355",
                    )
                elif sense:
                    sense += "; " + gl
                else:
                    sense = gl
                item = rest
            else:
                wxr.wtp.debug(
                    "parenthesized numeric linkage prefix, "
                    "but there is no sense with such index: {}".format(desc),
                    sortid="linkages/365",
                )
                item = rest
        else:
            wxr.wtp.debug(
                "unrecognized linkage prefix: {} desc={} rest={} "
                "cls={} cls2={} e1={} e2={}".format(
                    item, desc, rest, cls, cls2, e1, e2
                ),
                sortid="linkages/371",
            )
            item = rest

    base_sense = sense

    # Check for certain plural tag forms at end of items list, and apply
    # them to all items if found
    m = re.search(
        r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
        r"characters|symbols|tetragrams|letter names|names|"
        r"female names|male names|proper nouns|contractions|"
        r"nonstandard spellings|verbs|prepositions|postpositions|"
        r"interjections|Abbreviations|abbreviations|variants|"
        r"ordinals|nouns|phrases|adjectives|adverbs|"
        r"augmentatives|pejoratives|compound words|numerals|"
        r"Tally marks|surnames|modern nonstandard spellings)$",
        item,
    )
    if m:
        suffix = m.group(1)
        if base_qualifier:
            base_qualifier += ", " + suffix
        else:
            base_qualifier = suffix
        item = item[: m.start()]

    # Certain linkage items have space-separated values. These are
    # generated by, e.g., certain templates
    if base_sense and base_sense.endswith(" paper sizes"):
        base_qualifier = None
        item = ", ".join(item.split())
    # XXX isn't this now handled by the generic digits/letters/etc code?
    # elif base_qualifier in ("Arabic digits",):
    #     item = ", ".join(item.split())

    item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item)  # Now empty superscript
    item = item.strip()
    if not item:
        return None

    # Kludge: if the item contains ")/" (with possibly spaces in between),
    # replace it by a comma so it gets split.
    item = re.sub(r"\)\s*/", "), ", item)
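    # For example, "foo (bar)/ baz" becomes "foo (bar), baz", which the
    # comma splitting below then separates into two linkages.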

    # The item may contain multiple comma-separated linkages
    if base_roman:
        subitems = [item]
    else:
        # Split at commas. Also, in most cases split by " or ", but this
        # is complicated - "or" may end certain words (e.g., "logical or")
        # and it may separate head-final tags (e.g. "foo f or m"). Also,
        # some words have parenthesized parts in between, e.g.,
        # wife/English/Translations/Yiddish:
        #   "ווײַב n (vayb) or f, פֿרוי f (froy)"
        subitems = []
        for item1 in split_at_comma_semi(item, skipped=links):
            if " or " not in item1:
                subitems.append(item1)
                continue
            # Item1 contains " or "
            item2 = re.sub(r"\s*\([^)]*\)", "", item1)
            item2 = re.sub(r"\s+", " ", item2)
            if (
                (
                    lang not in head_final_bantu_langs
                    or not re.search(head_final_bantu_re, item2)
                )
                and (
                    lang not in head_final_other_langs
                    or not re.search(head_final_other_re, item2)
                )
                and (
                    not re.search(head_final_re, item2)
                    or (
                        item2[-1].isdigit()
                        and lang not in head_final_numeric_langs
                    )
                )
                and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
                and all(
                    wxr.wtp.title not in x.split(" or ")
                    for x in split_at_comma_semi(item2, skipped=links)
                    if " or " in x
                )
            ):
                # We can split this item. Split the non-cleaned version
                # that still has any intervening parenthesized parts.
                subitems.extend(
                    split_at_comma_semi(item1, extra=[" or "], skipped=links)
                )
            else:
                subitems.append(item1)
    if len(subitems) > 1:  # Would be merged from multiple subitems
        ruby = []  # XXX what is the purpose of this?
    for item1 in subitems:
        if len(subitems) > 1 and item1 in ("...", "…"):
            # Some lists have ellipsis in the middle - don't generate
            # linkages for the ellipsis
            continue
        item1 = item1.strip()
        qualifier = base_qualifier
        sense = base_sense
        parts = []
        roman = base_roman  # Usually None
        alt = base_alt  # Usually None
        taxonomic = None
        english = base_english

        # Some words have derived terms with parenthesized quoted English
        # descriptions, which can sometimes essentially be tags.
        # Some words (bleki/Esperanto...) can have parentheses inside
        # the quotes, so let's make this regex even more unreadable.
        m = re.search(r"\s*\(“([^”]+)”\)", item1)
        if m:
            t = m.group(1)
            item1 = (item1[: m.start()] + item1[m.end() :]).strip()
            cls = classify_desc(t)
            if cls == "tags":
                if qualifier:
                    qualifier += ", " + t
                else:
                    qualifier = t
            else:
                english = t

        # Some Korean words use the "word (alt, roman, “english”)" pattern.
        # See 滿/Korean
        m = re.match(
            r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
            r'[“”"]([^”“"]+)[“”"]\)$',
            item1,
        )
        if (
            m
            and classify_desc(m.group(1), no_unknown_starts=True) == "other"
            and classify_desc(m.group(2), no_unknown_starts=True) == "other"
        ):
            alt = m.group(2)
            roman = m.group(3)
            english = m.group(4)
            item1 = m.group(1)

        words = item1.split(" ")
        if (
            len(words) > 1
            and words[0] in linkage_beginning_tags
            and words[0] != wxr.wtp.title
        ):
            t = linkage_beginning_tags[words[0]]
            item1 = " ".join(words[1:])
            if qualifier:
                qualifier += ", " + t
            else:
                qualifier = t

        # Extract quoted English translations (there are also other
        # kinds of English translations)
        def english_repl(m: re.Match) -> str:
            nonlocal english
            nonlocal qualifier
            v = m.group(1).strip()
            # If v is "tags: sense", handle the tags
            m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
            if m1 is not None:
                desc, rest = m1.groups()
                if classify_desc(desc, no_unknown_starts=True) == "tags":
                    if qualifier:
                        qualifier += ", " + desc
                    else:
                        qualifier = desc
                    v = rest
            if english:
                english += "; " + v
            else:
                english = v
            return ""

        item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()

        # There could be multiple parenthesized parts, and
        # sometimes both at the beginning and at the end.
        # And sometimes even in the middle, as in e.g.
        # wife/English/Translations/Yiddish
        while not script_chars and (
            not sense or not re.search(script_chars_re, sense)
        ):
            par = None
            nonfirst_par = False
            if par is None:
                # Try to find a parenthesized part from the beginning.
                m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
                if m:
                    par = m.group(1)
                    item1 = item1[m.end() :]
                else:
                    # Try to find a parenthesized part at the end or from
                    # the middle.
                    m = re.search(
                        r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)(\.$)?",
                        item1,
                    )
                    if m:
                        par = m.group(1)
                        item1 = item1[: m.start()] + item1[m.end() :]
                        nonfirst_par = True
            if not par:
                break
            if re.search(linkage_paren_ignore_contains_re, par):
                continue  # Skip these linkage descriptors
            par = par.strip()
            # Handle tags from beginning of par. We also handle "other"
            # here as Korean entries often have Hanja form in the
            # beginning of parenthesis, before romanization. Similar
            # for many Japanese entries.
            while par:
                idx = par.find(",")
                if idx <= 0:
                    break
                cls = classify_desc(par[:idx], no_unknown_starts=True)
                if cls == "other" and not alt:
                    alt = par[:idx]
                elif cls == "taxonomic":
                    taxonomic = par[:idx]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + par[:idx]
                    else:
                        qualifier = par[:idx]
                else:
                    break
                par = par[idx + 1 :].strip()

            # Check for certain comma-separated tags combined
            # with English text at the beginning or end of a
            # comma-separated parenthesized list
            lst = par.split(",") if len(par) > 1 else [par]
            lst = list(x.strip() for x in lst if x.strip())
            while len(lst) > 1:
                cls = classify_desc(lst[0], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[0]
                    else:
                        qualifier = lst[0]
                    lst = lst[1:]
                    continue
                cls = classify_desc(lst[-1], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[-1]
                    else:
                        qualifier = lst[-1]
                    lst = lst[:-1]
                    continue
                break
            par = ", ".join(lst)

            # Handle remaining types
            if not par:
                continue
            if re.search(script_chars_re, par):
                script_chars = True
                if classify_desc(par, no_unknown_starts=True) == "tags":
                    if base_qualifier:
                        base_qualifier += "; " + par
                    else:
                        base_qualifier = par
                    if qualifier:
                        qualifier += "; " + par
                    else:
                        qualifier = par
                else:
                    if base_sense:
                        base_sense += "; " + par
                    else:
                        base_sense = par
                    if sense:
                        sense += "; " + par
                    else:
                        sense = par
            elif par.endswith(" letter names"):
                if base_qualifier:
                    base_qualifier += "; " + par
                else:
                    base_qualifier = par
                if qualifier:
                    qualifier += "; " + par
                else:
                    qualifier = par
            else:
                cls = classify_desc(par)
                # print("classify_desc: {!r} -> {}".format(par, cls))
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + par
                    else:
                        qualifier = par
                elif cls == "english":
                    if nonfirst_par:
                        if english:
                            english += "; " + par
                        else:
                            english = par
                    else:
                        if sense:
                            sense += "; " + par
                        else:
                            sense = par
                elif cls == "romanization":
                    roman = par
                elif cls == "taxonomic":
                    taxonomic = par
                elif par.isdigit():
                    idx = int(par) - 1
                    if idx >= 0 and idx < len(pos_datas):
                        d = pos_datas[idx]
                        gl = "; ".join(d.get("glosses", ()))
                        if not gl:
                            wxr.wtp.debug(
                                "parenthesized number "
                                "but the referenced sense has no "
                                "gloss: {}".format(par),
                                sortid="linkages/665",
                            )
                        elif sense:
                            sense += "; " + gl
                        else:
                            sense = gl
                    else:
                        wxr.wtp.debug(
                            "parenthesized number but there is "
                            "no sense with such index: {}".format(par),
                            sortid="linkages/674",
                        )
                else:
                    if alt:
                        alt += "; " + par
                    else:
                        alt = par

        # Handle certain special cases, unless we are parsing
        # script characters.
        if not script_chars:
            # Ignore all linkages with certain prefixes, suffixes, or parts
            # (this is done after removing certain prefixes and suffixes)
            if re.search(linkage_ignore_re, item1):
                continue  # Ignore linkages with certain prefixes

            # Remove certain prefixes from linkages
            m = re.match(linkage_remove_prefixes_re, item1)
            if m:
                prefix = item1[: m.end()]
                item1 = item1[m.end() :]
                if prefix in linkage_remove_prefixes_tags:
                    if qualifier:
                        qualifier += ", " + linkage_remove_prefixes_tags[prefix]
                    else:
                        qualifier = linkage_remove_prefixes_tags[prefix]
                # Recheck ignored linkages
                if re.search(linkage_ignore_re, item1):
                    continue

            # Remove certain suffixes from linkages
            m = re.search(linkage_remove_suffixes_re, item1)
            if m:
                item1 = item1[: m.start()]

            # Parse linkages with "value = english" syntax (e.g.,
            # väittää/Finnish)
            idx = item1.find(" = ")
            if idx >= 0:
                eng = item1[idx + 3 :]
                if classify_desc(eng, no_unknown_starts=True) == "english":
                    english = eng
                    item1 = item1[:idx]
                else:
                    # Some places seem to use it reversed:
                    # "english = value"
                    eng = item1[:idx]
                    if classify_desc(eng, no_unknown_starts=True) == "english":
                        english = eng
                        item1 = item1[idx + 3 :]

            # Parse linkages with "value - english" syntax (e.g.,
            # man/Faroese)
            m = re.search(r" [-‐‑‒–—―] ", item1)
            if m and "(" not in item1:
                suffix = item1[m.end() :]
                cls = classify_desc(suffix, no_unknown_starts=True)
                if cls == "english":
                    # This case intentionally ignores old values from english
                    # (otherwise taxonomic lists fail)
                    english = suffix
                    item1 = item1[: m.start()]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + suffix
                    else:
                        qualifier = suffix
                    item1 = item1[: m.start()]

        # Parse certain tags at the end of the linked term (unless
        # we are in a letters list)
        item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
        if q:
            if qualifier:
                qualifier += ", " + ", ".join(q)
            else:
                qualifier = ", ".join(q)

        m = re.search(linkage_truncate_re, item1)
        if m:
            # suffix = item1[m.start():]  # Currently ignored
            item1 = item1[: m.start()]
        if not item1:
            continue  # Ignore empty link targets
        if item1 == word:
            continue  # Ignore self-links

        def add(w: str, r: Optional[str]) -> None:
            assert isinstance(w, str)
            assert r is None or isinstance(r, str)
            nonlocal alt
            nonlocal taxonomic

            # We remove "*" from the beginning of reconstruction linkages.
            # Such linkages should only occur in reconstruction senses, so
            # this should not cause ambiguity.
            if is_reconstruction and w.startswith("*"):
                w = w[1:]

            # Check if the word contains the Fullwidth Solidus, and if
            # so, split by it and treat the results as alternative
            # linkages. (This is very commonly used for alternative
            # written forms in Chinese compounds and other linkages.)
            # However, if the word contains a comma, then we won't
            # split, as this is used when we have a different number
            # of romanizations than written forms, and don't know
            # which is which.
            if (
                (not w or "," not in w)
                and (not r or "," not in r)
                and not wxr.wtp.page_exists(w)
            ):
                lst = w.split("／") if len(w) > 1 else [w]
                if len(lst) == 1:
                    lst = w.split(" / ")
                if len(lst) == 1 and len(lst[0]) >= 6:
                    lst = w.split("/")
                if len(lst) > 1:
                    # Treat each alternative as a separate linkage
                    for w in lst:
                        add(w, r)
                    return None

            # Heuristically remove "." at the end of most linkages
            # (some linkage lists end in a period, but we also have
            # abbreviations that end with a period that should be kept)
            if (
                w.endswith(".")
                and not wxr.wtp.page_exists(w)
                and (
                    wxr.wtp.page_exists(w[:-1])
                    or (len(w) >= 5)
                    and "." not in w[:-1]
                )
            ):
                w = w[:-1]

            # If we have roman but not alt and the word is ASCII,
            # move roman to alt.
            if r and not alt and w.isascii():
                alt = r
                r = None
            # Add the linkage
            dt: LinkageData = {}
            if qualifier:
                parse_sense_qualifier(wxr, qualifier, dt)
            if sense:
                dt["sense"] = sense.strip()
            if r:
                dt["roman"] = r.strip()
            if ruby:
                dt["ruby"] = ruby
            if english:
                dt["english"] = english.strip()
            if taxonomic:
                if re.match(r"×[A-Z]", taxonomic):
                    data_append(dt, "tags", "extinct")
                    taxonomic = taxonomic[1:]
                dt["taxonomic"] = taxonomic
            if re.match(r"×[A-Z]", w):
                data_append(dt, "tags", "extinct")
                w = w[1:]  # Remove × before dead species names
            if alt and re.match(r"×[A-Z]", alt):
                data_append(dt, "tags", "extinct")
                alt = alt[1:]  # Remove × before dead species names
            if alt and alt.strip() != w:
                dt["alt"] = alt.strip()
            if urls:
                dt["urls"] = [
                    url.strip() for url in urls if url and isinstance(url, str)
                ]
            dt["word"] = w
            for old in data.get(field, ()):  # type: ignore[attr-defined]
                if dt == old:
                    break
            else:
                data_append(data, field, dt)

        # Handle exceptional linkage splits and other linkage
        # conversions (including expanding to variant forms)
        if item1 in linkage_split_exceptions:
            for item2 in linkage_split_exceptions[item1]:
                add(item2, roman)
            continue

        # Various templates for letters in scripts use spaces as
        # separators and also have multiple characters without
        # spaces consecutively.
        v = sense or qualifier
        # print("lang={} v={} script_chars={} item1={!r}"
        #       .format(wxr.wtp.section, v, script_chars, item1))
        if v and script_chars:
            if (
                len(item1.split()) > 1
                or len(list(re.finditer(unicode_dc_re, item1))) == 2
                or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
            ):
                if v == qualifier:
                    # if sense:
                    #     sense += "; " + qualifier
                    # else:
                    #     sense = qualifier
                    qualifier = None
                if re.search(r" (letters|digits|script)$", v):
                    qualifier = v  # Also parse as qualifier
                elif re.search(
                    r"Variations of letter |"
                    r"Letters using |"
                    r"Letters of the ",
                    v,
                ):
                    qualifier = "letter"
                parts = item1.split(". ")
                extra: Sequence[str] = ()
                if len(parts) > 1:
                    extra = parts[1:]
                    item1 = parts[0]
                # Handle multi-character names for chars in language's
                # alphabet, e.g., "Ny ny" in P/Hungarian.
                if (
                    len(subitems) > 20
                    and len(item1.split()) == 2
                    and all(len(x) <= 3 for x in item1.split())
                ):
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                else:
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]?", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                for e in extra:
                    idx = e.find(":")
                    if idx >= 0:
                        e = e[idx + 1 :].strip()
                    if e.endswith("."):
                        e = e[:-1]
                    parts.extend(e.split())

                # XXX this is not correct - see P/Vietnamese
                # While some sequences have multiple consecutive
                # characters, others use pairs and some have
                # 2/3 character names, e.g., "Ng ng".

                rparts: Optional[list[Optional[str]]] = None
                if roman:
                    rparts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]", roman)
                        if not m.group(0).isspace()
                    )
                    if len(rparts) != len(parts):
                        rparts = None
                if not rparts:
                    rparts = [None] * len(parts)

                for w, r in zip(parts, rparts):
                    add(w, r)
                continue

        add(item1, roman)
    return None
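
# A rough usage sketch (hypothetical arguments; in practice wxr, pos_datas,
# etc. come from the page being parsed):
#
#     data: WordData = {}
#     ret = parse_linkage_item_text(
#         wxr, "cat", data, "synonyms",
#         "(archaic): grimalkin, mouser", None, [], [], False,
#     )
#     # data["synonyms"] would then contain entries such as
#     # {"word": "grimalkin", ...} and {"word": "mouser", ...}; the exact
#     # tags/sense fields depend on how classify_desc labels "archaic".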


def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
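    # Collects alternative forms from this section: list items that are
    # link/alt templates are handled by extract_l_template(), while plain
    # wikilinks (e.g. [[colour]]) become forms tagged "alternative".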
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if isinstance(node, TemplateNode) and node.template_name in [
                    "l",
                    "link",
                    "L",
                    "alt",
                    "alter",
                ]:
                    extract_l_template(wxr, word_entry, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word != "":
                        form: FormData = {"form": word, "tags": ["alternative"]}
                        data_append(word_entry, "forms", form)


def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
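    # The template is expanded to HTML and scanned span by span: a <span>
    # whose lang attribute equals the template's language code is a form, a
    # following "xx-Latn" span is that form's romanization, and a
    # "label-content" span that decodes as tags adds those tags to all
    # collected forms.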
    forms: list[FormData] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form: FormData = {"form": word, "tags": ["alternative"]}
                forms.append(form)
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            roman = clean_node(wxr, None, span_tag)
            if roman != "":
                forms[-1]["roman"] = roman
        elif "label-content" in span_class and len(forms) > 0:
            tag_text = clean_node(wxr, None, span_tag)
            if classify_desc(tag_text) == "tags":
                tagsets1, _ = decode_tags(tag_text)
                tags: list[str] = []
                for ts in tagsets1:
                    tags.extend(ts)
                for form in forms:
                    form["tags"].extend(tags)
    data_extend(word_entry, "forms", forms)