Coverage for src/wiktextract/extractor/en/linkages.py: 72%
758 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-29 08:54 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-05-29 08:54 +0000
1# Code related to parsing linkages (synonyms, hypernyms, related terms, etc)
2#
3# Copyright (c) 2019-2021 Tatu Ylonen. See file LICENSE and https://ylonen.org
5import re
6import unicodedata
7from typing import Optional, Sequence
9from wikitextprocessor import (
10 LevelNode,
11 NodeKind,
12 TemplateNode,
13 WikiNode,
14)
15from wikitextprocessor.core import TemplateArgs
16from wikitextprocessor.parser import (
17 HTMLNode,
18 is_list,
19 is_list_item,
20)
22from ...datautils import (
23 data_append,
24 data_extend,
25 ns_title_prefix_tuple,
26 split_at_comma_semi,
27)
28from ...page import clean_node, is_panel_template
29from ...tags import linkage_beginning_tags, valid_tags
30from ...wxr_context import WiktextractContext
31from ..ruby import extract_ruby, parse_ruby # noqa: F401
32from .form_descriptions import (
33 classify_desc,
34 decode_tags,
35 head_final_bantu_langs,
36 head_final_bantu_re,
37 head_final_numeric_langs,
38 head_final_other_langs,
39 head_final_other_re,
40 head_final_re,
41 parse_head_final_tags,
42 parse_sense_qualifier,
43)
44from .section_titles import TRANSLATIONS_TITLE
45from .type_utils import FormData, LinkageData, SenseData, WordData
# A linkage item is dropped, before any splitting is attempted, when it
# starts with one of the literal strings listed here.
linkage_pre_split_ignore_re = re.compile(
    r"^({})".format(
        "|".join(
            map(
                re.escape,
                (
                    "For more variations, see ",
                    "Signal flag:",
                    "Semaphore:",
                ),
            )
        )
    )
)
# A linkage is ignored when it starts with any of these strings.
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "see also Appendix:",
    "see also Thesaurus:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# A linkage is ignored when it ends with any of these strings.
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# A linkage is ignored when it is exactly equal to one of these strings.
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# Single regexp combining the three lists above: whole-string matches,
# then anchored prefixes, then anchored suffixes.
linkage_ignore_re = re.compile(
    "^({whole})$|^({prefixes})|({suffixes})$".format(
        whole="|".join(map(re.escape, linkage_ignore_whole)),
        prefixes="|".join(map(re.escape, linkage_ignore_prefixes)),
        suffixes="|".join(map(re.escape, linkage_ignore_suffixes)),
    )
)
# Prefixes that are stripped from a linkage while keeping the remainder.
# Each linkage in a list is checked on its own.  The alternatives are tried
# in the order given, so longer "see also Thesaurus:" forms come before the
# shorter "see also " / "see " forms.
linkage_remove_prefixes_re = re.compile(
    r"^({})".format(
        "|".join(
            map(
                re.escape,
                (
                    ":",
                    "see Thesaurus:",
                    "See Thesaurus:",
                    "see also Thesaurus:",
                    "See also Thesaurus:",
                    "see also ",
                    "See also ",
                    "see ",
                    "See ",
                    "from ",
                    "abbreviation of ",
                    "ISO 639-1 code ",
                    "ISO 639-3 code ",
                    "Thesaurus:",
                ),
            )
        )
    )
)

# Maps a removed prefix to a space-separated list of tags that should be
# added to the linkage when that prefix is stripped.
linkage_remove_prefixes_tags = dict(
    [
        ("abbreviation of ", "abbreviation"),
    ]
)
# Suffixes that are stripped from a linkage while keeping the remainder.
# Each linkage in a list is checked on its own.
linkage_remove_suffixes_re = re.compile(
    r"("
    r"\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?"
    r"|\s*[-–] Pre-reform orthography.*"
    r")$"
)

# A parenthesized section of a linkage is ignored when it contains one of
# these phrases (starting at a word boundary and followed by a comma, a
# space, or the end of the string).
linkage_paren_ignore_contains_re = re.compile(
    r"\b({})([, ]|$)".format(
        "|".join(
            map(
                re.escape,
                (
                    "from Etymology",
                    "used as",
                    "usage notes",
                ),
            )
        )
    )
)
# Maps a taxonomic rank name, as it may appear at the end of a hyponym list,
# to its singular form.  Plural keys are listed here; the singular forms are
# added as identity mappings right below.
taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
# Iterate over a snapshot because the dictionary grows inside the loop.
for k, v in tuple(taxonomic_ending_map.items()):
    taxonomic_ending_map[v] = v  # singular -> singular
# Matches "<whitespace><dash-like char><whitespace><rank>" at the end of an
# item; group 1 is the rank name (a key of taxonomic_ending_map).
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+("
    + "|".join(map(re.escape, taxonomic_ending_map))
    + r")$"
)
# Hand-written splits for specific linkage strings that the generic
# splitting code does not handle correctly.  The same table can be used to
# create automatic aliases: both "..." and "…" expand to the pair.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}

# A linkage word is truncated where any of these literal strings is found.
linkage_truncate_re = re.compile(
    "|".join(
        map(
            re.escape,
            (
                " and its derived terms",
                " UTF-16 0x214C",
            ),
        )
    )
)
# Regexp for identifying special linkages containing lists of letters, digits,
# or characters.
#
# The heading "Punctuation" used to be listed only under the misspelling
# "Puctuation", so correctly spelled headings never matched.  ``Pun?ctuation``
# accepts both spellings, which keeps every string that matched before.
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Pun?ctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)
# Matches one Unicode character together with all combining diacritics that
# directly follow it (even when they are stored as separate code points), or
# otherwise any single character.  Combining diacritics are the code points
# whose Unicode general category is "Mn".
#
# The character class is followed by "+" so that a base character carrying
# several stacked marks is returned as one match; with a single-mark class
# the second and later marks were split off as stand-alone matches.
unicode_dc_re = re.compile(
    r"\w[{}]+|.".format(
        "".join(
            chr(x)
            for x in range(0, 0x110000)
            if unicodedata.category(chr(x)) == "Mn"
        )
    )
)
def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
    """Collect alternative forms from the list items of ``level_node``.

    Every direct child of every list item is inspected.  Link-style templates
    (``l``, ``link``, ``L``, ``alt``, ``alter``) are handed to
    ``extract_l_template()``.  Plain wiki links are cleaned to text and, when
    non-empty, appended to ``word_entry["forms"]`` with the tag
    ``"alternative"``.  All other children are ignored.
    """
    link_template_names = ("l", "link", "L", "alt", "alter")
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for child in list_item.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name in link_template_names
                ):
                    extract_l_template(wxr, word_entry, child)
                    continue
                if not (
                    isinstance(child, WikiNode)
                    and child.kind == NodeKind.LINK
                ):
                    continue
                link_text = clean_node(wxr, None, child)
                if link_text == "":
                    continue
                alt_form: FormData = {
                    "form": link_text,
                    "tags": ["alternative"],
                }
                data_append(word_entry, "forms", alt_form)
def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
    """Extract alternative forms from one expanded link-style template.

    The template is expanded and its ``<span>`` elements are scanned in
    order:

    * a span whose ``lang`` attribute equals the template's first positional
      argument yields a new form tagged ``"alternative"``;
    * a span whose ``lang`` ends in ``-Latn`` sets ``roman`` on the most
      recently collected form;
    * a span whose ``class`` contains ``label-content`` and whose text
      classifies as tags adds those tags to every form collected so far.

    The collected forms are finally appended to ``word_entry["forms"]``.
    """
    collected: list[FormData] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span in expanded.find_html("span"):
        lang_attr = span.attrs.get("lang", "")
        class_attr = span.attrs.get("class", "")
        if lang_attr == lang_code:
            text = clean_node(wxr, None, span)
            if text != "":
                collected.append({"form": text, "tags": ["alternative"]})
            continue
        # The remaining span kinds only decorate already collected forms.
        if len(collected) == 0:
            continue
        if lang_attr.endswith("-Latn"):
            romanization = clean_node(wxr, None, span)
            if romanization != "":
                collected[-1]["roman"] = romanization
        elif "label-content" in class_attr:
            label = clean_node(wxr, None, span)
            if classify_desc(label) == "tags":
                tagsets, _ = decode_tags(label)
                label_tags: list[str] = [
                    tag for tagset in tagsets for tag in tagset
                ]
                for entry in collected:
                    entry["tags"].extend(label_tags)
    data_extend(word_entry, "forms", collected)
# Maps a raw tag string found in the expanded zh-dial table to the list of
# tags that replaces it.  Each value is its own single-element list.
ZH_DIAL_TAGS = {
    raw_tag: [tag]
    for raw_tag, tag in (
        ("Classical Chinese", "Classical-Chinese"),
        ("Formal", "formal"),
        ("Written Standard Chinese", "Written-vernacular-Chinese"),
        ("Northeastern Mandarin", "Northeastern-Mandarin"),
        ("Jilu Mandarin", "Jilu-Mandarin"),
        ("Jiaoliao Mandarin", "Jiaoliao-Mandarin"),
        ("Central Plains Mandarin", "Central-Plains-Mandarin"),
        ("Lanyin Mandarin", "Lanyin-Mandarin"),
        ("Southwestern Mandarin", "Southwestern-Mandarin"),
        ("Jianghuai Mandarin", "Jianghuai-Mandarin"),
        ("Northern Min", "Min-Bei"),
        ("Eastern Min", "Min-Dong"),
        ("Southern Min", "Min-Nan"),
        ("Zhongshan Min", "Zhongshan-Min"),
        ("Southern Pinghua", "Southern-Pinghua"),
        ("Puxian Min", "Puxian-Min"),
    )
}
def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordData,
    t_node: TemplateNode,
    sense: str,
):
    """Extract linkages from the table produced by a ``zh-dial`` template.

    The template is expanded and every table in the result is read twice:
    first to collect the "symbol - meaning" legend from the cells that follow
    a header cell reading ``Note``, then to collect the words themselves.
    A word is any non-empty ``<span lang="zh">`` whose text differs from the
    page title; it receives ``sense`` (when non-empty) plus the raw tags of
    the current row header and of the last link seen in its cell.  Small
    ``font-size:60%`` spans attach note tags to the preceding word.

    Raw tags found in ``ZH_DIAL_TAGS`` or ``valid_tags`` are moved to
    ``tags``; the remainder stays in ``raw_tags``.  All collected linkages
    are appended to ``word_entry["synonyms"]``.
    """
    # https://en.wiktionary.org/wiki/Template:zh-dial
    from .pronunciation import split_zh_pron_raw_tag

    linkages: list[LinkageData] = []
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded.find_child_recursively(NodeKind.TABLE):
        # Pass 1: build the legend that maps note symbols to their meaning.
        in_note_row = False
        note_meanings: dict[str, str] = {}
        for row in table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    in_note_row = clean_node(wxr, None, cell) == "Note"
                    continue
                if not in_note_row:
                    continue
                for chunk in clean_node(wxr, None, cell).split(";"):
                    symbol, dash, meaning = chunk.partition("-")
                    if not dash:
                        continue
                    symbol = symbol.strip()
                    meaning = meaning.strip()
                    if symbol != "" and meaning != "":
                        note_meanings[symbol] = meaning

        # Pass 2: collect the words.  The header and region tags carry over
        # from one row/cell to the next until they are replaced.
        lang_tags = []
        region_tags = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            if not row.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header)
                )
            if lang_tags == ["Note"]:  # skip last note row
                continue
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for link in cell.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link)
                    )
                for span in cell.find_html("span"):
                    text = clean_node(wxr, None, span)
                    if text == "":
                        continue
                    is_word = (
                        span.attrs.get("lang", "") == "zh"
                        and text != wxr.wtp.title
                    )
                    if is_word:
                        entry: LinkageData = {"word": text}
                        if sense != "":
                            entry["sense"] = sense
                        if len(lang_tags) > 0:
                            data_extend(entry, "raw_tags", lang_tags)
                        if len(region_tags) > 0:
                            data_extend(entry, "raw_tags", region_tags)
                        linkages.append(entry)
                    elif (
                        span.attrs.get("style", "") == "font-size:60%"
                        and len(linkages) > 0
                    ):
                        for symbol in text.split(","):
                            symbol = symbol.strip()
                            # Unknown symbols are kept verbatim.
                            raw_tag = note_meanings.get(symbol, symbol)
                            if raw_tag != "":
                                data_append(
                                    linkages[-1], "raw_tags", raw_tag
                                )

    # Promote recognized raw tags to real tags and keep the rest.
    for entry in linkages:
        unresolved = []
        for raw_tag in entry.get("raw_tags", []):
            if raw_tag in ZH_DIAL_TAGS:
                data_extend(entry, "tags", ZH_DIAL_TAGS[raw_tag])
            elif raw_tag in valid_tags:
                data_append(entry, "tags", raw_tag)
            else:
                unresolved.append(raw_tag)
        if len(unresolved) > 0:
            entry["raw_tags"] = unresolved
        elif "raw_tags" in entry:
            del entry["raw_tags"]
    data_extend(word_entry, "synonyms", linkages)
def parse_linkage(
    wxr: WiktextractContext,
    data: WordData,
    field: str,
    linkagenode: LevelNode,
    word: str,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
) -> None:
    """Parse one linkage section and store its linkages in ``data[field]``.

    Does nothing when ``wxr.config.capture_linkages`` is false.  ``zh-dial``
    templates that are direct children of ``linkagenode`` are handled by
    ``extract_zh_dial_template()``; all other children are converted back to
    wikitext, re-parsed with templates expanded (panel templates and
    ``table:``/``list:`` templates expand to nothing) and walked with
    ``parse_linkage_recurse()``.

    If that produced nothing under ``field``, no panel template was seen,
    and the text outside list items is a single line with more than three
    commas that does not start with "See ", that text is parsed as one
    linkage item.
    """
    assert isinstance(data, dict)
    assert isinstance(field, str)
    assert isinstance(linkagenode, LevelNode)
    if not wxr.config.capture_linkages:
        return
    have_panel_template = False

    def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
        nonlocal have_panel_template
        if is_panel_template(wxr, name):
            have_panel_template = True
            return ""
        # Ignore auto-filled templates like Template:table:Solar System/en
        return "" if name.startswith(("table:", "list:")) else None

    # Main body of parse_linkage()
    kept_nodes: list[str | WikiNode] = []
    # Text of the most recent {{s}}/{{sense}} template seen in a list item;
    # passed on to any zh-dial template that follows.
    latest_sense = ""
    for child in linkagenode.children:
        if isinstance(child, TemplateNode) and child.template_name == "zh-dial":
            extract_zh_dial_template(wxr, data, child, latest_sense)
            continue
        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                for t_node in list_item.find_child(NodeKind.TEMPLATE):
                    if t_node.template_name in ("s", "sense"):
                        latest_sense = clean_node(wxr, None, t_node).strip(
                            "(): "
                        )
        kept_nodes.append(child)

    wikitext = wxr.wtp.node_to_wikitext(kept_nodes)
    parsed = wxr.wtp.parse(
        wikitext, expand_all=True, template_fn=linkage_template_fn1
    )

    text_outside_list_items = parse_linkage_recurse(
        wxr,
        parsed.children,
        field,
        None,
        None,
        word,
        data,
        sense_datas,
        is_reconstruction,
    )

    # Fallback for sections that list their linkages as plain
    # comma-separated text instead of bullet points.
    if data.get(field) or have_panel_template:
        return
    loose_text = "".join(text_outside_list_items).strip()
    if "\n" in loose_text or loose_text.count(",") <= 3:
        return
    if loose_text.startswith("See "):
        return
    parse_linkage_item(
        wxr,
        [loose_text],
        field,
        word,
        data,
        sense_datas,
        is_reconstruction,
        None,
    )
def parse_linkage_recurse(
    wxr: WiktextractContext,
    contents: list[WikiNode | str],
    field: str,
    sense: str | None,
    block_header_sense: str | None,
    word: str,
    data,
    sense_datas,
    is_reconstruction,
) -> list[str]:
    """Walk ``contents`` and parse every list item / table cell as a linkage.

    Lists, tables, table rows, most HTML elements, level nodes and the last
    argument of links are descended into; list items and table cells are
    handed to ``parse_linkage_item()``.  Table captions and header cells,
    preformatted, bold and italic nodes, and ``gallery``/``ref``/``cite``/
    ``caption`` HTML elements are skipped.

    ``sense`` is the sense applied to items found below.  An HTML element
    with class ``qualifier-content`` replaces it for its own children.
    ``block_header_sense`` is a pending sense taken from a preceding item or
    ``list-switcher-header`` element; a ``NavFrame``/``term-list`` element
    uses it when ``sense`` is empty and then clears it.

    Returns the strings that were found outside of any list item, in
    document order.
    """
    assert isinstance(contents, (list, tuple))
    assert sense is None or isinstance(sense, str)

    # Return value
    loose_text: list[str] = []

    def descend(children, sub_sense: str | None) -> None:
        # Recurse with the *current* block_header_sense and collect the
        # loose text found below.
        loose_text.extend(
            parse_linkage_recurse(
                wxr,
                children,
                field,
                sub_sense,
                block_header_sense,
                word,
                data,
                sense_datas,
                is_reconstruction,
            )
        )

    for node in contents:
        if isinstance(node, str):
            # Top-level text is generally a comment before the linkages
            # list and is not parsed here.  It is returned so the caller
            # can fall back to it when no linkages are found (not all
            # words use bullet points for linkages).
            loose_text.append(node)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        if is_list(node) or kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
            descend(node.children, sense)
        elif is_list_item(node) or kind == NodeKind.TABLE_CELL:
            item_result = parse_linkage_item(
                wxr,
                node.children,
                field,
                word,
                data,
                sense_datas,
                is_reconstruction,
                sense,
            )
            if item_result is not None:
                # parse_linkage_item() can return a value that should
                # be used as the sense for the follow-on linkages,
                # which are typically provided in a table (see 滿)
                block_header_sense = "".join(item_result)
        elif kind in (
            NodeKind.TABLE_CAPTION,
            NodeKind.TABLE_HEADER_CELL,
            NodeKind.PREFORMATTED,
            NodeKind.BOLD,
        ):
            # Let's still ignore table extra stuff
            continue
        elif isinstance(node, HTMLNode):
            # Recurse to process inside the HTML for most tags
            if node.sarg in ("gallery", "ref", "cite", "caption"):
                continue
            css_classes = (
                (node.attrs.get("class") or "").replace("+", " ").split()
            )
            if "qualifier-content" in css_classes:
                qualifier = clean_node(wxr, None, node.children)
                if qualifier.endswith(":"):
                    qualifier = qualifier[:-1].strip()
                if sense and qualifier:
                    wxr.wtp.debug(
                        "linkage qualifier-content on multiple "
                        "levels: {!r} and {!r}".format(sense, qualifier),
                        sortid="page/2170",
                    )
                descend(node.children, qualifier)
            elif "list-switcher-header" in css_classes:
                block_header_sense = clean_node(wxr, None, node.children)
                if block_header_sense.endswith(":"):
                    block_header_sense = block_header_sense[:-1].strip()
            elif "NavFrame" in css_classes or "term-list" in css_classes:
                # NavFrame uses previously assigned block_header_sense
                # (from a "(sense):" item) and clears it afterwards
                descend(node.children, sense or block_header_sense)
                block_header_sense = None
            else:
                descend(node.children, sense)
        elif isinstance(node, LevelNode):
            # Just recurse to any possible subsections
            descend(node.children, sense)
        elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
            # Skip these on top level; at least sometimes bold is
            # used for indicating a subtitle
            continue
        elif kind == NodeKind.LINK:
            # Recurse into the last argument
            descend(node.largs[-1], sense)
        else:
            wxr.wtp.debug(
                "parse_linkage_recurse unhandled {}: {}".format(kind, node),
                sortid="page/2196",
            )

    return loose_text
def parse_linkage_item(
    wxr: WiktextractContext,
    contents: list[str | WikiNode],
    field: str,
    word: str,
    data: WordData,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
    sense: str | None = None,
) -> list[str]:
    """Parse the contents of one list item (or table cell) as linkages.

    The nodes in ``contents`` are flattened into text parts.  Nested lists,
    list items and tables are parsed recursively; text collected before such
    a nested structure is treated as its sense when it ends with ``:`` or is
    wrapped in parentheses.  Unless the item turned out to be such a sense
    header, the collected text is passed to ``parse_linkage_item_text()``
    together with any ruby, URLs and link texts gathered on the way.

    Returns a one-element list holding the value returned by
    ``parse_linkage_item_text()`` when that value is truthy, and an empty
    list otherwise.

    Link and URL nodes whose argument lists are empty are skipped instead of
    raising ``IndexError``.
    """
    assert isinstance(contents, (list, tuple))
    assert isinstance(field, str)
    assert sense is None or isinstance(sense, str)

    parts: list[str] = []
    ruby: list[tuple[str, str]] = []
    urls: list[str] = []
    # data about link text; this is used to skip splitting on
    # linkage text items that contain stuff like commas; for
    # example "Hunde, die bellen, beißen nicht" in article
    # beißen is split into "Hunde", "die bellen" etc.
    # We take that link text and use it, eventually,
    # in split_at_comma_semi to skip splitting on those
    # commas.
    links_that_should_not_be_split: list[str] = []

    nested_kinds = (
        NodeKind.TABLE,
        NodeKind.TABLE_ROW,
        NodeKind.TABLE_CELL,
    )

    def item_recurse(
        contents: list[str | WikiNode], possible_sense: str | None = None
    ) -> bool:
        """Flatten ``contents`` into ``parts``; return whether the text
        collected before the last nested structure was a sense header."""
        assert isinstance(contents, (list, tuple))
        nonlocal parts
        is_sense = False
        for node in contents:
            if isinstance(node, str):
                parts.append(node)
                continue
            kind = node.kind

            #### parts into possible_sense
            # NOTE(review): ``and`` binds tighter than ``or``, so the
            # ``parts`` check only applies to the table kinds; lists and
            # list items enter this branch even when ``parts`` is empty
            # (which resets ``is_sense``).  The original evaluation order
            # is kept; confirm whether that was intended.
            if (
                is_list_item(node)
                or is_list(node)
                or (kind in nested_kinds and parts)
            ):
                candidate_sense: str | None
                candidate_sense = clean_node(wxr, None, parts)
                is_sense = False

                if candidate_sense.endswith(":"):
                    is_sense = True
                    candidate_sense = candidate_sense[:-1].strip()
                if candidate_sense.startswith("(") and candidate_sense.endswith(
                    ")"
                ):
                    is_sense = True
                    candidate_sense = candidate_sense[1:-1].strip()
                if (
                    candidate_sense.lower() == TRANSLATIONS_TITLE
                    or not is_sense
                ):
                    candidate_sense = None
                if is_sense:
                    # The collected text was a header, not a linkage.
                    possible_sense = candidate_sense
                    parts = []

            # Handle nodes
            if is_list_item(node):
                parse_linkage_item(
                    wxr,
                    node.children,
                    field,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                    possible_sense or sense,
                )
            elif is_list(node) or kind in nested_kinds:
                parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    possible_sense or sense,
                    None,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CAPTION,
            ):
                continue
            elif kind == NodeKind.HTML:
                classes = (node.attrs.get("class") or "").split()
                if node.sarg in ("gallery", "ref", "cite", "caption"):
                    continue
                elif node.sarg == "ruby":
                    rb = parse_ruby(wxr, node)
                    if rb:
                        ruby.append(rb)
                        parts.append(rb[0])
                    continue
                elif node.sarg == "math":
                    parts.append(clean_node(wxr, None, node))
                    continue
                elif "interProject" in classes:
                    continue  # These do not seem to be displayed
                if "NavFrame" in classes:
                    parse_linkage_recurse(
                        wxr,
                        node.children,
                        field,
                        possible_sense or sense,
                        None,
                        word,
                        data,
                        sense_datas,
                        is_reconstruction,
                    )
                else:
                    item_recurse(node.children, possible_sense)
            elif kind == NodeKind.LINK:
                # A link without a (non-empty) target argument has nothing
                # to inspect; skip it rather than index into an empty list.
                target = node.largs[0] if node.largs else []
                if target and isinstance(target[0], str):
                    v1 = target[0].strip().lower()
                    ignore = v1.startswith(
                        ns_title_prefix_tuple(wxr, "Category", True)
                        + ns_title_prefix_tuple(wxr, "File", True)
                    )
                    if not ignore:
                        v = node.largs[-1]
                        # A leading ":" on a bare link target is dropped.
                        # startswith() keeps this safe for an empty string.
                        if (
                            len(node.largs) == 1
                            and len(v) > 0
                            and isinstance(v[0], str)
                            and v[0].startswith(":")
                        ):
                            v = [v[0][1:]] + list(v[1:])  # type:ignore
                        # The displayed text may be empty, so check ``v``
                        # before looking at its first element.
                        if v and isinstance(v[0], str) and not v[0].isalnum():
                            links_that_should_not_be_split.append(v[0])
                        item_recurse(v, possible_sense)
            elif kind == NodeKind.URL:
                if not node.largs:
                    continue  # no URL and no text to capture
                if len(node.largs) == 1:
                    # Naked url captured
                    urls.extend(node.largs[-1])  # type:ignore[arg-type]
                    continue
                if len(node.largs) == 2 and node.largs[0]:
                    # Url from link with text
                    urls.append(node.largs[0][-1])  # type:ignore[arg-type]
                item_recurse(node.largs[-1], possible_sense)
            elif kind in (
                NodeKind.PREFORMATTED,
                NodeKind.BOLD,
                NodeKind.ITALIC,
            ):
                item_recurse(node.children)
            else:
                wxr.wtp.debug(
                    "linkage item_recurse unhandled {}: {}".format(
                        node.kind, node
                    ),
                    sortid="page/2073",
                )

        return is_sense

    is_sense = item_recurse(contents)

    if not is_sense:
        item = clean_node(wxr, None, parts)
        if v := parse_linkage_item_text(
            wxr,
            word,
            data,
            field,
            item,
            sense,
            ruby,
            sense_datas,
            is_reconstruction,
            urls or None,
            links_that_should_not_be_split or None,
        ):
            return [v]

    return []
872def parse_linkage_item_text(
873 wxr: WiktextractContext,
874 word: str,
875 data: WordData,
876 field: str,
877 item: str,
878 sense: Optional[str],
879 ruby: list,
880 pos_datas: list,
881 is_reconstruction: bool,
882 urls: Optional[list[str]] = None,
883 links: Optional[list[str]] = None,
884) -> Optional[str]:
885 """Parses a linkage item once it has been converted to a string. This
886 may add one or more linkages to ``data`` under ``field``. This
887 returns None or a string that contains a sense that should be applied
888 to additional linkages (commonly used in tables for Asian characters)."""
889 assert isinstance(wxr, WiktextractContext)
890 assert isinstance(word, str) # Main word (derived from page title)
891 assert isinstance(data, dict) # Parsed linkages are stored here under field
892 assert isinstance(field, str) # The field under which to store linkage
893 assert isinstance(item, str) # The string to parse
894 assert sense is None or isinstance(sense, str)
895 assert isinstance(ruby, list) # Captured ruby (hiragana/katakana) or ""
896 assert isinstance(pos_datas, list) # List of senses (containing "glosses")
897 assert urls is None or isinstance(urls, list) # Captured urls
898 assert is_reconstruction in (True, False)
900 item = item.replace("()", "")
901 item = re.sub(r"\s+", " ", item)
902 item = item.strip()
904 base_roman = None
905 base_alt = None
906 base_english = None
907 script_chars = False
908 base_qualifier = None
909 lang = wxr.wtp.section
911 # If ``sense`` can be parsed as tags, treat it as tags instead
912 if sense:
913 cls = classify_desc(sense, no_unknown_starts=True)
914 if cls == "tags":
915 base_qualifier = sense
916 sense = None
918 # Check if this item is a stand-alone sense (or tag) specifier
919 # for following items (e.g., commonly in a table, see 滿)
920 m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
921 if m:
922 return m.group(1)
924 # Check for pre-split ignored linkages using the appropriate regexp
925 if re.search(linkage_pre_split_ignore_re, item):
926 return None
928 # print(" LINKAGE ITEM: {}: {} (sense {})"
929 # .format(field, item, sense))
931 # Replace occurrences of ~ in the item by the page title
932 safetitle = wxr.wtp.title.replace("\\", "\\\\") # type: ignore[union-attr]
933 item = item.replace(" ~ ", " " + safetitle + " ")
934 item = re.sub(r"^~ ", safetitle + " ", item)
935 item = re.sub(r" ~$", " " + safetitle, item)
937 # Many taxonomic terms contain hyponym lists that end with the
938 # kind of the hyponym (a taxonomic level in plural). Recognize
939 # such and add the term in singular to all linkages in the list.
940 m = re.search(taxonomic_ending_re, item)
941 if m:
942 base_english = taxonomic_ending_map[m.group(1)]
943 item = item[: m.start()]
945 # Some Korean and Japanese words use "word (romanized): english" pattern
946 # Sometimes the parenthesized part contains comma-separated alt and roman.
947 m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
948 if m:
949 rom = m.group(2)
950 eng = m.group(3)
951 rest = m.group(1)
952 if (
953 classify_desc(rest, no_unknown_starts=True) == "other"
954 and classify_desc(eng, no_unknown_starts=True) == "english"
955 ):
956 item = rest
957 base_roman = rom
958 lst = base_roman.split(", ")
959 if (
960 len(lst) == 2
961 and classify_desc(lst[0], no_unknown_starts=True) == "other"
962 ):
963 base_alt = lst[0]
964 base_roman = lst[1]
965 if base_english:
966 base_english += "; " + eng
967 else:
968 base_english = eng
970 # Many words have tags or similar descriptions in the beginning
971 # followed by a colon and one or more linkages (e.g.,
972 # panetella/Finnish)
973 m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
974 r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
975 item,
976 )
977 if m:
978 desc = m.group(1)
979 rest = m.group(len(m.groups()))
980 # Check for certain comma-separated tags combined
981 # with English text at the beginning or end of a
982 # comma-separated parenthesized list
983 lst = split_at_comma_semi(desc, skipped=links)
984 while len(lst) > 1:
985 # Check for tags at the beginning
986 cls = classify_desc(lst[0], no_unknown_starts=True)
987 if cls == "tags":
988 if base_qualifier:
989 base_qualifier += ", " + lst[0]
990 else:
991 base_qualifier = lst[0]
992 lst = lst[1:]
993 continue
994 # Check for tags at the end
995 cls = classify_desc(lst[-1], no_unknown_starts=True)
996 if cls == "tags":
997 if base_qualifier:
998 base_qualifier += ", " + lst[-1]
999 else:
1000 base_qualifier = lst[-1]
1001 lst = lst[:-1]
1002 continue
1003 break
1004 desc = ", ".join(lst)
1006 # Sometimes we have e.g. "chemistry (slang)" with are
1007 # both tags (see "stink"). Handle that case by
1008 # removing parentheses if the value is still tags. The part with
1009 # parentheses could be on either side of the colon.
1010 if "(" in desc:
1011 x = desc.replace("(", ",").replace(")", ",")
1012 if classify_desc(x, no_unknown_starts=True) == "tags":
1013 desc = x
1014 elif "(" in rest:
1015 x = rest.replace("(", ",").replace(")", ",")
1016 if classify_desc(x, no_unknown_starts=True) == "tags":
1017 rest = desc
1018 desc = x
1020 # See if the prefix should trigger special handling for script
1021 # character, letter, digit, etc. handling
1022 if re.search(script_chars_re, desc):
1023 script_chars = True
1025 # Try to determine which side is description and which is
1026 # the linked term (both orders are widely used in Wiktionary)
1027 cls = classify_desc(desc, no_unknown_starts=True)
1028 cls2 = classify_desc(rest, no_unknown_starts=True)
1029 # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
1030 # .format(desc, cls, rest, cls2))
1032 e1 = wxr.wtp.page_exists(desc)
1033 e2 = wxr.wtp.page_exists(rest)
1034 if cls != "tags":
1035 if (
1036 cls2 == "tags"
1037 or (e1 and not e1)
1038 or (
1039 e1
1040 and e2
1041 and cls2 == "english"
1042 and cls in ("other", "romanization")
1043 )
1044 or (
1045 not e1
1046 and not e2
1047 and cls2 == "english"
1048 and cls in ("other", "romanization")
1049 )
1050 ):
1051 desc, rest = rest, desc # Looks like swapped syntax
1052 cls = cls2
1053 if re.search(linkage_paren_ignore_contains_re, desc): 1053 ↛ 1054line 1053 didn't jump to line 1054 because the condition on line 1053 was never true
1054 desc = ""
1055 # print("linkage colon prefix desc={!r} rest={!r} cls={}"
1056 # .format(desc, rest, cls))
1058 # Handle the prefix according to its type
1059 if cls == "tags":
1060 if base_qualifier:
1061 base_qualifier += ", " + desc
1062 else:
1063 base_qualifier = desc
1064 item = rest
1065 elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
1066 if base_english: 1066 ↛ 1067line 1066 didn't jump to line 1067 because the condition on line 1066 was never true
1067 base_english += "; " + base_english
1068 else:
1069 base_english = desc
1070 item = rest
1071 elif cls in ("english", "taxonomic"):
1072 if sense: 1072 ↛ 1073line 1072 didn't jump to line 1073 because the condition on line 1072 was never true
1073 sense += "; " + desc
1074 else:
1075 sense = desc
1076 item = rest
1077 elif desc.isdigit():
1078 idx = int(desc) - 1
1079 if idx >= 0 and idx < len(pos_datas):
1080 d = pos_datas[idx]
1081 gl = "; ".join(d.get("glosses", ()))
1082 if not gl: 1082 ↛ 1083line 1082 didn't jump to line 1083 because the condition on line 1082 was never true
1083 wxr.wtp.debug(
1084 "parenthesized numeric linkage prefix, "
1085 "but the referenced sense has no gloss: "
1086 "{}".format(desc),
1087 sortid="linkages/355",
1088 )
1089 elif sense:
1090 sense += "; " + gl
1091 else:
1092 sense = gl
1093 item = rest
1094 else:
1095 wxr.wtp.debug(
1096 "parenthesized numeric linkage prefix, "
1097 "but there is no sense with such index: {}".format(desc),
1098 sortid="linkages/365",
1099 )
1100 item = rest
1101 else:
1102 wxr.wtp.debug(
1103 "unrecognized linkage prefix: {} desc={} rest={} "
1104 "cls={} cls2={} e1={} e2={}".format(
1105 item, desc, rest, cls, cls2, e1, e2
1106 ),
1107 sortid="linkages/371",
1108 )
1109 item = rest
1111 base_sense = sense
1113 # Check for certain plural tag forms at end of items list, and apply
1114 # them to all items if found
1115 m = re.search(
1116 r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
1117 r"characters|symbols|tetragrams|letter names|names|"
1118 r"female names|male names|proper nouns|contractions|"
1119 r"nonstandard spellings|verbs|prepositions|postpositions|"
1120 r"interjections|Abbreviations|abbreviations|variants|"
1121 r"ordinals|nouns|phrases|adjectives|adverbs|"
1122 r"augmentatives|pejoratives|compound words|numerals|"
1123 r"Tally marks|surnames|modern nonstandard spellings)$",
1124 item,
1125 )
1126 if m:
1127 suffix = m.group(1)
1128 if base_qualifier:
1129 base_qualifier += ", " + suffix
1130 else:
1131 base_qualifier = suffix
1132 item = item[: m.start()]
1134 # Certain linkage items have space-separated values. These are
1135 # generated by, e.g., certain templates
1136 if base_sense and base_sense.endswith(" paper sizes"):
1137 base_qualifier = None
1138 item = ", ".join(item.split())
1139 # XXX isn't this now handled by the generic digits/letters/etc code?
1140 # elif base_qualifier in ("Arabic digits",):
1141 # item = ", ".join(item.split())
1143 item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item) # Now empty superscript
1144 item = item.strip()
1145 if not item:
1146 return None
1148 # Kludge: if the item contains ")/" (with possibly spaces in between),
1149 # replace it by a comma so it gets split.
1150 item = re.sub(r"\)\s*/", "), ", item)
1152 # The item may contain multiple comma-separated linkages
1153 if base_roman:
1154 subitems = [item]
1155 else:
1156 # Split at commas. Also, in most cases split by " or ", but this
1157 # is complicated - "or" may end certain words (e.g., "logical or")
1158 # and it may separate head-final tags (e.g. "foo f or m"). Also,
1159 # some words have parenthesized parts in between, e.g.,
1160 # wife/English/Translations/Yiddish:
1161 # "ווײַב n (vayb) or f, פֿרוי f (froy)"
1162 subitems = []
1163 for item1 in split_at_comma_semi(item, skipped=links):
1164 if " or " not in item1:
1165 subitems.append(item1)
1166 continue
1167 # Item1 contains " or "
1168 item2 = re.sub(r"\s*\([^)]*\)", "", item1)
1169 item2 = re.sub(r"\s+", " ", item2)
1170 if (
1171 (
1172 lang not in head_final_bantu_langs
1173 or not re.search(head_final_bantu_re, item2)
1174 )
1175 and (
1176 lang not in head_final_other_langs
1177 or not re.search(head_final_other_re, item2)
1178 )
1179 and (
1180 not re.search(head_final_re, item2)
1181 or (
1182 item2[-1].isdigit()
1183 and lang not in head_final_numeric_langs
1184 )
1185 )
1186 and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
1187 and all(
1188 wxr.wtp.title not in x.split(" or ")
1189 for x in split_at_comma_semi(item2, skipped=links)
1190 if " or " in x
1191 )
1192 ):
1193 # We can split this item. Split the non-cleaned version
1194 # that still has any intervening parenthesized parts.
1195 subitems.extend(
1196 split_at_comma_semi(item1, extra=[" or "], skipped=links)
1197 )
1198 else:
1199 subitems.append(item1)
1200 if len(subitems) > 1: # Would be merged from multiple subitems
1201 ruby = [] # XXX what is the purpose of this?
1202 for item1 in subitems:
1203 if len(subitems) > 1 and item1 in ("...", "…"):
1204 # Some lists have ellipsis in the middle - don't generate
1205 # linkages for the ellipsis
1206 continue
1207 item1 = item1.strip()
1208 qualifier = base_qualifier
1209 sense = base_sense
1210 parts = []
1211 roman = base_roman # Usually None
1212 alt = base_alt # Usually None
1213 taxonomic = None
1214 english = base_english
1216 # Some words have derived terms with parenthesized quoted English
1217 # descriptions, which can sometimes essentially be tags
1218 # Some word (bleki/Esperanto...) can have parentheses inside
1219 # the quotes, so let's make this regex even more unreadable.
1220 m = re.search(r"\s*\(“([^”]+)”\)", item1)
1221 if m: 1221 ↛ 1222line 1221 didn't jump to line 1222 because the condition on line 1221 was never true
1222 t = m.group(1)
1223 item1 = (item1[: m.start()] + item1[m.end() :]).strip()
1224 cls = classify_desc(t)
1225 if cls == "tags":
1226 if qualifier:
1227 qualifier += ", " + t
1228 else:
1229 qualifier = t
1230 else:
1231 english = t
1233 # Some Korean words use the "word (alt, roman, “english”)" pattern
1234 # See 滿/Korean
1235 m = re.match(
1236 r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
1237 r'[“”"]([^”“"]+)[“”"]\)$',
1238 item1,
1239 )
1240 if (
1241 m
1242 and classify_desc(m.group(1), no_unknown_starts=True) == "other"
1243 and classify_desc(m.group(2), no_unknown_starts=True) == "other"
1244 ):
1245 alt = m.group(2)
1246 roman = m.group(3)
1247 english = m.group(4)
1248 item1 = m.group(1)
1250 words = item1.split(" ")
1251 if (
1252 len(words) > 1
1253 and words[0] in linkage_beginning_tags
1254 and words[0] != wxr.wtp.title
1255 ):
1256 t = linkage_beginning_tags[words[0]]
1257 item1 = " ".join(words[1:])
1258 if qualifier: 1258 ↛ 1259line 1258 didn't jump to line 1259 because the condition on line 1258 was never true
1259 qualifier += ", " + t
1260 else:
1261 qualifier = t
1263 # Extract quoted English translations (there are also other
1264 # kinds of English translations)
1265 def english_repl(m: re.Match) -> str:
1266 nonlocal english
1267 nonlocal qualifier
1268 v = m.group(1).strip()
1269 # If v is "tags: sense", handle the tags
1270 m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
1271 if m1 is not None: 1271 ↛ 1272line 1271 didn't jump to line 1272 because the condition on line 1271 was never true
1272 desc, rest = m1.groups()
1273 if classify_desc(desc, no_unknown_starts=True) == "tags":
1274 if qualifier:
1275 qualifier += ", " + desc
1276 else:
1277 qualifier = desc
1278 v = rest
1279 if english:
1280 english += "; " + v
1281 else:
1282 english = v
1283 return ""
1285 item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()
1287 # There could be multiple parenthesized parts, and
1288 # sometimes both at the beginning and at the end.
1289 # And sometimes even in the middle, as in e.g.
1290 # wife/English/Translations/Yiddish
1291 while not script_chars and (
1292 not sense or not re.search(script_chars_re, sense)
1293 ):
1294 par = None
1295 nonfirst_par = False
1296 if par is None: 1296 ↛ 1313line 1296 didn't jump to line 1313 because the condition on line 1296 was always true
1297 # Try to find a parenthesized part from the beginning.
1298 m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
1299 if m:
1300 par = m.group(1)
1301 item1 = item1[m.end() :]
1302 else:
1303 # Try to find a parenthesized part at the end or from the
1304 # middle.
1305 m = re.search(
1306 r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?",
1307 item1,
1308 )
1309 if m:
1310 par = m.group(1)
1311 item1 = item1[: m.start()] + item1[m.end() :]
1312 nonfirst_par = True
1313 if not par:
1314 break
1315 if re.search(linkage_paren_ignore_contains_re, par):
1316 continue # Skip these linkage descriptors
1317 par = par.strip()
1318 # Handle tags from beginning of par. We also handle "other"
1319 # here as Korean entries often have Hanja form in the
1320 # beginning of parenthesis, before romanization. Similar
1321 # for many Japanese entries.
1322 while par: 1322 ↛ 1343line 1322 didn't jump to line 1343 because the condition on line 1322 was always true
1323 idx = par.find(",")
1324 if idx <= 0:
1325 break
1326 cls = classify_desc(par[:idx], no_unknown_starts=True)
1327 if cls == "other" and not alt: 1327 ↛ 1328line 1327 didn't jump to line 1328 because the condition on line 1327 was never true
1328 alt = par[:idx]
1329 elif cls == "taxonomic": 1329 ↛ 1330line 1329 didn't jump to line 1330 because the condition on line 1329 was never true
1330 taxonomic = par[:idx]
1331 elif cls == "tags":
1332 if qualifier:
1333 qualifier += ", " + par[:idx]
1334 else:
1335 qualifier = par[:idx]
1336 else:
1337 break
1338 par = par[idx + 1 :].strip()
1340 # Check for certain comma-separated tags combined
1341 # with English text at the beginning or end of a
1342 # comma-separated parenthesized list
1343 lst = par.split(",") if len(par) > 1 else [par]
1344 lst = list(x.strip() for x in lst if x.strip())
1345 while len(lst) > 1:
1346 cls = classify_desc(lst[0], no_unknown_starts=True)
1347 if cls == "tags": 1347 ↛ 1348line 1347 didn't jump to line 1348 because the condition on line 1347 was never true
1348 if qualifier:
1349 qualifier += ", " + lst[0]
1350 else:
1351 qualifier = lst[0]
1352 lst = lst[1:]
1353 continue
1354 cls = classify_desc(lst[-1], no_unknown_starts=True)
1355 if cls == "tags":
1356 if qualifier:
1357 qualifier += ", " + lst[-1]
1358 else:
1359 qualifier = lst[-1]
1360 lst = lst[:-1]
1361 continue
1362 break
1363 par = ", ".join(lst)
1365 # Handle remaining types
1366 if not par: 1366 ↛ 1367line 1366 didn't jump to line 1367 because the condition on line 1366 was never true
1367 continue
1368 if re.search(script_chars_re, par):
1369 script_chars = True
1370 if classify_desc(par, no_unknown_starts=True) == "tags": 1370 ↛ 1380line 1370 didn't jump to line 1380 because the condition on line 1370 was always true
1371 if base_qualifier: 1371 ↛ 1372line 1371 didn't jump to line 1372 because the condition on line 1371 was never true
1372 base_qualifier += "; " + par
1373 else:
1374 base_qualifier = par
1375 if qualifier: 1375 ↛ 1376line 1375 didn't jump to line 1376 because the condition on line 1375 was never true
1376 qualifier += "; " + par
1377 else:
1378 qualifier = par
1379 else:
1380 if base_sense:
1381 base_sense += "; " + par
1382 else:
1383 base_sense = par
1384 if sense:
1385 sense += "; " + par
1386 else:
1387 sense = par
1388 elif par.endswith(" letter names"): 1388 ↛ 1389line 1388 didn't jump to line 1389 because the condition on line 1388 was never true
1389 if base_qualifier:
1390 base_qualifier += "; " + par
1391 else:
1392 base_qualifier = par
1393 if qualifier:
1394 qualifier += "; " + par
1395 else:
1396 qualifier = par
1397 else:
1398 cls = classify_desc(par)
1399 # print("classify_desc: {!r} -> {}".format(par, cls))
1400 if cls == "tags":
1401 if qualifier:
1402 qualifier += ", " + par
1403 else:
1404 qualifier = par
1405 elif cls == "english":
1406 if nonfirst_par:
1407 if english:
1408 english += "; " + par
1409 else:
1410 english = par
1411 else:
1412 if sense: 1412 ↛ 1413line 1412 didn't jump to line 1413 because the condition on line 1412 was never true
1413 sense += "; " + par
1414 else:
1415 sense = par
1416 elif cls == "romanization":
1417 roman = par
1418 elif cls == "taxonomic":
1419 taxonomic = par
1420 elif par.isdigit():
1421 idx = int(par) - 1
1422 if idx >= 0 and idx < len(pos_datas):
1423 d = pos_datas[idx]
1424 gl = "; ".join(d.get("glosses", ()))
1425 if not gl: 1425 ↛ 1426line 1425 didn't jump to line 1426 because the condition on line 1425 was never true
1426 wxr.wtp.debug(
1427 "parenthesized number "
1428 "but the referenced sense has no "
1429 "gloss: {}".format(par),
1430 sortid="linkages/665",
1431 )
1432 elif sense: 1432 ↛ 1435line 1432 didn't jump to line 1435 because the condition on line 1432 was always true
1433 sense += "; " + gl
1434 else:
1435 sense = gl
1436 else:
1437 wxr.wtp.debug(
1438 "parenthesized number but there is "
1439 "no sense with such index: {}".format(par),
1440 sortid="linkages/674",
1441 )
1442 else:
1443 if alt: 1443 ↛ 1444line 1443 didn't jump to line 1444 because the condition on line 1443 was never true
1444 alt += "; " + par
1445 else:
1446 alt = par
1448 # Handle certain special cases, unless we are parsing
1449 # script characters.
1450 if not script_chars:
1451 # Ignore all linkages with certain prefixes, suffixes, or parts
1452 # (this is done after removing certain prefixes and suffixes)
1453 if re.search(linkage_ignore_re, item1):
1454 continue # Ignore linkages with certain prefixes
1456 # Remove certain prefixes from linkages
1457 m = re.match(linkage_remove_prefixes_re, item1)
1458 if m:
1459 prefix = item1[: m.end()]
1460 item1 = item1[m.end() :]
1461 if prefix in linkage_remove_prefixes_tags:
1462 if qualifier:
1463 qualifier += ", " + linkage_remove_prefixes_tags[prefix]
1464 else:
1465 qualifier = linkage_remove_prefixes_tags[prefix]
1466 # Recheck ignored linkages
1467 if re.search(linkage_ignore_re, item1):
1468 continue
1470 # Remove certain suffixes from linkages
1471 m = re.search(linkage_remove_suffixes_re, item1)
1472 if m:
1473 item1 = item1[: m.start()]
1475 # Parse linkages with "value = english" syntax (e.g.,
1476 # väittää/Finnish)
1477 idx = item1.find(" = ")
1478 if idx >= 0:
1479 eng = item1[idx + 3 :]
1480 if classify_desc(eng, no_unknown_starts=True) == "english":
1481 english = eng
1482 item1 = item1[:idx]
1483 else:
1484 # Some places seem to use it reversed
1485 # "english = value"
1486 eng = item1[:idx]
1487 if classify_desc(eng, no_unknown_starts=True) == "english":
1488 english = eng
1489 item1 = item1[idx + 3 :]
1491 # Parse linkages with "value - english" syntax (e.g.,
1492 # man/Faroese)
1493 m = re.search(r" [-‐‑‒–—―] ", item1)
1494 if m and "(" not in item1:
1495 suffix = item1[m.end() :]
1496 cls = classify_desc(suffix, no_unknown_starts=True)
1497 if cls == "english":
1498 # This case intentionally ignores old values from english
1499 # (otherwise taxonomic lists fail)
1500 english = suffix
1501 item1 = item1[: m.start()]
1502 elif cls == "tags":
1503 if qualifier: 1503 ↛ 1504line 1503 didn't jump to line 1504 because the condition on line 1503 was never true
1504 qualifier += ", " + suffix
1505 else:
1506 qualifier = suffix
1507 item1 = item1[: m.start()]
1509 # Parse certain tags at the end of the linked term (unless
1510 # we are in a letters list)
1511 item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
1512 if q:
1513 if qualifier: 1513 ↛ 1514line 1513 didn't jump to line 1514 because the condition on line 1513 was never true
1514 qualifier += ", " + ", ".join(q)
1515 else:
1516 qualifier = ", ".join(q)
1518 m = re.search(linkage_truncate_re, item1)
1519 if m: 1519 ↛ 1521line 1519 didn't jump to line 1521 because the condition on line 1519 was never true
1520 # suffix = item1[m.start():] # Currently ignored
1521 item1 = item1[: m.start()]
1522 if not item1:
1523 continue # Ignore empty link targets
1524 if item1 == word:
1525 continue # Ignore self-links
1527 def add(w: str, r: Optional[str]) -> None:
1528 assert isinstance(w, str)
1529 assert r is None or isinstance(r, str)
1530 nonlocal alt
1531 nonlocal taxonomic
1533 # We remove "*" from the beginning of reconstruction linkages.
1534 # Such linkages should only occur in reconstruction senses, so
1535 # this should not cause ambiguity.
1536 if is_reconstruction and w.startswith("*"):
1537 w = w[1:]
1539 # Check if the word contains the Fullwith Solidus, and if
1540 # so, split by it and treat the the results as alternative
1541 # linkages. (This is very commonly used for alternative
1542 # written forms in Chinese compounds and other linkages.)
1543 # However, if the word contains a comma, then we wont't
1544 # split as this is used when we have a different number
1545 # of romanizations than written forms, and don't know
1546 # which is which.
1547 if (
1548 (not w or "," not in w)
1549 and (not r or "," not in r)
1550 and not wxr.wtp.page_exists(w)
1551 ):
1552 lst = w.split("/") if len(w) > 1 else [w]
1553 if len(lst) == 1:
1554 lst = w.split(" / ")
1555 if len(lst) == 1 and len(lst[0]) >= 6:
1556 lst = w.split("/")
1557 if len(lst) > 1:
1558 # Treat each alternative as separate linkage
1559 for w in lst:
1560 add(w, r)
1561 return None
1563 # Heuristically remove "." at the end of most linkages
1564 # (some linkage lists end in a period, but we also have
1565 # abbreviations that end with a period that should be kept)
1566 if (
1567 w.endswith(".")
1568 and not wxr.wtp.page_exists(w)
1569 and (
1570 wxr.wtp.page_exists(w[:-1])
1571 or (len(w) >= 5)
1572 and "." not in w[:-1]
1573 )
1574 ):
1575 w = w[:-1]
1577 # If we have roman but not alt and the word is ASCII,
1578 # move roman to alt.
1579 if r and not alt and w.isascii():
1580 alt = r
1581 r = None
1582 # Add the linkage
1583 dt: LinkageData = {}
1584 if qualifier:
1585 parse_sense_qualifier(wxr, qualifier, dt)
1586 if sense:
1587 dt["sense"] = sense.strip()
1588 if r:
1589 dt["roman"] = r.strip()
1590 if ruby:
1591 dt["ruby"] = ruby
1592 if english:
1593 dt["english"] = english.strip() # DEPRECATED for "translation"
1594 dt["translation"] = english.strip()
1595 if taxonomic:
1596 if re.match(r"×[A-Z]", taxonomic):
1597 data_append(dt, "tags", "extinct")
1598 taxonomic = taxonomic[1:]
1599 dt["taxonomic"] = taxonomic
1600 if re.match(r"×[A-Z]", w):
1601 data_append(dt, "tags", "extinct")
1602 w = w[1:] # Remove × before dead species names
1603 if alt and re.match(r"×[A-Z]", alt):
1604 data_append(dt, "tags", "extinct")
1605 alt = alt[1:] # Remove × before dead species names
1606 if alt and alt.strip() != w:
1607 dt["alt"] = alt.strip()
1608 if urls:
1609 dt["urls"] = [
1610 url.strip() for url in urls if url and isinstance(url, str)
1611 ]
1612 dt["word"] = w
1613 for old in data.get(field, ()): # type: ignore[attr-defined]
1614 if dt == old:
1615 break
1616 else:
1617 data_append(data, field, dt)
1619 # Handle exceptional linkage splits and other linkage
1620 # conversions (including expanding to variant forms)
1621 if item1 in linkage_split_exceptions: 1621 ↛ 1622line 1621 didn't jump to line 1622 because the condition on line 1621 was never true
1622 for item2 in linkage_split_exceptions[item1]:
1623 add(item2, roman)
1624 continue
1626 # Various templates for letters in scripts use spaces as
1627 # separators and also have multiple characters without
1628 # spaces consecutively.
1629 v = sense or qualifier
1630 # print("lang={} v={} script_chars={} item1={!r}"
1631 # .format(wxr.wtp.section, v, script_chars, item1))
1632 if v and script_chars:
1633 if (
1634 len(item1.split()) > 1
1635 or len(list(re.finditer(unicode_dc_re, item1))) == 2
1636 or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
1637 ):
1638 if v == qualifier:
1639 # if sense:
1640 # sense += "; " + qualifier
1641 # else:
1642 # sense = qualifier
1643 qualifier = None
1644 if re.search(r" (letters|digits|script)$", v):
1645 qualifier = v # Also parse as qualifier
1646 elif re.search( 1646 ↛ 1653line 1646 didn't jump to line 1653 because the condition on line 1646 was always true
1647 r"Variations of letter |"
1648 r"Letters using |"
1649 r"Letters of the ",
1650 v,
1651 ):
1652 qualifier = "letter"
1653 parts = item1.split(". ")
1654 extra: Sequence[str] = ()
1655 if len(parts) > 1: 1655 ↛ 1656line 1655 didn't jump to line 1656 because the condition on line 1655 was never true
1656 extra = parts[1:]
1657 item1 = parts[0]
1658 # Handle multi-character names for chars in language's
1659 # alphabet, e.g., "Ny ny" in P/Hungarian.
1660 if (
1661 len(subitems) > 20
1662 and len(item1.split()) == 2
1663 and all(len(x) <= 3 for x in item1.split())
1664 ):
1665 parts = list(
1666 m.group(0)
1667 for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
1668 if not m.group(0).isspace()
1669 and m.group(0) not in ("(", ")")
1670 )
1671 else:
1672 parts = list(
1673 m.group(0)
1674 for m in re.finditer(r".[\u0300-\u036f]?", item1)
1675 if not m.group(0).isspace()
1676 and m.group(0) not in ("(", ")")
1677 )
1678 for e in extra: 1678 ↛ 1679line 1678 didn't jump to line 1679 because the loop on line 1678 never started
1679 idx = e.find(":")
1680 if idx >= 0:
1681 e = e[idx + 1 :].strip()
1682 if e.endswith("."):
1683 e = e[:-1]
1684 parts.extend(e.split())
1686 # XXX this is not correct - see P/Vietnamese
1687 # While some sequences have multiple consecutive
1688 # characters, others use pairs and some have
1689 # 2/3 character names, e.g., "Ng ng".
1691 rparts: Optional[list[Optional[str]]] = None
1692 if roman: 1692 ↛ 1693line 1692 didn't jump to line 1693 because the condition on line 1692 was never true
1693 rparts = list(
1694 m.group(0)
1695 for m in re.finditer(r".[\u0300-\u036f]", roman)
1696 if not m.group(0).isspace()
1697 )
1698 if len(rparts) != len(parts):
1699 rparts = None
1700 if not rparts: 1700 ↛ 1703line 1700 didn't jump to line 1703 because the condition on line 1700 was always true
1701 rparts = [None] * len(parts)
1703 for w, r in zip(parts, rparts):
1704 add(w, r)
1705 continue
1707 add(item1, roman)
1708 return None