# Code related to parsing linkages (synonyms, hypernyms, related terms, etc.)
#
# Copyright (c) 2019-2021 Tatu Ylonen.  See file LICENSE and https://ylonen.org

import re
import unicodedata
from typing import Optional, Sequence

from wikitextprocessor import (
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)
from wikitextprocessor.core import TemplateArgs
from wikitextprocessor.parser import (
    HTMLNode,
    is_list,
    is_list_item,
)

from ...datautils import (
    data_append,
    data_extend,
    ns_title_prefix_tuple,
    split_at_comma_semi,
)
from ...page import clean_node, is_panel_template
from ...tags import linkage_beginning_tags, valid_tags
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby, parse_ruby  # noqa: F401
from .form_descriptions import (
    classify_desc,
    decode_tags,
    head_final_bantu_langs,
    head_final_bantu_re,
    head_final_numeric_langs,
    head_final_other_langs,
    head_final_other_re,
    head_final_re,
    parse_head_final_tags,
    parse_sense_qualifier,
)
from .section_titles import TRANSLATIONS_TITLE
from .type_utils import FormData, LinkageData, SenseData, WordData

# Linkage will be ignored if it matches this regexp before splitting
linkage_pre_split_ignore_re = re.compile(
    r"^("
    + "|".join(
        re.escape(x)
        for x in [
            "For more variations, see ",
            "Signal flag:",
            "Semaphore:",
        ]
    )
    + r")"
)

# Linkage will be ignored if it has one of these prefixes
linkage_ignore_prefixes = [
    "Historical and regional synonyms of ",
    "edit data",
    "or these other third-person pronouns",
    "introduced in Unicode ",
    "Entries in the ",
    "Wikipedia article ",
    "Wiktionary's coverage of ",
    "Ethnologue entry for ",
    "Any of Thesaurus:",
    "See contents of Category:",
    "See also Thesaurus:",
    "See also Appendix:",
    "As SMS messaging ",
    "For the reversed question mark used in some right-to-left-scripts",
    "such as ",
    "Appendix:",
    "Category:",
    ":Category:",
]

# Linkage will be ignored if it has any of these suffixes
linkage_ignore_suffixes = [
    " Wikipedia",
    " Wikipedia.",
    " edition of Wiktionary",
]

# Linkage will be ignored if it is one of these (with full match)
linkage_ignore_whole = [
    "etc.",
    "other derived terms:",
    "Formal terms",
    "informal and slang terms",
]

# Linkage will be ignored if it matches this regexp
linkage_ignore_re = re.compile(
    r"^("
    + "|".join(re.escape(x) for x in linkage_ignore_whole)
    + r")$|^("
    + "|".join(re.escape(x) for x in linkage_ignore_prefixes)
    + r")|("
    + "|".join(re.escape(x) for x in linkage_ignore_suffixes)
    + r")$"
)
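
# Illustrative examples (mine, not from the original data) of how the
# combined regexp behaves: whole-list entries must match the entire
# string, prefix-list entries anchor at the start, and suffix-list
# entries anchor at the end:
#   linkage_ignore_re.search("etc.")               # whole-string match
#   linkage_ignore_re.search("Appendix:Glossary")  # prefix match
#   linkage_ignore_re.search("cat Wikipedia")      # suffix match
#   linkage_ignore_re.search("informal")           # None - kept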

# These prefixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
linkage_remove_prefixes_re = re.compile(
    r"^("
    + r"|".join(
        re.escape(x)
        for x in [
            ":",
            "see Thesaurus:",
            "See Thesaurus:",
            "see also Thesaurus:",
            "See also Thesaurus:",
            "see also ",
            "See also ",
            "see ",
            "See ",
            "from ",
            "abbreviation of ",
            "ISO 639-1 code ",
            "ISO 639-3 code ",
            "Thesaurus:",
        ]
    )
    + ")"
)

# When removing a prefix from a linkage, this dictionary can be used to map
# the removed prefix to a space-separated list of tags to add
linkage_remove_prefixes_tags = {
    "abbreviation of ": "abbreviation",
}
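
# A sketch (example mine) of how the two structures above interact in
# parse_linkage_item_text() below: a matched prefix is stripped from the
# item, and if the prefix appears in linkage_remove_prefixes_tags, the
# mapped tags are added to the linkage's qualifier, e.g.
# "abbreviation of foo" -> word "foo" tagged "abbreviation".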

# These suffixes will be removed from linkages, leaving the rest.  This is
# considered separately for each linkage in a list.
linkage_remove_suffixes_re = re.compile(
    r"(\s+on (Wikispecies|Wikimedia Commons|"
    r"[A-Z]\w+ Wiktionary|[A-Z]\w+ Wikipedia)\.?|"
    r"\s*[-–] Pre-reform orthography.*)"
    r"$"
)

# Ignore linkage parenthesized sections that contain one of these strings
linkage_paren_ignore_contains_re = re.compile(
    r"\b("
    + "|".join(
        re.escape(x)
        for x in [
            "from Etymology",
            "used as",
            "usage notes",
        ]
    )
    + ")([, ]|$)"
)

taxonomic_ending_map = {
    "superkingdoms": "superkingdom",
    "kingdoms": "kingdom",
    "subkingdoms": "subkingdom",
    "infrakingdoms": "infrakingdom",
    "phylums": "phylum",
    "subphylums": "subphylum",
    "infraphylums": "infraphylum",
    "superclasses": "superclass",
    "classes": "class",
    "orders": "order",
    "suborders": "suborder",
    "families": "family",
    "subfamilies": "subfamily",
    "genera": "genus",
}
for k, v in list(taxonomic_ending_map.items()):
    taxonomic_ending_map[v] = v  # Also add singular -> singular
taxonomic_ending_re = re.compile(
    r"\s+[-‐‑‒–—]\s+({})$".format(
        "|".join(re.escape(x) for x in taxonomic_ending_map)
    )
)
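
# Worked example (input is mine): for an item such as
# "Talpa, Mogera – genera", taxonomic_ending_re matches " – genera",
# taxonomic_ending_map["genera"] == "genus" becomes the shared English
# gloss, and the remaining "Talpa, Mogera" is then split into separate
# linkages (see parse_linkage_item_text() below).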

# Exceptional splits for linkages.  This can be used to fix particular
# linkages that are not handled correctly by the default code.  This can
# also be used to create automatic aliases, e.g., for mapping "..." and
# "…" to both.
linkage_split_exceptions = {
    "∛ ∜": ["∛", "∜"],
    "...": ["...", "…"],
    "…": ["...", "…"],
}
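
# Example (mine): an item that is exactly "…" is expanded to linkages for
# both "..." and "…", so either spelling of the ellipsis can be looked up.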

# Truncate linkage word if it matches any of these strings
linkage_truncate_re = re.compile(
    "|".join(
        re.escape(x)
        for x in [
            " and its derived terms",
            " UTF-16 0x214C",
        ]
    )
)

# Regexp for identifying special linkages containing lists of letters,
# digits, or characters
script_chars_re = re.compile(
    r"(script letters| script| letters|"
    r"Dialectological|Punctuation|Symbols|"
    r"Guillemets|Single guillemets|"
    r" tetragrams|"
    r" digits)(;|$)|"
    r"(^|; )(Letters using |Letters of the |"
    r"Variations of letter )|"
    r"^(Hiragana|Katakana)$"
)

# Matches a Unicode character together with any combining diacritics
# (even if encoded as separate characters)
unicode_dc_re = re.compile(
    r"\w[{}]|.".format(
        "".join(
            chr(x)
            for x in range(0, 0x110000)
            if unicodedata.category(chr(x)) == "Mn"
        )
    )
)
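
# Example (mine): the regexp keeps a base character together with a
# *separately encoded* combining mark, so the decomposed string
# "e\u0301x" ("éx" as three code points) yields two matches:
#   [m.group(0) for m in re.finditer(unicode_dc_re, "e\u0301x")]
#   # -> ["e\u0301", "x"]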


def extract_alt_form_section(
    wxr: WiktextractContext, word_entry: WordData, level_node: LevelNode
) -> None:
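    """Extract "alternative" forms from an alternative-forms section: each
    list item is scanned for l/link/L/alt/alter templates and plain
    wikilinks, and the results are appended to ``word_entry["forms"]``."""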
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            for node in list_item.children:
                if isinstance(node, TemplateNode) and node.template_name in [
                    "l",
                    "link",
                    "L",
                    "alt",
                    "alter",
                ]:
                    extract_l_template(wxr, word_entry, node)
                elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
                    word = clean_node(wxr, None, node)
                    if word != "":
                        form: FormData = {"form": word, "tags": ["alternative"]}
                        data_append(word_entry, "forms", form)


def extract_l_template(
    wxr: WiktextractContext, word_entry: WordData, t_node: TemplateNode
) -> None:
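    """Expand an l/link/alt-style template and collect the spans in the
    entry's language as "alternative" forms, attaching any following
    romanization ("*-Latn" span) and label-content tags to those forms."""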
    forms: list[FormData] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for span_tag in expanded_node.find_html("span"):
        span_lang = span_tag.attrs.get("lang", "")
        span_class = span_tag.attrs.get("class", "")
        if span_lang == lang_code:
            word = clean_node(wxr, None, span_tag)
            if word != "":
                form: FormData = {"form": word, "tags": ["alternative"]}
                forms.append(form)
        elif span_lang.endswith("-Latn") and len(forms) > 0:
            roman = clean_node(wxr, None, span_tag)
            if roman != "":
                forms[-1]["roman"] = roman
        elif "label-content" in span_class and len(forms) > 0:
            tag_text = clean_node(wxr, None, span_tag)
            if classify_desc(tag_text) == "tags":
                tagsets1, _ = decode_tags(tag_text)
                tags: list[str] = []
                for ts in tagsets1:
                    tags.extend(ts)
                for form in forms:
                    form["tags"].extend(tags)
    data_extend(word_entry, "forms", forms)


ZH_DIAL_TAGS = {
    "Classical Chinese": ["Classical-Chinese"],
    "Formal": ["formal"],
    "Written Standard Chinese": ["Written-vernacular-Chinese"],
    "Northeastern Mandarin": ["Northeastern-Mandarin"],
    "Jilu Mandarin": ["Jilu-Mandarin"],
    "Jiaoliao Mandarin": ["Jiaoliao-Mandarin"],
    "Central Plains Mandarin": ["Central-Plains-Mandarin"],
    "Lanyin Mandarin": ["Lanyin-Mandarin"],
    "Southwestern Mandarin": ["Southwestern-Mandarin"],
    "Jianghuai Mandarin": ["Jianghuai-Mandarin"],
    "Northern Min": ["Min-Bei"],
    "Eastern Min": ["Min-Dong"],
    "Southern Min": ["Min-Nan"],
    "Zhongshan Min": ["Zhongshan-Min"],
    "Southern Pinghua": ["Southern-Pinghua"],
    "Puxian Min": ["Puxian-Min"],
}


def extract_zh_dial_template(
    wxr: WiktextractContext,
    word_entry: WordData,
    t_node: TemplateNode,
    sense: str,
):
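    """Parse an expanded {{zh-dial}} dialect-synonym table: a "Note" row
    maps footnote symbols to tags, header cells give the lect, link cells
    give the region, and each "zh" span becomes a synonym entry in
    ``word_entry``."""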
    # https://en.wiktionary.org/wiki/Template:zh-dial
    from .pronunciation import split_zh_pron_raw_tag

    linkage_list: list[LinkageData] = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        is_note_row = False
        note_tags = {}
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    is_note_row = clean_node(wxr, None, cell_node) == "Note"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        lang_tags = []
        region_tags = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["Note"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        l_data: LinkageData = {"word": span_text}
                        if sense != "":
                            l_data["sense"] = sense
                        if len(lang_tags) > 0:
                            data_extend(l_data, "raw_tags", lang_tags)
                        if len(region_tags) > 0:
                            data_extend(l_data, "raw_tags", region_tags)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                data_append(
                                    linkage_list[-1], "raw_tags", raw_tag
                                )

    for l_data in linkage_list:
        raw_tags = []
        for raw_tag in l_data.get("raw_tags", []):
            if raw_tag in ZH_DIAL_TAGS:
                data_extend(l_data, "tags", ZH_DIAL_TAGS[raw_tag])
            elif raw_tag in valid_tags:
                data_append(l_data, "tags", raw_tag)
            else:
                raw_tags.append(raw_tag)
        if len(raw_tags) > 0:
            l_data["raw_tags"] = raw_tags
        elif "raw_tags" in l_data:
            del l_data["raw_tags"]
    data_extend(word_entry, "synonyms", linkage_list)


def parse_linkage(
    wxr: WiktextractContext,
    data: WordData,
    field: str,
    linkagenode: LevelNode,
    word: str,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
) -> None:
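    """Parse one linkage section (e.g., Synonyms or Hyponyms) from
    ``linkagenode`` and store the results in ``data`` under ``field``.
    Panel templates and auto-filled "table:"/"list:" templates are
    dropped, {{zh-dial}} gets special handling, and if the lists yielded
    no linkages, loose comma-separated text is parsed as a fallback."""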
    assert isinstance(data, dict)
    assert isinstance(field, str)
    assert isinstance(linkagenode, LevelNode)
    # print("field", field)
    # print("data", data)
    # print("children:")
    if not wxr.config.capture_linkages:
        return
    have_panel_template = False

    def linkage_template_fn1(name: str, ht: TemplateArgs) -> Optional[str]:
        nonlocal have_panel_template
        if is_panel_template(wxr, name):
            have_panel_template = True
            return ""
        # Ignore auto-filled templates like Template:table:Solar System/en
        if name.startswith(("table:", "list:")):
            return ""
        return None

    # Main body of parse_linkage()
    l_nodes: list[str | WikiNode] = []
    l_sense = ""
    for node in linkagenode.children:
        if isinstance(node, TemplateNode) and node.template_name == "zh-dial":
            extract_zh_dial_template(wxr, data, node, l_sense)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                for t_node in list_item.find_child(NodeKind.TEMPLATE):
                    if t_node.template_name in ["s", "sense"]:
                        l_sense = clean_node(wxr, None, t_node).strip("(): ")
            l_nodes.append(node)
        else:
            l_nodes.append(node)
    text = wxr.wtp.node_to_wikitext(l_nodes)
    parsed = wxr.wtp.parse(
        text, expand_all=True, template_fn=linkage_template_fn1
    )

    text_outside_list_items = parse_linkage_recurse(
        wxr,
        parsed.children,
        field,
        None,
        None,
        word,
        data,
        sense_datas,
        is_reconstruction,
    )

    if not data.get(field) and not have_panel_template:
        text = "".join(text_outside_list_items).strip()
        if "\n" not in text and "," in text and text.count(",") > 3:
            if not text.startswith("See "):
                parse_linkage_item(
                    wxr,
                    [text],
                    field,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                    None,
                )


def parse_linkage_recurse(
    wxr: WiktextractContext,
    contents: list[WikiNode | str],
    field: str,
    sense: str | None,
    block_header_sense: str | None,
    word: str,
    data: WordData,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
) -> list[str]:
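    """Recursively walk ``contents`` looking for list items and table
    cells that contain linkage items.  ``sense`` (and, for NavFrame/table
    blocks, ``block_header_sense``) is propagated downward; stray
    top-level text is collected and returned so that the caller can fall
    back to parsing it when no structured linkages were found."""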
    assert isinstance(contents, (list, tuple))
    assert sense is None or isinstance(sense, str)
    # print("PARSE_LINKAGE_RECURSE: {}: {}".format(sense, contents))

    # Return values
    text_outside_list_items: list[str] = []

    for node in contents:
        if isinstance(node, str):
            # Ignore top-level text, generally comments before the
            # linkages list.  However, if no linkages are found, then
            # use this for linkages (not all words use bullet points
            # for linkages).
            text_outside_list_items.append(node)
            continue
        assert isinstance(node, WikiNode)
        kind = node.kind
        # print("PARSE_LINKAGE_RECURSE CHILD", kind)
        if is_list(node) or kind in (NodeKind.TABLE, NodeKind.TABLE_ROW):
            toli = parse_linkage_recurse(
                wxr,
                node.children,
                field,
                sense,
                block_header_sense,
                word,
                data,
                sense_datas,
                is_reconstruction,
            )
            text_outside_list_items.extend(toli)
        elif is_list_item(node) or kind == NodeKind.TABLE_CELL:
            v = parse_linkage_item(
                wxr,
                node.children,
                field,
                word,
                data,
                sense_datas,
                is_reconstruction,
                sense,
            )
            if v is not None:
                # parse_linkage_item() can return a value that should
                # be used as the sense for the follow-on linkages,
                # which are typically provided in a table (see 滿)
                block_header_sense = "".join(v)
        elif kind in (
            NodeKind.TABLE_CAPTION,
            NodeKind.TABLE_HEADER_CELL,
            NodeKind.PREFORMATTED,
            NodeKind.BOLD,
        ):
            # Let's still ignore table extra stuff
            continue
        elif isinstance(node, HTMLNode):
            # Recurse to process inside the HTML for most tags
            if node.sarg in ("gallery", "ref", "cite", "caption"):
                continue
            classes = (node.attrs.get("class") or "").replace("+", " ").split()
            if "qualifier-content" in classes:
                sense1 = clean_node(wxr, None, node.children)
                if sense1.endswith(":"):
                    sense1 = sense1[:-1].strip()
                if sense and sense1:
                    wxr.wtp.debug(
                        "linkage qualifier-content on multiple "
                        "levels: {!r} and {!r}".format(sense, sense1),
                        sortid="page/2170",
                    )
                toli = parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    sense1,
                    block_header_sense,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
                text_outside_list_items.extend(toli)
            elif "list-switcher-header" in classes:
                block_header_sense = clean_node(wxr, None, node.children)
                if block_header_sense.endswith(":"):
                    block_header_sense = block_header_sense[:-1].strip()
            elif any(x in classes for x in ("NavFrame", "term-list")):
                # NavFrame uses the previously assigned block_header_sense
                # (from a "(sense):" item) and clears it afterwards
                # print(f"{sense=}, {block_header_sense=}")
                toli = parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    sense or block_header_sense,
                    block_header_sense,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
                text_outside_list_items.extend(toli)
                block_header_sense = None
            else:
                toli = parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    sense,
                    block_header_sense,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
                text_outside_list_items.extend(toli)
        elif isinstance(node, LevelNode):
            # Just recurse to any possible subsections
            toli = parse_linkage_recurse(
                wxr,
                node.children,
                field,
                sense,
                block_header_sense,
                word,
                data,
                sense_datas,
                is_reconstruction,
            )
            text_outside_list_items.extend(toli)
        elif kind in (NodeKind.BOLD, NodeKind.ITALIC):
            # Skip these on top level; at least sometimes bold is
            # used for indicating a subtitle
            continue
        elif kind == NodeKind.LINK:
            # Recurse into the last argument.
            # Apparently ":/" is used as a link to "/", so strip the
            # initial colon.
            toli = parse_linkage_recurse(
                wxr,
                node.largs[-1],
                field,
                sense,
                block_header_sense,
                word,
                data,
                sense_datas,
                is_reconstruction,
            )
            text_outside_list_items.extend(toli)
        else:
            wxr.wtp.debug(
                "parse_linkage_recurse unhandled {}: {}".format(kind, node),
                sortid="page/2196",
            )

    return text_outside_list_items


def parse_linkage_item(
    wxr: WiktextractContext,
    contents: list[str | WikiNode],
    field: str,
    word: str,
    data: WordData,
    sense_datas: list[SenseData],
    is_reconstruction: bool,
    sense: str | None = None,
) -> list[str]:
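    """Parse a single list item or table cell expected to contain one or
    more linkages.  Returns a one-element list with a sense string when
    the item turned out to be a stand-alone sense header for follow-on
    items (see 滿), and an empty list otherwise."""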
    assert isinstance(contents, (list, tuple))
    assert isinstance(field, str)
    assert sense is None or isinstance(sense, str)

    # print("PARSE_LINKAGE_ITEM: {} ({}): {}"
    #       .format(field, sense, contents))

    parts: list[str] = []
    ruby: list[tuple[str, str]] = []
    urls: list[str] = []
    # Data about link text; this is used to skip splitting on
    # linkage text items that contain stuff like commas; for
    # example "Hunde, die bellen, beißen nicht" in the article
    # beißen would otherwise be split into "Hunde", "die bellen", etc.
    # We take that link text and pass it, eventually, to
    # split_at_comma_semi so that it skips splitting on those
    # commas.
    links_that_should_not_be_split: list[str] = []

    def item_recurse(
        contents: list[str | WikiNode], possible_sense: str | None = None
    ) -> bool:
        assert isinstance(contents, (list, tuple))
        nonlocal sense
        nonlocal ruby
        nonlocal parts
        is_sense = False
        # print("ITEM_RECURSE:", contents)
        for node in contents:
            if isinstance(node, str):
                parts.append(node)
                continue
            kind = node.kind
            # print(
            #     "ITEM_RECURSE KIND:",
            #     kind,
            #     node.sarg if node.sarg else node.largs,
            # )

            # Turn the accumulated parts into a possible sense header
            if (
                is_list_item(node)
                or is_list(node)
                or kind
                in (
                    NodeKind.TABLE,
                    NodeKind.TABLE_ROW,
                    NodeKind.TABLE_CELL,
                )
                and parts
            ):
                # print(f"{parts=}")
                candidate_sense: str | None
                candidate_sense = clean_node(wxr, None, parts)
                is_sense = False

                if candidate_sense.endswith(":"):
                    is_sense = True
                    candidate_sense = candidate_sense[:-1].strip()
                if candidate_sense.startswith("(") and candidate_sense.endswith(
                    ")"
                ):
                    is_sense = True
                    candidate_sense = candidate_sense[1:-1].strip()
                if (
                    candidate_sense.lower() == TRANSLATIONS_TITLE
                    or not is_sense
                ):
                    candidate_sense = None
                # print(f"{possible_sense=}, {is_sense=}")
                if is_sense:
                    possible_sense = candidate_sense
                    parts = []
            else:
                candidate_sense = None

            # Handle nodes
            if is_list_item(node):
                parse_linkage_item(
                    wxr,
                    node.children,
                    field,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                    possible_sense or sense,
                )
            elif is_list(node) or kind in (
                NodeKind.TABLE,
                NodeKind.TABLE_ROW,
                NodeKind.TABLE_CELL,
            ):
                parse_linkage_recurse(
                    wxr,
                    node.children,
                    field,
                    possible_sense or sense,
                    None,
                    word,
                    data,
                    sense_datas,
                    is_reconstruction,
                )
            elif kind in (
                NodeKind.TABLE_HEADER_CELL,
                NodeKind.TABLE_CAPTION,
            ):
                continue
            elif kind == NodeKind.HTML:
                classes = (node.attrs.get("class") or "").split()
                if node.sarg in ("gallery", "ref", "cite", "caption"):
                    continue
                elif node.sarg == "ruby":
                    rb = parse_ruby(wxr, node)
                    if rb:
                        ruby.append(rb)
                        parts.append(rb[0])
                    continue
                elif node.sarg == "math":
                    parts.append(clean_node(wxr, None, node))
                    continue
                elif "interProject" in classes:
                    continue  # These do not seem to be displayed
                if "NavFrame" in classes:
                    parse_linkage_recurse(
                        wxr,
                        node.children,
                        field,
                        possible_sense or sense,
                        None,
                        word,
                        data,
                        sense_datas,
                        is_reconstruction,
                    )
                else:
                    item_recurse(node.children, possible_sense)
            elif kind == NodeKind.LINK:
                ignore = False
                if isinstance(node.largs[0][0], str):
                    v1 = node.largs[0][0].strip().lower()
                    if v1.startswith(
                        ns_title_prefix_tuple(wxr, "Category", True)
                        + ns_title_prefix_tuple(wxr, "File", True)
                    ):
                        ignore = True
                    if not ignore:
                        v = node.largs[-1]
                        if (
                            len(node.largs) == 1
                            and len(v) > 0
                            and isinstance(v[0], str)
                            and v[0][0] == ":"
                        ):
                            v = [v[0][1:]] + list(v[1:])  # type:ignore
                        if isinstance(v[0], str) and not v[0].isalnum():
                            links_that_should_not_be_split.append("".join(v[0]))  # type: ignore
                        item_recurse(v, possible_sense)
            elif kind == NodeKind.URL:
                if len(node.largs) < 2 and node.largs:
                    # Naked url captured
                    urls.extend(node.largs[-1])  # type:ignore[arg-type]
                    continue
                if len(node.largs) == 2:
                    # Url from a link with text
                    urls.append(node.largs[0][-1])  # type:ignore[arg-type]
                # print(f"{node.largs=!r}")
                # print("linkage recurse URL {}".format(node))
                item_recurse(node.largs[-1], possible_sense)
            elif kind in (
                NodeKind.PREFORMATTED,
                NodeKind.BOLD,
                NodeKind.ITALIC,
            ):
                item_recurse(node.children)
            else:
                wxr.wtp.debug(
                    "linkage item_recurse unhandled {}: {}".format(
                        node.kind, node
                    ),
                    sortid="page/2073",
                )

        return is_sense

    # print("LINKAGE CONTENTS BEFORE ITEM_RECURSE: {!r}"
    #       .format(contents))

    is_sense = item_recurse(contents)

    if not is_sense:
        item = clean_node(wxr, None, parts)
        # print("LINKAGE ITEM CONTENTS:", parts)
        # print("CLEANED ITEM: {!r}".format(item))
        # print(f"URLS {urls=!r}")

        if v := parse_linkage_item_text(
            wxr,
            word,
            data,
            field,
            item,
            sense,
            ruby,
            sense_datas,
            is_reconstruction,
            urls or None,
            links_that_should_not_be_split or None,
        ):
            return [v]

    return []


def parse_linkage_item_text(
    wxr: WiktextractContext,
    word: str,
    data: WordData,
    field: str,
    item: str,
    sense: Optional[str],
    ruby: list,
    pos_datas: list,
    is_reconstruction: bool,
    urls: Optional[list[str]] = None,
    links: Optional[list[str]] = None,
) -> Optional[str]:
    """Parses a linkage item once it has been converted to a string.  This
    may add one or more linkages to ``data`` under ``field``.  This
    returns None or a string that contains a sense that should be applied
    to additional linkages (commonly used in tables for Asian characters)."""
    assert isinstance(wxr, WiktextractContext)
    assert isinstance(word, str)  # Main word (derived from page title)
    assert isinstance(data, dict)  # Parsed linkages are stored here under field
    assert isinstance(field, str)  # The field under which to store linkage
    assert isinstance(item, str)  # The string to parse
    assert sense is None or isinstance(sense, str)
    assert isinstance(ruby, list)  # Captured ruby (hiragana/katakana) or ""
    assert isinstance(pos_datas, list)  # List of senses (containing "glosses")
    assert urls is None or isinstance(urls, list)  # Captured urls
    assert is_reconstruction in (True, False)

    item = item.replace("()", "")
    item = re.sub(r"\s+", " ", item)
    item = item.strip()

    base_roman = None
    base_alt = None
    base_english = None
    script_chars = False
    base_qualifier = None
    lang = wxr.wtp.section

    # If ``sense`` can be parsed as tags, treat it as tags instead
    if sense:
        cls = classify_desc(sense, no_unknown_starts=True)
        if cls == "tags":
            base_qualifier = sense
            sense = None

    # Check if this item is a stand-alone sense (or tag) specifier
    # for following items (e.g., commonly in a table, see 滿)
    m = re.match(r"\(([-a-zA-Z0-9 ]+)\):$", item)
    if m:
        return m.group(1)
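
    # Example (mine): an item consisting solely of "(to arrive):" in a
    # table returns "to arrive", which the caller then applies as the
    # sense of the following items (see 滿).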

    # Check for pre-split ignored linkages using the appropriate regexp
    if re.search(linkage_pre_split_ignore_re, item):
        return None

    # print("  LINKAGE ITEM: {}: {} (sense {})"
    #       .format(field, item, sense))

    # Replace occurrences of ~ in the item by the page title
    safetitle = wxr.wtp.title.replace("\\", "\\\\")  # type: ignore[union-attr]
    item = item.replace(" ~ ", " " + safetitle + " ")
    item = re.sub(r"^~ ", safetitle + " ", item)
    item = re.sub(r" ~$", " " + safetitle, item)

    # Many taxonomic terms contain hyponym lists that end with the
    # kind of the hyponym (a taxonomic level in plural).  Recognize
    # such lists and add the term in singular to all linkages in the list.
    m = re.search(taxonomic_ending_re, item)
    if m:
        base_english = taxonomic_ending_map[m.group(1)]
        item = item[: m.start()]

    # Some Korean and Japanese words use the "word (romanized): english"
    # pattern.  Sometimes the parenthesized part contains comma-separated
    # alt and roman.
    m = re.match(r"(.+?) \(([^():]+)\): ([-a-zA-Z0-9,. ]+)$", item)
    if m:
        rom = m.group(2)
        eng = m.group(3)
        rest = m.group(1)
        if (
            classify_desc(rest, no_unknown_starts=True) == "other"
            and classify_desc(eng, no_unknown_starts=True) == "english"
        ):
            item = rest
            base_roman = rom
            lst = base_roman.split(", ")
            if (
                len(lst) == 2
                and classify_desc(lst[0], no_unknown_starts=True) == "other"
            ):
                base_alt = lst[0]
                base_roman = lst[1]
            if base_english:
                base_english += "; " + eng
            else:
                base_english = eng

    # Many words have tags or similar descriptions at the beginning,
    # followed by a colon and one or more linkages (e.g.,
    # panetella/Finnish)
    m = re.match(r"^\((([^():]|\([^()]*\))+)\): ([^:]*)$", item) or re.match(
        r"^([a-zA-Z][-'a-zA-Z0-9 ]*" r"(\([^()]+\)[-'a-zA-Z0-9 ]*)*): ([^:]*)$",
        item,
    )
    if m:
        desc = m.group(1)
        rest = m.group(len(m.groups()))
        # Check for certain comma-separated tags combined
        # with English text at the beginning or end of a
        # comma-separated parenthesized list
        lst = split_at_comma_semi(desc, skipped=links)
        while len(lst) > 1:
            # Check for tags at the beginning
            cls = classify_desc(lst[0], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[0]
                else:
                    base_qualifier = lst[0]
                lst = lst[1:]
                continue
            # Check for tags at the end
            cls = classify_desc(lst[-1], no_unknown_starts=True)
            if cls == "tags":
                if base_qualifier:
                    base_qualifier += ", " + lst[-1]
                else:
                    base_qualifier = lst[-1]
                lst = lst[:-1]
                continue
            break
        desc = ", ".join(lst)

        # Sometimes we have e.g. "chemistry (slang)", which are
        # both tags (see "stink").  Handle that case by
        # removing parentheses if the value is still tags.  The part with
        # parentheses could be on either side of the colon.
        if "(" in desc:
            x = desc.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                desc = x
        elif "(" in rest:
            x = rest.replace("(", ",").replace(")", ",")
            if classify_desc(x, no_unknown_starts=True) == "tags":
                rest = desc
                desc = x

        # See if the prefix should trigger special handling for script
        # character, letter, digit, etc. handling
        if re.search(script_chars_re, desc):
            script_chars = True

        # Try to determine which side is the description and which is
        # the linked term (both orders are widely used in Wiktionary)
        cls = classify_desc(desc, no_unknown_starts=True)
        cls2 = classify_desc(rest, no_unknown_starts=True)
        # print("linkage prefix: desc={!r} cls={} rest={!r} cls2={}"
        #       .format(desc, cls, rest, cls2))

        e1 = wxr.wtp.page_exists(desc)
        e2 = wxr.wtp.page_exists(rest)
        if cls != "tags":
            if (
                cls2 == "tags"
                or (e1 and not e2)
                or (
                    e1
                    and e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
                or (
                    not e1
                    and not e2
                    and cls2 == "english"
                    and cls in ("other", "romanization")
                )
            ):
                desc, rest = rest, desc  # Looks like swapped syntax
                cls = cls2
            if re.search(linkage_paren_ignore_contains_re, desc):
                desc = ""
        # print("linkage colon prefix desc={!r} rest={!r} cls={}"
        #       .format(desc, rest, cls))

        # Handle the prefix according to its type
        if cls == "tags":
            if base_qualifier:
                base_qualifier += ", " + desc
            else:
                base_qualifier = desc
            item = rest
        elif desc in ("NATO phonetic", "Morse code", "Braille", "ASL Manual"):
            if base_english:
                base_english += "; " + desc
            else:
                base_english = desc
            item = rest
        elif cls in ("english", "taxonomic"):
            if sense:
                sense += "; " + desc
            else:
                sense = desc
            item = rest
        elif desc.isdigit():
            idx = int(desc) - 1
            if idx >= 0 and idx < len(pos_datas):
                d = pos_datas[idx]
                gl = "; ".join(d.get("glosses", ()))
                if not gl:
                    wxr.wtp.debug(
                        "parenthesized numeric linkage prefix, "
                        "but the referenced sense has no gloss: "
                        "{}".format(desc),
                        sortid="linkages/355",
                    )
                elif sense:
                    sense += "; " + gl
                else:
                    sense = gl
                item = rest
            else:
                wxr.wtp.debug(
                    "parenthesized numeric linkage prefix, "
                    "but there is no sense with such index: {}".format(desc),
                    sortid="linkages/365",
                )
                item = rest
        else:
            wxr.wtp.debug(
                "unrecognized linkage prefix: {} desc={} rest={} "
                "cls={} cls2={} e1={} e2={}".format(
                    item, desc, rest, cls, cls2, e1, e2
                ),
                sortid="linkages/371",
            )
            item = rest

    base_sense = sense

    # Check for certain plural tag forms at the end of the items list, and
    # apply them to all items if found
    m = re.search(
        r" [-‐‑‒–—―] (diminutives|Diminutives|letters|digits|"
        r"characters|symbols|tetragrams|letter names|names|"
        r"female names|male names|proper nouns|contractions|"
        r"nonstandard spellings|verbs|prepositions|postpositions|"
        r"interjections|Abbreviations|abbreviations|variants|"
        r"ordinals|nouns|phrases|adjectives|adverbs|"
        r"augmentatives|pejoratives|compound words|numerals|"
        r"Tally marks|surnames|modern nonstandard spellings)$",
        item,
    )
    if m:
        suffix = m.group(1)
        if base_qualifier:
            base_qualifier += ", " + suffix
        else:
            base_qualifier = suffix
        item = item[: m.start()]

    # Certain linkage items have space-separated values.  These are
    # generated by, e.g., certain templates.
    if base_sense and base_sense.endswith(" paper sizes"):
        base_qualifier = None
        item = ", ".join(item.split())
    # XXX isn't this now handled by the generic digits/letters/etc code?
    # elif base_qualifier in ("Arabic digits",):
    #     item = ", ".join(item.split())

    item = re.sub(r"\s*\^\(\s*\)|\s*\^\s+", "", item)  # Now empty superscript
    item = item.strip()
    if not item:
        return None

    # Kludge: if the item contains ")/" (with possibly spaces in between),
    # replace it by a comma so it gets split.
    item = re.sub(r"\)\s*/", "), ", item)

    # The item may contain multiple comma-separated linkages
    if base_roman:
        subitems = [item]
    else:
        # Split at commas.  Also, in most cases split by " or ", but this
        # is complicated - "or" may end certain words (e.g., "logical or")
        # and it may separate head-final tags (e.g., "foo f or m").  Also,
        # some words have parenthesized parts in between, e.g.,
        # wife/English/Translations/Yiddish:
        #   "ווײַב n (vayb) or f, פֿרוי f (froy)"
        subitems = []
        for item1 in split_at_comma_semi(item, skipped=links):
            if " or " not in item1:
                subitems.append(item1)
                continue
            # item1 contains " or "
            item2 = re.sub(r"\s*\([^)]*\)", "", item1)
            item2 = re.sub(r"\s+", " ", item2)
            if (
                (
                    lang not in head_final_bantu_langs
                    or not re.search(head_final_bantu_re, item2)
                )
                and (
                    lang not in head_final_other_langs
                    or not re.search(head_final_other_re, item2)
                )
                and (
                    not re.search(head_final_re, item2)
                    or (
                        item2[-1].isdigit()
                        and lang not in head_final_numeric_langs
                    )
                )
                and not re.search(r"\bor\b", wxr.wtp.title or "MISSING_TITLE")
                and all(
                    wxr.wtp.title not in x.split(" or ")
                    for x in split_at_comma_semi(item2, skipped=links)
                    if " or " in x
                )
            ):
                # We can split this item.  Split the non-cleaned version
                # that still has any intervening parenthesized parts.
                subitems.extend(
                    split_at_comma_semi(item1, extra=[" or "], skipped=links)
                )
            else:
                subitems.append(item1)
    if len(subitems) > 1:  # Would be merged from multiple subitems
        ruby = []  # XXX what is the purpose of this?
    for item1 in subitems:
        if len(subitems) > 1 and item1 in ("...", "…"):
            # Some lists have an ellipsis in the middle - don't generate
            # linkages for the ellipsis
            continue
        item1 = item1.strip()
        qualifier = base_qualifier
        sense = base_sense
        parts = []
        roman = base_roman  # Usually None
        alt = base_alt  # Usually None
        taxonomic = None
        english = base_english

        # Some words have derived terms with parenthesized quoted English
        # descriptions, which can sometimes essentially be tags.
        # Some words (bleki/Esperanto...) can have parentheses inside
        # the quotes, so let's make this regex even more unreadable.
        m = re.search(r"\s*\(“([^”]+)”\)", item1)
        if m:
            t = m.group(1)
            item1 = (item1[: m.start()] + item1[m.end() :]).strip()
            cls = classify_desc(t)
            if cls == "tags":
                if qualifier:
                    qualifier += ", " + t
                else:
                    qualifier = t
            else:
                english = t

        # Some Korean words use the "word (alt, roman, “english”)" pattern
        # (see 滿/Korean)
        m = re.match(
            r"([^(),;:]+) \(([^(),;:]+), ([^(),;:]+), "
            r'[“”"]([^”“"]+)[“”"]\)$',
            item1,
        )
        if (
            m
            and classify_desc(m.group(1), no_unknown_starts=True) == "other"
            and classify_desc(m.group(2), no_unknown_starts=True) == "other"
        ):
            alt = m.group(2)
            roman = m.group(3)
            english = m.group(4)
            item1 = m.group(1)

        words = item1.split(" ")
        if (
            len(words) > 1
            and words[0] in linkage_beginning_tags
            and words[0] != wxr.wtp.title
        ):
            t = linkage_beginning_tags[words[0]]
            item1 = " ".join(words[1:])
            if qualifier:
                qualifier += ", " + t
            else:
                qualifier = t

        # Extract quoted English translations (there are also other
        # kinds of English translations)
        def english_repl(m: re.Match) -> str:
            nonlocal english
            nonlocal qualifier
            v = m.group(1).strip()
            # If v is "tags: sense", handle the tags
            m1 = re.match(r"^([a-zA-Z ]+): (.*)$", v)
            if m1 is not None:
                desc, rest = m1.groups()
                if classify_desc(desc, no_unknown_starts=True) == "tags":
                    if qualifier:
                        qualifier += ", " + desc
                    else:
                        qualifier = desc
                    v = rest
            if english:
                english += "; " + v
            else:
                english = v
            return ""

        item1 = re.sub(r'[“"]([^“”"]+)[“”"],?\s*', english_repl, item1).strip()

        # There could be multiple parenthesized parts, and
        # sometimes both at the beginning and at the end.
        # And sometimes even in the middle, as in e.g.
        # wife/English/Translations/Yiddish
        while not script_chars and (
            not sense or not re.search(script_chars_re, sense)
        ):
            par = None
            nonfirst_par = False
            if par is None:
                # Try to find a parenthesized part at the beginning.
                m = re.match(r"\((([^()]|\([^()]*\))*)\):?\s*", item1)
                if m:
                    par = m.group(1)
                    item1 = item1[m.end() :]
                else:
                    # Try to find a parenthesized part at the end or in
                    # the middle.
                    m = re.search(
                        r"\s+\((\d|\d\d|[^\d]([^()]|\([^()]*\))*)\)" r"(\.$)?",
                        item1,
                    )
                    if m:
                        par = m.group(1)
                        item1 = item1[: m.start()] + item1[m.end() :]
                        nonfirst_par = True
            if not par:
                break
            if re.search(linkage_paren_ignore_contains_re, par):
                continue  # Skip these linkage descriptors
            par = par.strip()
            # Handle tags from the beginning of par.  We also handle
            # "other" here, as Korean entries often have the Hanja form
            # at the beginning of the parenthesis, before the
            # romanization.  Similar for many Japanese entries.
            while par:
                idx = par.find(",")
                if idx <= 0:
                    break
                cls = classify_desc(par[:idx], no_unknown_starts=True)
                if cls == "other" and not alt:
                    alt = par[:idx]
                elif cls == "taxonomic":
                    taxonomic = par[:idx]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + par[:idx]
                    else:
                        qualifier = par[:idx]
                else:
                    break
                par = par[idx + 1 :].strip()

            # Check for certain comma-separated tags combined
            # with English text at the beginning or end of a
            # comma-separated parenthesized list
            lst = par.split(",") if len(par) > 1 else [par]
            lst = list(x.strip() for x in lst if x.strip())
            while len(lst) > 1:
                cls = classify_desc(lst[0], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[0]
                    else:
                        qualifier = lst[0]
                    lst = lst[1:]
                    continue
                cls = classify_desc(lst[-1], no_unknown_starts=True)
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + lst[-1]
                    else:
                        qualifier = lst[-1]
                    lst = lst[:-1]
                    continue
                break
            par = ", ".join(lst)

            # Handle remaining types
            if not par:
                continue
            if re.search(script_chars_re, par):
                script_chars = True
                if classify_desc(par, no_unknown_starts=True) == "tags":
                    if base_qualifier:
                        base_qualifier += "; " + par
                    else:
                        base_qualifier = par
                    if qualifier:
                        qualifier += "; " + par
                    else:
                        qualifier = par
                else:
                    if base_sense:
                        base_sense += "; " + par
                    else:
                        base_sense = par
                    if sense:
                        sense += "; " + par
                    else:
                        sense = par
            elif par.endswith(" letter names"):
                if base_qualifier:
                    base_qualifier += "; " + par
                else:
                    base_qualifier = par
                if qualifier:
                    qualifier += "; " + par
                else:
                    qualifier = par
            else:
                cls = classify_desc(par)
                # print("classify_desc: {!r} -> {}".format(par, cls))
                if cls == "tags":
                    if qualifier:
                        qualifier += ", " + par
                    else:
                        qualifier = par
                elif cls == "english":
                    if nonfirst_par:
                        if english:
                            english += "; " + par
                        else:
                            english = par
                    else:
                        if sense:
                            sense += "; " + par
                        else:
                            sense = par
                elif cls == "romanization":
                    roman = par
                elif cls == "taxonomic":
                    taxonomic = par
                elif par.isdigit():
                    idx = int(par) - 1
                    if idx >= 0 and idx < len(pos_datas):
                        d = pos_datas[idx]
                        gl = "; ".join(d.get("glosses", ()))
                        if not gl:
                            wxr.wtp.debug(
                                "parenthesized number "
                                "but the referenced sense has no "
                                "gloss: {}".format(par),
                                sortid="linkages/665",
                            )
                        elif sense:
                            sense += "; " + gl
                        else:
                            sense = gl
                    else:
                        wxr.wtp.debug(
                            "parenthesized number but there is "
                            "no sense with such index: {}".format(par),
                            sortid="linkages/674",
                        )
                else:
                    if alt:
                        alt += "; " + par
                    else:
                        alt = par

        # Handle certain special cases, unless we are parsing
        # script characters.
        if not script_chars:
            # Ignore all linkages with certain prefixes, suffixes, or parts
            # (this is done after removing certain prefixes and suffixes)
            if re.search(linkage_ignore_re, item1):
                continue  # Ignore linkages with certain prefixes

            # Remove certain prefixes from linkages
            m = re.match(linkage_remove_prefixes_re, item1)
            if m:
                prefix = item1[: m.end()]
                item1 = item1[m.end() :]
                if prefix in linkage_remove_prefixes_tags:
                    if qualifier:
                        qualifier += ", " + linkage_remove_prefixes_tags[prefix]
                    else:
                        qualifier = linkage_remove_prefixes_tags[prefix]
                # Recheck ignored linkages
                if re.search(linkage_ignore_re, item1):
                    continue

            # Remove certain suffixes from linkages
            m = re.search(linkage_remove_suffixes_re, item1)
            if m:
                item1 = item1[: m.start()]

            # Parse linkages with the "value = english" syntax (e.g.,
            # väittää/Finnish)
            idx = item1.find(" = ")
            if idx >= 0:
                eng = item1[idx + 3 :]
                if classify_desc(eng, no_unknown_starts=True) == "english":
                    english = eng
                    item1 = item1[:idx]
                else:
                    # Some places seem to use it reversed,
                    # "english = value"
                    eng = item1[:idx]
                    if classify_desc(eng, no_unknown_starts=True) == "english":
                        english = eng
                        item1 = item1[idx + 3 :]

            # Parse linkages with the "value - english" syntax (e.g.,
            # man/Faroese)
            m = re.search(r" [-‐‑‒–—―] ", item1)
            if m and "(" not in item1:
                suffix = item1[m.end() :]
                cls = classify_desc(suffix, no_unknown_starts=True)
                if cls == "english":
                    # This case intentionally ignores old values from english
                    # (otherwise taxonomic lists fail)
                    english = suffix
                    item1 = item1[: m.start()]
                elif cls == "tags":
                    if qualifier:
                        qualifier += ", " + suffix
                    else:
                        qualifier = suffix
                    item1 = item1[: m.start()]

            # Parse certain tags at the end of the linked term (unless
            # we are in a letters list)
            item1, q = parse_head_final_tags(wxr, lang or "MISSING_LANG", item1)
            if q:
                if qualifier:
                    qualifier += ", " + ", ".join(q)
                else:
                    qualifier = ", ".join(q)

            m = re.search(linkage_truncate_re, item1)
            if m:
                # suffix = item1[m.start():]  # Currently ignored
                item1 = item1[: m.start()]
            if not item1:
                continue  # Ignore empty link targets
            if item1 == word:
                continue  # Ignore self-links

        def add(w: str, r: Optional[str]) -> None:
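            """Add a single linkage for word ``w`` with romanization ``r``,
            applying the qualifier/sense/english/taxonomic values collected
            above and splitting alternatives joined by a (fullwidth) solidus
            into separate linkages."""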
            assert isinstance(w, str)
            assert r is None or isinstance(r, str)
            nonlocal alt
            nonlocal taxonomic

            # We remove "*" from the beginning of reconstruction linkages.
            # Such linkages should only occur in reconstruction senses, so
            # this should not cause ambiguity.
            if is_reconstruction and w.startswith("*"):
                w = w[1:]

            # Check if the word contains the Fullwidth Solidus, and if
            # so, split by it and treat the results as alternative
            # linkages.  (This is very commonly used for alternative
            # written forms in Chinese compounds and other linkages.)
            # However, if the word contains a comma, then we won't
            # split, as this is used when we have a different number
            # of romanizations than written forms, and don't know
            # which is which.
            if (
                (not w or "," not in w)
                and (not r or "," not in r)
                and not wxr.wtp.page_exists(w)
            ):
                lst = w.split("／") if len(w) > 1 else [w]
                if len(lst) == 1:
                    lst = w.split(" / ")
                if len(lst) == 1 and len(lst[0]) >= 6:
                    lst = w.split("/")
                if len(lst) > 1:
                    # Treat each alternative as a separate linkage
                    for w in lst:
                        add(w, r)
                    return None

            # Heuristically remove "." at the end of most linkages
            # (some linkage lists end in a period, but we also have
            # abbreviations that end with a period that should be kept)
            if (
                w.endswith(".")
                and not wxr.wtp.page_exists(w)
                and (
                    wxr.wtp.page_exists(w[:-1])
                    or (len(w) >= 5)
                    and "." not in w[:-1]
                )
            ):
                w = w[:-1]

            # If we have roman but not alt and the word is ASCII,
            # move roman to alt.
            if r and not alt and w.isascii():
                alt = r
                r = None
            # Add the linkage
            dt: LinkageData = {}
            if qualifier:
                parse_sense_qualifier(wxr, qualifier, dt)
            if sense:
                dt["sense"] = sense.strip()
            if r:
                dt["roman"] = r.strip()
            if ruby:
                dt["ruby"] = ruby
            if english:
                dt["english"] = english.strip()  # DEPRECATED for "translation"
                dt["translation"] = english.strip()
            if taxonomic:
                if re.match(r"×[A-Z]", taxonomic):
                    data_append(dt, "tags", "extinct")
                    taxonomic = taxonomic[1:]
                dt["taxonomic"] = taxonomic
            if re.match(r"×[A-Z]", w):
                data_append(dt, "tags", "extinct")
                w = w[1:]  # Remove × before dead species names
            if alt and re.match(r"×[A-Z]", alt):
                data_append(dt, "tags", "extinct")
                alt = alt[1:]  # Remove × before dead species names
            if alt and alt.strip() != w:
                dt["alt"] = alt.strip()
            if urls:
                dt["urls"] = [
                    url.strip() for url in urls if url and isinstance(url, str)
                ]
            dt["word"] = w
            for old in data.get(field, ()):  # type: ignore[attr-defined]
                if dt == old:
                    break
            else:
                data_append(data, field, dt)

        # Handle exceptional linkage splits and other linkage
        # conversions (including expanding to variant forms)
        if item1 in linkage_split_exceptions:
            for item2 in linkage_split_exceptions[item1]:
                add(item2, roman)
            continue

        # Various templates for letters in scripts use spaces as
        # separators and also have multiple characters without
        # spaces consecutively.
        v = sense or qualifier
        # print("lang={} v={} script_chars={} item1={!r}"
        #       .format(wxr.wtp.section, v, script_chars, item1))
        if v and script_chars:
            if (
                len(item1.split()) > 1
                or len(list(re.finditer(unicode_dc_re, item1))) == 2
                or (len(subitems) > 10 and v in ("Hiragana", "Katakana"))
            ):
                if v == qualifier:
                    # if sense:
                    #     sense += "; " + qualifier
                    # else:
                    #     sense = qualifier
                    qualifier = None
                if re.search(r" (letters|digits|script)$", v):
                    qualifier = v  # Also parse as a qualifier
                elif re.search(
                    r"Variations of letter |"
                    r"Letters using |"
                    r"Letters of the ",
                    v,
                ):
                    qualifier = "letter"
                parts = item1.split(". ")
                extra: Sequence[str] = ()
                if len(parts) > 1:
                    extra = parts[1:]
                    item1 = parts[0]
                # Handle multi-character names for chars in the language's
                # alphabet, e.g., "Ny ny" in P/Hungarian.
                if (
                    len(subitems) > 20
                    and len(item1.split()) == 2
                    and all(len(x) <= 3 for x in item1.split())
                ):
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r"(\w[\u0300-\u036f]?)+|.", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                else:
                    parts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]?", item1)
                        if not m.group(0).isspace()
                        and m.group(0) not in ("(", ")")
                    )
                for e in extra:
                    idx = e.find(":")
                    if idx >= 0:
                        e = e[idx + 1 :].strip()
                        if e.endswith("."):
                            e = e[:-1]
                        parts.extend(e.split())

                # XXX this is not correct - see P/Vietnamese
                # While some sequences have multiple consecutive
                # characters, others use pairs and some have
                # 2/3 character names, e.g., "Ng ng".

                rparts: Optional[list[Optional[str]]] = None
                if roman:
                    rparts = list(
                        m.group(0)
                        for m in re.finditer(r".[\u0300-\u036f]", roman)
                        if not m.group(0).isspace()
                    )
                    if len(rparts) != len(parts):
                        rparts = None
                if not rparts:
                    rparts = [None] * len(parts)

                for w, r in zip(parts, rparts):
                    add(w, r)
                continue

        add(item1, roman)
    return None