Coverage for src/wiktextract/extractor/de/linkage.py: 89%
60 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from typing import Optional
4from wikitextprocessor.parser import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Linkage, WordEntry
9from .tags import translate_raw_tags
10from .utils import extract_sense_index
def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Gather the linkages found below ``level_node`` into ``word_entry``.

    Every list item found recursively under ``level_node`` is handed to
    ``process_linkage_list_item``; the results are then appended to the
    list stored on ``word_entry`` under the attribute named by
    ``linkage_type``.
    """
    # Collect into a fresh list first so that only linkages from this
    # section are visible to the per-item processing.
    collected = []
    list_items = level_node.find_child_recursively(NodeKind.LIST_ITEM)
    for item_node in list_items:
        process_linkage_list_item(wxr, item_node, collected, linkage_type)
    getattr(word_entry, linkage_type).extend(collected)
def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    linkage_list: list[Linkage],
    linkage_type: str,
) -> None:
    """Turn the children of one list item into ``Linkage`` entries.

    Children are visited in order:

    * a string starting with ``[`` supplies the sense index (via
      ``extract_sense_index``); any other string containing ``,`` or ``;``
      resets the raw tags gathered so far;
    * when ``linkage_type`` is ``"expressions"``, the first string that
      contains a dash and every child after it are kept as note nodes;
    * an italic node whose text ends with ``:`` is a raw tag, otherwise each
      link inside it becomes a linkage;
    * a template whose name ends with ``.`` contributes its cleaned text as
      a raw tag;
    * a link node becomes a linkage unless its text is empty or starts with
      ``Verzeichnis:``.

    New linkages are appended to ``linkage_list``. If note nodes were
    collected and ``linkage_list`` is non-empty, their cleaned text (dashes
    and spaces stripped from both ends) is stored as ``note`` on the last
    entry of ``linkage_list``.
    """
    current_sense = ""
    pending_tags = []
    # Non-empty exactly when a dash has been seen in an "expressions" item;
    # from then on every child belongs to the note.
    trailing_nodes = []

    def add_linkage(term: str, sense: str) -> None:
        # Build one entry with the tags gathered so far and record it.
        entry = Linkage(word=term, sense_index=sense, raw_tags=pending_tags)
        translate_raw_tags(entry)
        linkage_list.append(entry)

    for node in list_item_node.children:
        if trailing_nodes:
            trailing_nodes.append(node)
            continue

        if isinstance(node, str):
            if node.startswith("["):
                current_sense, _ = extract_sense_index(node)
            elif "," in node or ";" in node:
                pending_tags.clear()
            if linkage_type == "expressions" and contains_dash(node):
                trailing_nodes.append(node)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, node)
            if italic_text.endswith(":"):
                pending_tags.append(italic_text.strip(": "))
                continue
            for inner_link in node.find_child(NodeKind.LINK):
                inner_text = clean_node(wxr, None, inner_link)
                if inner_text:
                    add_linkage(inner_text, current_sense)
            continue

        if isinstance(node, TemplateNode) and node.template_name.endswith(
            "."
        ):
            template_tag = clean_node(wxr, None, node).strip(",: ")
            if template_tag:
                pending_tags.append(template_tag)
            continue

        if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            target = clean_node(wxr, None, node)
            # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
            # Links to pages in this namespace are ignored for now;
            # what they contain should be looked into later.
            if target.startswith("Verzeichnis:") or not target:
                continue
            add_linkage(target, current_sense)

    if trailing_nodes and linkage_list:
        note_text = clean_node(wxr, None, trailing_nodes)
        linkage_list[-1].note = note_text.strip("–—―‒- ")
def process_link(
    wxr: WiktextractContext,
    sense_idx: str,
    link_node: WikiNode,
) -> Optional[Linkage]:
    """Build a ``Linkage`` from a link node.

    Returns ``None`` when the cleaned link text is empty or starts with
    ``Verzeichnis:``; otherwise returns a ``Linkage`` carrying that text
    as ``word`` and ``sense_idx`` as ``sense_index``.
    """
    target = clean_node(wxr, None, link_node)
    # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
    # Links to pages in this namespace are ignored for now;
    # what they contain should be looked into later.
    if target and not target.startswith("Verzeichnis:"):
        return Linkage(word=target, sense_index=sense_idx)
    return None
def contains_dash(text: str) -> bool:
    """Tell whether ``text`` contains a dash character.

    The characters matched are the en dash, em dash, horizontal bar,
    figure dash and the ASCII hyphen-minus.
    """
    first_dash = re.search(r"[–—―‒-]", text)
    if first_dash is None:
        return False
    return True