Coverage for src/wiktextract/extractor/de/linkage.py: 89%
80 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
3from mediawiki_langcodes import name_to_code
4from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Descendant, Linkage, Translation, WordEntry
9from .tags import translate_raw_tags
10from .utils import extract_sense_index
def extract_linkages(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkages from a section and store them on the word entry.

    Every list item found recursively under ``level_node`` is handed to
    ``process_linkage_list_item``; the accumulated results are then appended
    to the ``word_entry`` attribute whose name is ``linkage_type``.
    """
    collected: list[Linkage] = []
    for item_node in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        process_linkage_list_item(wxr, item_node, collected, linkage_type)
    # The destination list is looked up by name only after all items have
    # been processed.
    destination = getattr(word_entry, linkage_type)
    destination.extend(collected)
def process_linkage_list_item(
    wxr: WiktextractContext,
    list_item_node: WikiNode,
    linkage_list: list[Linkage],
    linkage_type: str,
) -> None:
    """Extract linkages from one list item and append them to linkage_list.

    The children of ``list_item_node`` are scanned in order:

    * a string starting with ``[`` supplies the sense index;
    * a string containing ``,`` or ``;`` resets the pending raw tags;
    * for ``linkage_type == "expressions"``, a string containing a dash
      starts the note: that string and every later child become note nodes;
    * an italic node ending in ``:`` is a raw tag, otherwise its link
      children become linkages;
    * a template whose name ends in ``.`` is a raw tag;
    * a link node becomes a linkage unless it is empty or points to the
      ``Verzeichnis:`` namespace.

    If any note nodes were gathered, their cleaned text is stored as the
    ``note`` of the last element of ``linkage_list``.
    """
    sense_idx = ""
    raw_tags = []
    note_nodes = []
    collecting_note = False

    def add_linkage(term: str) -> None:
        # Build a linkage from the current sense index and pending raw tags.
        entry = Linkage(word=term, sense_index=sense_idx, raw_tags=raw_tags)
        translate_raw_tags(entry)
        linkage_list.append(entry)

    for child in list_item_node.children:
        if collecting_note:
            # Everything after the dash belongs to the note.
            note_nodes.append(child)
            continue

        if isinstance(child, str):
            if child.startswith("["):
                sense_idx, _ = extract_sense_index(child)
            elif "," in child or ";" in child:
                raw_tags.clear()
            if linkage_type == "expressions" and contains_dash(child):
                collecting_note = True
                note_nodes.append(child)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child)
            if italic_text.endswith(":"):
                raw_tags.append(italic_text.strip(": "))
                continue
            for link_node in child.find_child(NodeKind.LINK):
                link_text = clean_node(wxr, None, link_node)
                if link_text != "":
                    add_linkage(link_text)
            continue

        if isinstance(child, TemplateNode) and child.template_name.endswith(
            "."
        ):
            tag_text = clean_node(wxr, None, child).strip(",: ")
            if tag_text != "":
                raw_tags.append(tag_text)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            # https://de.wiktionary.org/wiki/Wiktionary:Verzeichnis
            # ignore index namespace links
            if len(word) > 0 and not word.startswith("Verzeichnis:"):
                add_linkage(word)

    if note_nodes and linkage_list:
        note_text = clean_node(wxr, None, note_nodes)
        linkage_list[-1].note = note_text.strip("–—―‒- ")
def contains_dash(text: str) -> bool:
    """Return True if ``text`` contains a hyphen-minus or one of the dash
    characters ``–``, ``—``, ``―``, ``‒``."""
    dash_match = re.search(r"[–—―‒-]", text)
    # A match object is always truthy; no match yields None.
    return bool(dash_match)
def extract_descendant_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Process every list item of every list directly under ``level_node``.

    Each list item is passed to ``extract_descendant_list_item`` together
    with ``word_entry``.
    """
    # Lazily flatten "lists -> their items" so items are still visited in
    # document order, one list at a time.
    descendant_items = (
        item_node
        for descendant_list in level_node.find_child(NodeKind.LIST)
        for item_node in descendant_list.find_child(NodeKind.LIST_ITEM)
    )
    for item_node in descendant_items:
        extract_descendant_list_item(wxr, word_entry, item_node)
def extract_descendant_list_item(
    wxr: WiktextractContext, word_entry: WordEntry, list_item: WikiNode
) -> None:
    """Extract descendants from one list item into word_entry.descendants.

    The children of ``list_item`` are scanned in order:

    * a string whose stripped text starts with ``[`` supplies the sense
      index used for the descendants that follow;
    * an italic node whose text ends in ``:`` supplies the language name,
      and the language code looked up from it (``"unknown"`` when the
      lookup yields nothing);
    * a link node whose cleaned text is non-empty becomes a descendant;
    * a template whose name starts with ``Ü`` is processed by
      ``process_u_template`` and becomes a descendant when it yields a
      non-empty word.

    Language name and code default to ``"unknown"`` until an italic
    language label is seen.
    """
    lang_name = "unknown"
    lang_code = "unknown"
    sense_index = ""
    for node in list_item.children:
        if isinstance(node, str) and node.strip().startswith("["):
            sense_index, _ = extract_sense_index(node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            node_str = clean_node(wxr, None, node)
            if node_str.endswith(":"):
                lang_name = node_str.strip(": ")
                lang_code = name_to_code(lang_name, "de") or "unknown"
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            node_str = clean_node(wxr, None, node)
            # Check the cleaned text, not the node object: comparing the
            # node itself to "" is always true and let empty words through.
            if node_str != "":
                word_entry.descendants.append(
                    Descendant(
                        lang=lang_name,
                        lang_code=lang_code,
                        word=node_str,
                        sense_index=sense_index,
                    )
                )
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            "Ü"
        ):
            # NOTE(review): imported locally rather than at module level,
            # presumably to avoid a circular import - confirm.
            from .translation import process_u_template

            tr_data = Translation(lang=lang_name, lang_code=lang_code)
            process_u_template(wxr, tr_data, node)
            if tr_data.word != "":
                word_entry.descendants.append(
                    Descendant(
                        lang=tr_data.lang,
                        lang_code=tr_data.lang_code,
                        word=tr_data.word,
                        roman=tr_data.roman,
                        sense_index=sense_index,
                    )
                )