Coverage for src/wiktextract/extractor/nl/linkage.py: 85%
104 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
3from wikitextprocessor import LevelNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Linkage, WordEntry
8from .tags import LIST_ITEM_TAG_TEMPLATES
def extract_linkage_section(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkages from the direct children of a section node.

    Every word found is appended as a ``Linkage`` to the list stored in the
    ``linkage_type`` attribute of ``word_entry``. Templates either update the
    running sense context (``intens``, ``L-top``, ``L-bottom``) or are handed
    to a dedicated extractor (``nld-*``, ``expr``, ``fras``). Bare links are
    recorded directly with the current context, and lists are processed one
    list item at a time.
    """
    current_index = 0
    current_sense = ""
    current_raw_tags = []
    for child in level_node.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name == "intens":
                # https://nl.wiktionary.org/wiki/Sjabloon:intens
                current_raw_tags = ["intensivering"]
                index_text = child.template_parameters.get(2, "").strip()
                if re.fullmatch(r"\d+", index_text):
                    current_index = int(index_text)
            elif name == "L-top":
                # The second argument may look like "[1] gloss": the
                # bracketed number is the sense index, the rest the sense.
                heading = clean_node(
                    wxr, None, child.template_parameters.get(2, "")
                )
                index_match = re.search(r"\[(\d+)\]", heading)
                if index_match is None:
                    current_sense = heading
                else:
                    current_index = int(index_match.group(1))
                    current_sense = heading[index_match.end() :].strip()
            elif name == "L-bottom":
                # End of an L-top block: drop the sense context.
                # The raw tags are left as they are.
                current_sense = ""
                current_index = 0
            elif name.startswith("nld-"):
                extract_nld_template(wxr, word_entry, child, linkage_type)
            elif name in ["expr", "fras"]:
                extract_expr_template(wxr, word_entry, child, linkage_type)
            continue

        if not isinstance(child, WikiNode):
            continue

        if child.kind == NodeKind.LINK:
            word = clean_node(wxr, None, child)
            if word == "":
                continue
            getattr(word_entry, linkage_type).append(
                Linkage(
                    word=word,
                    sense=current_sense,
                    sense_index=current_index,
                    raw_tags=current_raw_tags,
                )
            )
        elif child.kind == NodeKind.LIST:
            # The raw tags are not forwarded to list items.
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                extract_linkage_list_item(
                    wxr,
                    word_entry,
                    list_item,
                    linkage_type,
                    current_sense,
                    current_index,
                )
def extract_linkage_list_item(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    list_item: WikiNode,
    linkage_type: str,
    sense: str,
    sense_index: int,
) -> None:
    """Extract the linkages of one list item into ``word_entry``.

    ``list_item`` is a list-item node whose children are scanned in order;
    ``sense`` and ``sense_index`` are the context inherited from the caller
    and may be overridden by the item's own text. Results are appended to
    the list stored in the ``linkage_type`` attribute of ``word_entry``.

    Handling per child:

    * text containing ``[n]`` sets the sense index to ``n``;
    * text starting with ``=`` or ``–`` turns the rest of the item into the
      sense gloss. If this item already produced a linkage, the gloss is
      attached to the last one; otherwise the non-template nodes before the
      gloss are taken as the word and the function returns;
    * a link becomes a linkage of its own;
    * an ``expr`` template is delegated to ``extract_expr_template``; a
      template listed in ``LIST_ITEM_TAG_TEMPLATES`` contributes a tag.
    """
    linkage_list = getattr(word_entry, linkage_type)
    # Linkages beyond this length were added while handling this item.
    orig_len = len(linkage_list)
    # Tags seen before any linkage exists for this item.
    # NOTE(review): these are only consumed by the plain-text word path
    # below; a link that follows a tag template does not receive them.
    # Confirm whether that is intended.
    tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, str):
            m = re.search(r"\[(\d+)\]", node)
            if m is not None:
                sense_index = int(m.group(1))
            elif node.strip().startswith(("=", "–")):
                # Everything from the separator to the end of the item is
                # the gloss, minus the separator characters and spaces.
                sense = clean_node(wxr, None, list_item.children[index:]).strip(
                    "=– "
                )
                if len(linkage_list) > orig_len:
                    linkage_list[-1].sense = sense
                else:
                    # No link so far: the word is the plain content before
                    # the separator, with tag templates left out.
                    word_nodes = [
                        n
                        for n in list_item.children[:index]
                        if not isinstance(n, TemplateNode)
                    ]
                    word = clean_node(wxr, None, word_nodes)
                    if word != "":
                        linkage_list.append(
                            Linkage(
                                word=word,
                                sense=sense,
                                sense_index=sense_index,
                                tags=tags,
                            )
                        )
                    return
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                linkage_list.append(
                    Linkage(word=word, sense=sense, sense_index=sense_index)
                )
        elif isinstance(node, TemplateNode):
            if node.template_name == "expr":
                extract_expr_template(wxr, word_entry, node, linkage_type)
            elif node.template_name in LIST_ITEM_TAG_TEMPLATES:
                if len(linkage_list) > orig_len:
                    # A tag after a word belongs to that word.
                    linkage_list[-1].tags.append(
                        LIST_ITEM_TAG_TEMPLATES[node.template_name]
                    )
                else:
                    tags.append(LIST_ITEM_TAG_TEMPLATES[node.template_name])
def extract_nld_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract linkages from an expanded ``nld-*`` list template.

    The template is expanded and re-parsed. The sense index comes from the
    first template argument when it is all digits, and the sense from a link
    in the first italic node of the expansion. Every link directly inside a
    list item of the expansion is appended to the ``linkage_type`` list of
    ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:nld-rashonden
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    index_text = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    sense_index = int(index_text) if re.fullmatch(r"\d+", index_text) else 0

    sense = ""
    for italic in expanded.find_child_recursively(NodeKind.ITALIC):
        # Only the first italic node is inspected; if it holds several
        # links the last one wins.
        for sense_link in italic.find_child(NodeKind.LINK):
            sense = clean_node(wxr, None, sense_link)
        break

    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        for word_link in item.find_child(NodeKind.LINK):
            word = clean_node(wxr, None, word_link)
            if word == "":
                continue
            getattr(word_entry, linkage_type).append(
                Linkage(word=word, sense_index=sense_index, sense=sense)
            )
def extract_expr_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
) -> None:
    """Extract one linkage from an ``expr`` or ``fras`` template.

    For ``expr`` the word is argument 1 and the sense argument 2; for any
    other template name handled here they are arguments 2 and 3. The sense
    index is read from the ``n`` argument when it is all digits, and is
    overridden by a number (optionally in square brackets) at the start of
    the word, which is then removed from the word. A non-empty word is
    appended to the ``linkage_type`` list of ``word_entry``.
    """
    # https://nl.wiktionary.org/wiki/Sjabloon:expr
    # https://nl.wiktionary.org/wiki/Sjabloon:fras
    params = t_node.template_parameters
    index_text = clean_node(wxr, None, params.get("n", ""))
    sense_index = int(index_text) if re.fullmatch(r"\d+", index_text) else 0

    word_arg, sense_arg = (1, 2) if t_node.template_name == "expr" else (2, 3)
    sense = clean_node(wxr, None, params.get(sense_arg, ""))
    word = clean_node(wxr, None, params.get(word_arg, ""))

    leading_index = re.match(r"\[?(\d+)\]?", word)
    if leading_index is not None:  # should use "n" arg
        sense_index = int(leading_index.group(1))
        word = word[leading_index.end() :].strip()

    if word == "":
        return
    getattr(word_entry, linkage_type).append(
        Linkage(word=word, sense=sense, sense_index=sense_index)
    )
def extract_fixed_preposition_section(
    wxr: WiktextractContext, word_entry: WordEntry, level_node: LevelNode
) -> None:
    """Add every list item of the section to ``word_entry.derived``.

    The cleaned text of each list item found anywhere below ``level_node``
    is stored as a ``Linkage`` tagged ``prepositional``; items whose cleaned
    text is empty are skipped.
    """
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        text = clean_node(wxr, None, item.children)
        if not text:
            continue
        word_entry.derived.append(Linkage(word=text, tags=["prepositional"]))