Coverage for src/wiktextract/extractor/fr/gloss.py: 96%
131 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from collections import defaultdict
2from typing import Optional, Union
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import AltForm, Example, Sense, WordEntry
9from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Optional[Sense] = None,
) -> None:
    """
    Create one `Sense` per list item of `list_node` and append it to
    `page_data[-1].senses`.

    For every list item:

    - a template whose expanded text is a single parenthesised group, and
      an italic node placed between "(" and ")" strings, are stored as raw
      tags and left out of the gloss text;
    - the remaining nodes before a "note" template form the gloss text, the
      nodes after that template form `Sense.note`;
    - a nested list whose `sarg` ends with "#" is processed recursively with
      the current sense as `parent_sense`, one whose `sarg` ends with "*" is
      handed to `extract_examples`.

    When `parent_sense` is given, its glosses, tags, raw tags and topics are
    copied to the new sense first. A sense that ends up without any gloss
    gets the "no-gloss" tag.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = Sense()
        if parent_sense is not None:
            gloss_data.glosses.extend(parent_sense.glosses)
            gloss_data.tags.extend(parent_sense.tags)
            gloss_data.raw_tags.extend(parent_sense.raw_tags)
            gloss_data.topics.extend(parent_sense.topics)
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if isinstance(gloss_node, TemplateNode):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` always returns at least one item, so an
                    # empty "()" expansion would add "" as a tag; skip those
                    gloss_data.raw_tags.extend(
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    )
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parentheses strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would be -1 for the first
                # node and wrap around to the last node of the list
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # no `break` here: if there are several "note" templates the last
        # one is used as the split point
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        # everything after the "note" template is the note text
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        # translated only after the nested lists are processed, so child
        # senses copy the untranslated raw tags and translate them themselves
        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) == 0:
            gloss_data.tags.append("no-gloss")
def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """
    Add an `Example` to `gloss_data.examples` for each list item of
    `example_list_node`.

    An item whose first non-empty child is a template named "*exemple" is
    delegated to `process_exemple_template`. Otherwise the example is built
    by hand: a "source" template gives the reference, the items of a nested
    list give the translation, and all other children give the text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes already consumed as reference or translation; they must not
        # end up in the example text
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if child.kind == NodeKind.LIST:
                # each list item overwrites the previous one: last one wins
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)
            elif (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                example.ref = clean_node(wxr, None, child).strip("— ()")
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Optional[Sense],
    time: str = "",
) -> Example:
    """
    Build an `Example` from the parameters of an "exemple" template.

    Template parameters used:

    - 1: example text
    - 2, or "sens" if 2 is missing: translation
    - 3, or "tr" if 3 is missing: transcription, stored in `roman`
    - "source": reference, stored in `ref`

    `time` is copied to the example unchanged. The example is appended to
    `gloss_data.examples` only when `gloss_data` is a `Sense` and the text
    is not empty; it is returned in every case.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters
    # every value is cleaned exactly once: running `clean_node` again on
    # text that is already clean is redundant and could alter it
    example_data = Example(
        text=clean_node(wxr, None, params.get(1, "")),
        translation=clean_node(
            wxr, None, params.get(2, params.get("sens", ""))
        ),
        roman=clean_node(wxr, None, params.get(3, params.get("tr", ""))),
        ref=clean_node(wxr, None, params.get("source", "")),
        time=time,
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # the returned text is discarded; the call is made for the data
        # `clean_node` stores on `gloss_data` (presumably categories)
        clean_node(wxr, gloss_data, node)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    pos_type: str,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text and store alt-of data found in `gloss_nodes` on
    `gloss_data`.

    A "variante *" template supplies the alt-of word (parameter "dif", else
    parameter 1) and the "alt-of" tag; the template is replaced by its
    expansion, from which "désuet" templates are moved to the raw tags. If no
    such word was found and `pos_type` is "typographic variant", the last
    link in the gloss is used as the alt-of word instead.
    """
    alt_word = ""
    kept_nodes = []
    for current in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        is_variant_template = isinstance(
            current, TemplateNode
        ) and current.template_name.startswith("variante ")
        if not is_variant_template:
            kept_nodes.append(current)
            continue

        alt_word = clean_node(
            wxr, None, current.template_parameters.get("dif", "")
        )
        if alt_word == "":
            alt_word = clean_node(
                wxr, None, current.template_parameters.get(1, "")
            )
        if alt_word != "":
            gloss_data.alt_of.append(AltForm(word=alt_word))
            gloss_data.tags.append("alt-of")
        # expand only this template so that templates inside its expansion
        # are still available as nodes
        expansion = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(current),
            pre_expand=True,
            additional_expand={current.template_name},
        )
        for child in expansion.children:
            if (
                isinstance(child, TemplateNode)
                and child.template_name == "désuet"
            ):
                tag_text = clean_node(wxr, gloss_data, child)
                gloss_data.raw_tags.append(tag_text.strip(" ()"))
            else:
                kept_nodes.append(child)

    if alt_word == "" and pos_type == "typographic variant":
        for wiki_node in gloss_nodes:
            if not isinstance(wiki_node, WikiNode):
                continue
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, wiki_node)
            if isinstance(wiki_node, TemplateNode):
                expanded_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
                for link in expanded_root.find_child_recursively(
                    NodeKind.LINK
                ):
                    alt_word = clean_node(wxr, None, link)
        if alt_word != "":
            gloss_data.alt_of.append(AltForm(word=alt_word))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    # unbalanced parentheses: drop the parentheses at both ends of the text
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    gloss_data: Sense,
) -> None:
    """
    Append the word that the gloss is a form of to `gloss_data.form_of`.

    The word is taken from link nodes and from the first parameter of
    "mutation de" and "lien" templates; when several candidates exist the
    last one is used. Nothing is appended if no word was found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    base_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            base_word = clean_node(
                wxr, None, item.template_parameters.get(1, "")
            )
    if base_word != "":
        gloss_data.form_of.append(AltForm(word=base_word))