Coverage for src/wiktextract/extractor/fr/gloss.py: 95%
156 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import calculate_bold_offsets
9from .etymology import ATTESTATION_TEMPLATES, extract_date_template
10from .models import AltForm, AttestationData, Example, Sense, WordEntry
11from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list into the last entry of ``page_data``.

    Every list item of ``list_node`` produces one ``Sense``.  When
    ``parent_sense`` is given, the new sense starts as a deep copy of it with
    the examples cleared; otherwise it starts empty.  The sense is appended to
    ``page_data[-1].senses`` only if at least one gloss text was found.

    Nested lists inside a list item are handled by their marker: a marker
    ending with "#" is processed recursively as child glosses (with the
    current sense as parent), a marker ending with "*" is processed as a list
    of examples for the current sense.

    Args:
        wxr: Extraction context passed through to the helper functions.
        page_data: Word entries of the page; only the last one is modified.
        list_node: The gloss list node whose items are processed.
        parent_sense: Sense of the enclosing list item for nested glosses.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        # everything in the list item except nested lists
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # a template that expands to a single "(...)" group is a tag
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # strings explicitly (e.g. a template expanding to "()")
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would be -1 for the first
                # node and silently inspect the last node instead
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note; there is no early exit, so the last "note" template wins
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        gloss_text = gloss_text.strip("— \n")
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                for e_list_item in nest_gloss_list.find_child(
                    NodeKind.LIST_ITEM
                ):
                    extract_example_list_item(wxr, gloss_data, e_list_item)

        translate_raw_tags(gloss_data)
def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
):
    """Extract one example from an example list item and add it to ``sense``.

    Templates whose name ends with "exemple" are delegated to
    ``process_exemple_template``; in that case nothing else is added by this
    function.  Otherwise the example text is built from the remaining child
    nodes, a "source" template supplies the reference, templates expanding
    to "(...)" supply raw tags and items of a nested list supply the
    translation.  The example is appended only when its text is not empty.
    """
    example = Example()
    text_nodes = []
    pending_tags = []
    used_exemple_template = False

    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                # nested list items hold the translation; the last one wins
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
            else:
                text_nodes.append(child)
            continue

        template_name = child.template_name
        if template_name.endswith("exemple"):
            process_exemple_template(wxr, child, sense, raw_tags=pending_tags)
            used_exemple_template = True
            continue
        if template_name == "source":
            example.ref = clean_node(wxr, sense, child).strip("— ()")
            continue

        expanded = clean_node(wxr, sense, child)
        if expanded.startswith("(") and expanded.endswith(")"):
            pending_tags.append(expanded.strip("() "))
        else:
            text_nodes.append(child)

    if used_exemple_template:
        return

    example.text = clean_node(wxr, sense, text_nodes)
    if example.text == "":
        return

    example.raw_tags.extend(pending_tags)
    translate_raw_tags(example)
    reparsed_text = wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_nodes))
    calculate_bold_offsets(
        wxr, reparsed_text, example.text, example, "bold_text_offsets"
    )
    sense.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
    raw_tags: list[str] | None = None,
) -> Example:
    """Build an ``Example`` from an "exemple"-style template.

    The text is read from parameter 1, the translation from parameter 2 (or
    "sens"), the romanization from parameter 3 (or "tr") and the reference
    from "source".  Bold offsets are computed for text, translation and
    romanization.

    Args:
        wxr: Extraction context.
        node: The example template node.
        gloss_data: Sense that receives the example (when its text is not
            empty) and the data collected by cleaning ``node``; may be
            ``None`` to only build and return the example.
        attestations: Attestation data stored on the example; defaults to an
            empty list.
        raw_tags: Raw tags stored on the example; defaults to an empty list.

    Returns:
        The created example, after ``translate_raw_tags`` was applied to it.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    # `None` defaults avoid sharing one mutable list between calls
    if attestations is None:
        attestations = []
    if raw_tags is None:
        raw_tags = []

    params = node.template_parameters
    text_arg = wxr.wtp.parse(wxr.wtp.node_to_wikitext(params.get(1, "")))
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(params.get(2, params.get("sens", "")))
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(params.get(3, params.get("tr", "")))
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, params.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
        raw_tags=raw_tags,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded: the call is made only so that data collected
        # while cleaning the whole template is stored on `gloss_data`
        clean_node(wxr, gloss_data, node)
    translate_raw_tags(example_data)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """Return the gloss text and record alternative-form data on the sense.

    Templates whose name starts with "variante " provide the ``alt_of`` word
    (parameter "dif", falling back to parameter 1); they are expanded and
    any "désuet" template in the expansion is turned into a raw tag instead
    of gloss text.  "équiv-pour" templates are converted to forms of
    ``word_entry`` and left out of the gloss text.  For entries whose POS is
    "typographic variant" and that have no "variante" word, the last link
    found in the gloss nodes is used as the ``alt_of`` word.

    Args:
        wxr: Extraction context.
        gloss_nodes: Nodes of the gloss, without tag nodes and note.
        word_entry: Entry that receives forms from "équiv-pour" templates.
        gloss_data: Sense that receives ``alt_of``, tags and raw tags.

    Returns:
        The cleaned gloss text; surrounding spaces and parentheses are
        stripped when the parentheses in the text are unbalanced.
    """
    # function-level import kept from the original code; presumably it
    # avoids a circular import with `form_line` - verify before moving it
    from .form_line import process_equiv_pour_template

    alt_of = ""
    filtered_gloss_nodes = []
    for gloss_node in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if isinstance(
            gloss_node, TemplateNode
        ) and gloss_node.template_name.startswith("variante "):
            alt_of = clean_node(
                wxr, None, gloss_node.template_parameters.get("dif", "")
            )
            if len(alt_of) == 0:
                alt_of = clean_node(
                    wxr, None, gloss_node.template_parameters.get(1, "")
                )
            if len(alt_of) > 0:
                gloss_data.alt_of.append(AltForm(word=alt_of))
                # the tag may already be present (copied from a parent sense
                # or added by an earlier "variante" template)
                if "alt-of" not in gloss_data.tags:
                    gloss_data.tags.append("alt-of")
            # expand only this template so that inner templates stay
            # available as nodes
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for node in expanded_template.children:
                if (
                    isinstance(node, TemplateNode)
                    and node.template_name == "désuet"
                ):
                    raw_tag = clean_node(wxr, gloss_data, node).strip(" ()")
                    gloss_data.raw_tags.append(raw_tag)
                else:
                    filtered_gloss_nodes.append(node)
        elif (
            isinstance(gloss_node, TemplateNode)
            and gloss_node.template_name == "équiv-pour"
        ):
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # the sense being built has not been appended yet
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            filtered_gloss_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for gloss_node in gloss_nodes:
            if not isinstance(gloss_node, WikiNode):
                continue
            # use the last link
            if gloss_node.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, gloss_node)
            search_root = gloss_node
            if isinstance(gloss_node, TemplateNode):
                # links produced by a template only exist after expansion
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
                )
            for link in search_root.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if len(alt_of) > 0:
            gloss_data.alt_of.append(AltForm(word=alt_of))
            if "alt-of" not in gloss_data.tags:
                gloss_data.tags.append("alt-of")

    gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
    # remove whitespace before the final period
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unbalanced parentheses are leftovers of removed tags: strip them
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Record the base word of an inflected form on ``gloss_data``.

    The base word is taken from link nodes and from parameter 1 of the
    "mutation de" and "lien" templates; when several candidates exist the
    last one is used.  If a non-empty word is found it is appended to
    ``gloss_data.form_of`` and the "form-of" tag is added once.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    base_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_param = item.template_parameters.get(1, "")
            base_word = clean_node(wxr, None, first_param)

    if not base_word:
        return

    gloss_data.form_of.append(AltForm(word=base_word))
    if "form-of" not in gloss_data.tags:
        gloss_data.tags.append("form-of")