Coverage for src/wiktextract/extractor/fr/gloss.py: 95%
168 statements
coverage.py v7.11.0, created at 2025-10-29 08:06 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import calculate_bold_offsets
9from .etymology import ATTESTATION_TEMPLATES, extract_date_template
10from .models import AltForm, AttestationData, Example, Sense, WordEntry
11from .tags import translate_raw_tags
14def extract_gloss(
15 wxr: WiktextractContext,
16 page_data: list[WordEntry],
17 list_node: WikiNode,
18 parent_sense: Sense | None = None,
19) -> None:
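# Walks each list item of the gloss list: attestation templates and other
# leading templates/italics in parentheses are collected as tags, the
# remaining nodes become the gloss text (with an optional trailing "note"
# template), and nested "#" / "*" sub-lists are parsed as child senses and
# examples respectively.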
20 for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
21 gloss_nodes = list(
22 list_item_node.invert_find_child(
23 NodeKind.LIST, include_empty_str=True
24 )
25 )
26 gloss_data = (
27 parent_sense.model_copy(deep=True)
28 if parent_sense is not None
29 else Sense()
30 )
31 gloss_data.examples.clear()
32 # process modifier and theme templates before the gloss text
33 # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
34 tag_indexes = set()
35 for index, gloss_node in enumerate(gloss_nodes):
36 if (
37 isinstance(gloss_node, TemplateNode)
38 and gloss_node.template_name in ATTESTATION_TEMPLATES
39 ):
40 gloss_data.attestations = extract_date_template(
41 wxr, gloss_data, gloss_node
42 )
43 tag_indexes.add(index)
44 elif (
45 isinstance(gloss_node, TemplateNode)
46 and gloss_node.template_name != "équiv-pour"
47 ):
48 categories_data = defaultdict(list)
49 expanded_text = clean_node(wxr, categories_data, gloss_node)
50 if (
51 expanded_text.startswith("(")
52 and expanded_text.endswith(")")
53 and "(" not in expanded_text[1:-1]
54 ):
55 tags = expanded_text.strip("() \n").split(", ")
56 if len(tags) > 0:  # 56 ↛ 58: condition on line 56 was always true
57 gloss_data.raw_tags.extend(tags)
58 if "categories" in categories_data:
59 gloss_data.categories.extend(
60 categories_data["categories"]
61 )
62 tag_indexes.add(index)
63 # if an italic node is between parentheses then it's a tag; also
64 # don't add the parenthesis strings to `gloss_only_nodes`
65 elif (
66 isinstance(gloss_node, WikiNode)
67 and gloss_node.kind == NodeKind.ITALIC
68 and isinstance(gloss_nodes[index - 1], str)
69 and gloss_nodes[index - 1].strip() == "("
70 and index + 1 < len(gloss_nodes)
71 and isinstance(gloss_nodes[index + 1], str)
72 and gloss_nodes[index + 1].strip() == ")"
73 ):
74 gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
75 tag_indexes |= {index - 1, index, index + 1}
77 gloss_only_nodes = [
78 node
79 for index, node in enumerate(gloss_nodes)
80 if index not in tag_indexes
81 ]
82 note_index = len(gloss_only_nodes)
83 for index in range(note_index):
84 if (
85 isinstance(gloss_only_nodes[index], TemplateNode)
86 and gloss_only_nodes[index].template_name == "note"
87 ):
88 note_index = index
89 gloss_text = find_alt_of_form(
90 wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
91 )
92 if "form-of" in page_data[-1].tags:
93 find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
94 gloss_text = gloss_text.strip("— \n")
95 if gloss_text != "":
96 gloss_data.glosses.append(gloss_text)
97 gloss_data.note = clean_node(
98 wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
99 ).strip(" ().")
100 if len(gloss_data.glosses) > 0:
101 page_data[-1].senses.append(gloss_data)
103 for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
104 if nest_gloss_list.sarg.endswith("#"):
105 extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
106 elif nest_gloss_list.sarg.endswith("*"):  # 106 ↛ 103: condition on line 106 was always true
107 for e_list_item in nest_gloss_list.find_child(
108 NodeKind.LIST_ITEM
109 ):
110 extract_example_list_item(wxr, gloss_data, e_list_item)
112 translate_raw_tags(gloss_data)
115def extract_example_list_item(
116 wxr: WiktextractContext, sense: Sense, list_item: WikiNode
117):
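# Builds an Example from one example list item: "exemple"-style templates are
# delegated to process_exemple_template, a "source" template becomes the
# reference, a "Lang" template provides the example text, parenthesized
# template output is kept as raw tags, a nested list holds the translation,
# and any remaining nodes form the example text.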
118 has_exemple_template = False
119 e_data = Example()
120 e_nodes = []
121 raw_tags = []
122 for node in list_item.children:
123 if isinstance(node, TemplateNode):
124 if node.template_name.endswith("exemple"):
125 process_exemple_template(wxr, node, sense, raw_tags=raw_tags)
126 has_exemple_template = True
127 elif node.template_name == "source":
128 e_data.ref = clean_node(wxr, sense, node).strip("— ()")
129 elif node.template_name.lower() == "lang":
130 e_data = extract_lang_example_template(wxr, node)
131 if e_data.text != "":  # 131 ↛ 133: condition on line 131 was always true
132 sense.examples.append(e_data)
133 has_exemple_template = True
134 else:
135 t_text = clean_node(wxr, sense, node)
136 if t_text.startswith("(") and t_text.endswith(")"):  # 136 ↛ 139: condition on line 136 was always true
137 raw_tags.append(t_text.strip("() "))
138 else:
139 e_nodes.append(node)
140 elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
141 for tr_item in node.find_child(NodeKind.LIST_ITEM):
142 e_data.translation = clean_node(wxr, None, tr_item.children)
143 calculate_bold_offsets(
144 wxr,
145 wxr.wtp.parse(wxr.wtp.node_to_wikitext(tr_item.children)),
146 e_data.translation,
147 e_data,
148 "bold_translation_offsets",
149 )
150 else:
151 e_nodes.append(node)
153 if not has_exemple_template:
154 e_data.text = clean_node(wxr, sense, e_nodes)
155 if e_data.text != "":  # 155 ↛ exit: condition on line 155 was always true
156 e_data.raw_tags.extend(raw_tags)
157 translate_raw_tags(e_data)
158 calculate_bold_offsets(
159 wxr,
160 wxr.wtp.parse(wxr.wtp.node_to_wikitext(e_nodes)),
161 e_data.text,
162 e_data,
163 "bold_text_offsets",
164 )
165 sense.examples.append(e_data)
168def process_exemple_template(
169 wxr: WiktextractContext,
170 node: TemplateNode,
171 gloss_data: Sense | None,
172 attestations: list[AttestationData] = [],
173 raw_tags: list[str] = [],
174) -> Example:
175 # https://fr.wiktionary.org/wiki/Modèle:exemple
176 # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
177 # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
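# Parameter mapping: 1 = example text, 2 or "sens" = translation, 3 or "tr" =
# transcription, "source" = reference; each argument is re-parsed so that
# bold offsets can be computed against the cleaned text.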
178 text_arg = wxr.wtp.parse(
179 wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
180 )
181 text = clean_node(wxr, None, text_arg)
182 trans_arg = wxr.wtp.parse(
183 wxr.wtp.node_to_wikitext(
184 node.template_parameters.get(
185 2, node.template_parameters.get("sens", "")
186 )
187 )
188 )
189 translation = clean_node(wxr, None, trans_arg)
190 roman_arg = wxr.wtp.parse(
191 wxr.wtp.node_to_wikitext(
192 node.template_parameters.get(
193 3, node.template_parameters.get("tr", "")
194 )
195 )
196 )
197 transcription = clean_node(wxr, None, roman_arg)
198 source = clean_node(wxr, None, node.template_parameters.get("source", ""))
199 example_data = Example(
200 text=text,
201 translation=translation,
202 roman=transcription,
203 ref=source,
204 attestations=attestations,
205 raw_tags=raw_tags,
206 )
207 calculate_bold_offsets(
208 wxr, text_arg, text, example_data, "bold_text_offsets"
209 )
210 calculate_bold_offsets(
211 wxr, trans_arg, translation, example_data, "bold_translation_offsets"
212 )
213 calculate_bold_offsets(
214 wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
215 )
216 if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
217 gloss_data.examples.append(example_data)
218 if gloss_data is not None:  # 218 ↛ 220: condition on line 218 was always true
219 clean_node(wxr, gloss_data, node)
220 translate_raw_tags(example_data)
221 return example_data
224def find_alt_of_form(
225 wxr: WiktextractContext,
226 gloss_nodes: list[str | WikiNode],
227 word_entry: WordEntry,
228 gloss_data: Sense,
229) -> str:
230 """
231 Return the gloss text, removing tag templates expanded from "variante *" templates.
232 """
233 from .form_line import process_equiv_pour_template
235 alt_of = ""
236 filtered_gloss_nodes = []
237 for gloss_node in gloss_nodes:
238 # https://fr.wiktionary.org/wiki/Modèle:variante_de
239 # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
240 if isinstance(
241 gloss_node, TemplateNode
242 ) and gloss_node.template_name.startswith("variante "):
243 alt_of = clean_node(
244 wxr, None, gloss_node.template_parameters.get("dif", "")
245 )
246 if len(alt_of) == 0:
247 alt_of = clean_node(
248 wxr, None, gloss_node.template_parameters.get(1, "")
249 )
250 if len(alt_of) > 0:  # 250 ↛ 253: condition on line 250 was always true
251 gloss_data.alt_of.append(AltForm(word=alt_of))
252 gloss_data.tags.append("alt-of")
253 expanded_template = wxr.wtp.parse(
254 wxr.wtp.node_to_wikitext(gloss_node),
255 pre_expand=True,
256 additional_expand={gloss_node.template_name},
257 )
258 for node in expanded_template.children:
259 if (
260 isinstance(node, TemplateNode)
261 and node.template_name == "désuet"
262 ):
263 raw_tag = clean_node(wxr, gloss_data, node).strip(" ()")
264 gloss_data.raw_tags.append(raw_tag)
265 else:
266 filtered_gloss_nodes.append(node)
267 elif (
268 isinstance(gloss_node, TemplateNode)
269 and gloss_node.template_name == "équiv-pour"
270 ):
271 for form_data in process_equiv_pour_template(wxr, gloss_node, []):
272 form_data.sense_index = len(word_entry.senses) + 1
273 word_entry.forms.append(form_data)
274 else:
275 filtered_gloss_nodes.append(gloss_node)
277 if alt_of == "" and word_entry.pos == "typographic variant":
278 for gloss_node in filter(
279 lambda n: isinstance(n, WikiNode), gloss_nodes
280 ):
281 # use the last link
282 if gloss_node.kind == NodeKind.LINK:
283 alt_of = clean_node(wxr, None, gloss_node)
284 if isinstance(gloss_node, TemplateNode):
285 gloss_node = wxr.wtp.parse(
286 wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
287 )
288 for link in gloss_node.find_child_recursively(NodeKind.LINK):
289 alt_of = clean_node(wxr, None, link)
290 if len(alt_of) > 0:  # 290 ↛ 295: condition on line 290 was always true
291 gloss_data.alt_of.append(AltForm(word=alt_of))
292 if "alt-of" not in gloss_data.tags: 292 ↛ 295line 292 didn't jump to line 295 because the condition on line 292 was always true
293 gloss_data.tags.append("alt-of")
295 gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
296 gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
297 brackets = 0
298 for char in gloss_text:
299 if char == "(":
300 brackets += 1
301 elif char == ")":
302 brackets -= 1
303 if brackets != 0:  # 303 ↛ 304: condition on line 303 was never true
304 gloss_text = gloss_text.strip(" ()")
305 return gloss_text
308def find_form_of_word(
309 wxr: WiktextractContext,
310 gloss_nodes: list[str | WikiNode],
311 gloss_data: Sense,
312) -> None:
313 # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
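# The form-of target is taken from the gloss nodes: a plain link, or the
# first parameter of a "mutation de" / "lien" template (the last match wins);
# it is stored in form_of and the sense is tagged "form-of".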
314 form_of = ""
315 for node in gloss_nodes:
316 if isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
317 form_of = clean_node(wxr, None, node)
318 elif isinstance(node, TemplateNode):
319 if node.template_name in ("mutation de", "lien"):  # 319 ↛ 315: condition on line 319 was always true
320 # https://fr.wiktionary.org/wiki/Modèle:mutation_de
321 form_of = clean_node(
322 wxr, None, node.template_parameters.get(1, "")
323 )
324 if len(form_of) > 0:  # 324 ↛ exit: condition on line 324 was always true
325 gloss_data.form_of.append(AltForm(word=form_of))
326 if "form-of" not in gloss_data.tags: 326 ↛ exitline 326 didn't return from function 'find_form_of_word' because the condition on line 326 was always true
327 gloss_data.tags.append("form-of")
330def extract_lang_example_template(
331 wxr: WiktextractContext, t_node: TemplateNode
332) -> Example:
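# Minimal handling of the "Lang" template: the example text comes from the
# second positional parameter and its bold offsets are recorded.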
333 text_arg = wxr.wtp.parse(
334 wxr.wtp.node_to_wikitext(t_node.template_parameters.get(2, ""))
335 )
336 text = clean_node(wxr, None, text_arg)
337 e_data = Example(text=text)
338 calculate_bold_offsets(wxr, text_arg, text, e_data, "bold_text_offsets")
339 return e_data