Coverage for src/wiktextract/extractor/fr/gloss.py: 96%
143 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import calculate_bold_offsets
9from .etymology import ATTESTATION_TEMPLATES, extract_date_template
10from .models import AltForm, AttestationData, Example, Sense, WordEntry
11from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list into ``page_data[-1].senses``.

    Every list item starts from a deep copy of ``parent_sense`` (or from an
    empty ``Sense``) with its examples cleared.  Tag-like nodes are turned
    into attestations, raw tags and categories, the remaining nodes before a
    ``note`` template become the gloss text, and the nodes after it become
    the sense note.  A sense is appended only if it has at least one gloss.

    Nested lists whose marker ends with ``#`` are processed recursively with
    the current sense as parent; nested lists whose marker ends with ``*``
    are handled as example lists.

    Args:
        wxr: Extraction context passed through to the node helpers.
        page_data: Word entries of the page; only the last one is modified.
        list_node: List node whose items are glosses.
        parent_sense: Sense of the enclosing list item, if any.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # the copy must not carry over the parent's examples
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # only a single "(...)" group is treated as a tag list
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop the
                    # empty pieces explicitly instead of checking the length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would wrap around to the
                # last node when the italic node is the first one
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note; there is no `break`, so the last "note" template wins
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
            gloss_data.note = clean_node(
                wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
            ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)
def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Collect the examples of an example list into ``gloss_data.examples``.

    A list item whose first non-empty child is a template with a name ending
    in "exemple" is delegated to `process_exemple_template`.  Any other item
    is read as plain wikitext: a "source" template gives the reference, a
    nested list gives the translation, and the remaining children give the
    example text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes that hold the reference or the translation, not the text
        skipped = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if child.kind == NodeKind.LIST:
                # with several list items the last one is kept
                for translation_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, translation_item.children
                    )
                skipped.append(child)
            elif (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                source_text = clean_node(wxr, None, child)
                example.ref = source_text.strip("— ()")
                skipped.append(child)

        text_nodes = [child for child in children if child not in skipped]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
) -> Example:
    """Build an `Example` from an "exemple"-style template.

    The example text is read from parameter ``1``, the translation from
    ``2`` (falling back to ``sens``), the transcription from ``3`` (falling
    back to ``tr``) and the reference from ``source``.

    Args:
        wxr: Extraction context used to parse and clean the parameters.
        node: The template node to read.
        gloss_data: Sense that receives the example when it is a `Sense` and
            the example text is not empty.  When it is not ``None`` the
            template is also cleaned with it as the data collector.
        attestations: Attestation data stored on the example; an empty list
            is used when omitted.

    Returns:
        The created example, whether or not it was added to ``gloss_data``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    if attestations is None:
        # a fresh list per call; a `[]` default would be one shared object
        attestations = []
    # the parameters are re-parsed so that the parsed nodes can also be
    # passed to `calculate_bold_offsets` below
    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # clean the whole template with the sense as the data collector
        clean_node(wxr, gloss_data, node)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Build the gloss text and record alternative-form data on the sense.

    "variante *" templates add an `alt_of` form and are replaced by their
    expansion without the "désuet" tag template, "équiv-pour" templates add
    forms to the word entry, all other nodes are kept for the gloss text.
    """
    from .form_line import process_equiv_pour_template

    alt_word = ""
    kept_nodes = []
    for gloss_node in gloss_nodes:
        if not isinstance(gloss_node, TemplateNode):
            kept_nodes.append(gloss_node)
            continue

        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if gloss_node.template_name.startswith("variante "):
            params = gloss_node.template_parameters
            alt_word = clean_node(wxr, None, params.get("dif", ""))
            if alt_word == "":
                alt_word = clean_node(wxr, None, params.get(1, ""))
            if alt_word != "":
                gloss_data.alt_of.append(AltForm(word=alt_word))
                gloss_data.tags.append("alt-of")
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for child in expanded.children:
                is_obsolete_tag = (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                )
                if is_obsolete_tag:
                    gloss_data.raw_tags.append(
                        clean_node(wxr, gloss_data, child).strip(" ()")
                    )
                else:
                    kept_nodes.append(child)
        elif gloss_node.template_name == "équiv-pour":
            for form in process_equiv_pour_template(wxr, gloss_node, []):
                form.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form)
        else:
            kept_nodes.append(gloss_node)

    if alt_word == "" and word_entry.pos == "typographic variant":
        for wiki_node in gloss_nodes:
            if not isinstance(wiki_node, WikiNode):
                continue
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, wiki_node)
            search_root = wiki_node
            if isinstance(wiki_node, TemplateNode):
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
            for link in search_root.find_child_recursively(NodeKind.LINK):
                alt_word = clean_node(wxr, None, link)
        if alt_word != "":
            gloss_data.alt_of.append(AltForm(word=alt_word))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unbalanced parentheses: strip them from both ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Add the word this sense is a form of to ``gloss_data.form_of``.

    The word is taken from link nodes and from the first parameter of
    "mutation de" and "lien" templates; the last match wins.  Nothing is
    added when no non-empty word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target_word = ""
    for gloss_node in gloss_nodes:
        if isinstance(gloss_node, WikiNode) and gloss_node.kind == NodeKind.LINK:
            target_word = clean_node(wxr, None, gloss_node)
        elif isinstance(
            gloss_node, TemplateNode
        ) and gloss_node.template_name in ("mutation de", "lien"):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_param = gloss_node.template_parameters.get(1, "")
            target_word = clean_node(wxr, None, first_param)

    if target_word != "":
        gloss_data.form_of.append(AltForm(word=target_word))