Coverage for src/wiktextract/extractor/fr/gloss.py: 94%
148 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import calculate_bold_offsets
9from .etymology import ATTESTATION_TEMPLATES, extract_date_template
10from .models import AltForm, AttestationData, Example, Sense, WordEntry
11from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list and add them to the last word entry.

    For every list item in ``list_node`` a ``Sense`` is built (a deep copy of
    ``parent_sense`` with its examples removed when a parent is given,
    otherwise an empty one). Leading tag/attestation templates and italic
    text wrapped in parentheses are turned into raw tags, the remaining nodes
    become the gloss text, and nodes after a ``note`` template become the
    sense note. Nested lists ending in ``#`` are processed recursively as
    child senses, nested lists ending in ``*`` as examples.

    Args:
        wxr: Extraction context passed through to the helper functions.
        page_data: Word entries of the page; only ``page_data[-1]`` is
            read and modified.
        list_node: The list node whose items are gloss lines.
        parent_sense: Sense of the enclosing list item for nested glosses.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # examples belong to the parent sense only
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # a template expanding to a single "(...)" group is a tag
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # strings explicitly instead of checking the list length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parentheses strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would wrap around to the
                # last node when the italic node is the first one
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note; when several "note" templates exist the last one is used
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        gloss_text = gloss_text.strip("— \n")
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        # senses without any gloss text are not saved
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)
def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Add the examples found in an example list to ``gloss_data``.

    A list item whose first non-empty child is a template with a name ending
    in "exemple" is handed to ``process_exemple_template``. Any other list
    item is treated as plain example text: a "source" template supplies the
    reference, a nested list supplies the translation, and everything else
    is the example text.
    """
    for list_item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(list_item.filter_empty_str_child())
        if len(children) == 0:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes already consumed as reference or translation
        consumed = []
        for child in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                example.ref = clean_node(wxr, None, child).strip("— ()")
                consumed.append(child)
            elif child.kind == NodeKind.LIST:
                # the last nested list item wins
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
) -> Example:
    """Build an ``Example`` from an "exemple" style template.

    Template arguments read:
        * ``1``: example text
        * ``2`` or ``sens``: translation
        * ``3`` or ``tr``: transcription, stored in ``roman``
        * ``source``: reference

    Args:
        wxr: Extraction context.
        node: The example template node.
        gloss_data: Sense that receives the example. The example is only
            appended when its text is not empty. May be ``None``.
        attestations: Attestation data stored on the example; a new empty
            list is used when omitted.

    Returns:
        The created ``Example``, returned even when its text is empty.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    if attestations is None:
        # never share one default list object between calls
        attestations = []

    # arguments are re-parsed so the parsed nodes can also be used to
    # calculate bold offsets below
    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # NOTE(review): presumably expands the whole template to collect its
        # categories into the sense - confirm against `clean_node`
        clean_node(wxr, gloss_data, node)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text and record alternative form data in `gloss_data`.

    "variante *" templates provide the alt-of word and are expanded, the
    "désuet" tag template found in the expansion is saved as a raw tag
    instead of gloss text. "équiv-pour" templates are converted to forms of
    `word_entry` and are left out of the gloss text.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    kept_nodes = []
    for current in gloss_nodes:
        is_template = isinstance(current, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and current.template_name.startswith("variante "):
            # "dif" argument is preferred over the first argument
            alt_of = clean_node(
                wxr, None, current.template_parameters.get("dif", "")
            )
            if alt_of == "":
                alt_of = clean_node(
                    wxr, None, current.template_parameters.get(1, "")
                )
            if alt_of != "":
                gloss_data.alt_of.append(AltForm(word=alt_of))
                gloss_data.tags.append("alt-of")
            expanded_root = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(current),
                pre_expand=True,
                additional_expand={current.template_name},
            )
            for child in expanded_root.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                ):
                    gloss_data.raw_tags.append(
                        clean_node(wxr, gloss_data, child).strip(" ()")
                    )
                else:
                    kept_nodes.append(child)
        elif is_template and current.template_name == "équiv-pour":
            for form in process_equiv_pour_template(wxr, current, []):
                # the sense being built is not in `word_entry.senses` yet
                form.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form)
        else:
            kept_nodes.append(current)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for candidate in gloss_nodes:
            if not isinstance(candidate, WikiNode):
                continue
            # use the last link
            if candidate.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, candidate)
            if isinstance(candidate, TemplateNode):
                candidate = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(candidate), expand_all=True
                )
            for link in candidate.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if alt_of != "":
            gloss_data.alt_of.append(AltForm(word=alt_of))
            if "alt-of" not in gloss_data.tags:
                gloss_data.tags.append("alt-of")

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # remove surrounding parentheses when they are not balanced
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Save the word that a form-of gloss refers to in `gloss_data`.

    The word is taken from the last link node, "mutation de" template or
    "lien" template in `gloss_nodes`; nothing is changed when none of them
    gives a non-empty word.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target_word = ""
    for item in gloss_nodes:
        if isinstance(item, WikiNode) and item.kind == NodeKind.LINK:
            target_word = clean_node(wxr, None, item)
            continue
        # https://fr.wiktionary.org/wiki/Modèle:mutation_de
        if isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            target_word = clean_node(
                wxr, None, item.template_parameters.get(1, "")
            )

    if target_word == "":
        return
    gloss_data.form_of.append(AltForm(word=target_word))
    if "form-of" not in gloss_data.tags:
        gloss_data.tags.append("form-of")