Coverage for src/wiktextract/extractor/fr/gloss.py: 96%
139 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from ..share import calculate_bold_offsets
9from .models import AltForm, Example, Sense, WordEntry
10from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract one sense per list item and add it to ``page_data[-1]``.

    For every ``LIST_ITEM`` child of ``list_node``:

    * start from a deep copy of ``parent_sense`` (with its examples cleared)
      or from an empty ``Sense``;
    * move tag-like nodes into ``raw_tags``: templates (except "équiv-pour")
      whose cleaned text is one parenthesised group, and italic nodes
      placed between a "(" string and a ")" string;
    * split the remaining nodes at a "note" template (the last one if
      several occur): nodes before it give the gloss text, nodes after it
      give ``Sense.note``;
    * append the sense to ``page_data[-1].senses`` when it has a gloss;
    * recurse into nested lists whose ``sarg`` ends with "#" (using the
      current sense as parent) and pass lists ending with "*" to
      ``extract_examples``;
    * finally call ``translate_raw_tags`` on the sense.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # examples belong to the parent sense only, not to its children
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` always returns at least one item, so an
                    # expansion like "()" would otherwise add an empty tag
                    gloss_data.raw_tags.extend(
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    )
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parentheses strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # the first node has no previous sibling; without this guard
                # `index - 1` would wrap around to the last node
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # defaults to "no note": everything is gloss text
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        # glosses may be inherited from the parent sense, so check the list
        # rather than `gloss_text`
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)
def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Collect the examples of an example list into ``gloss_data.examples``.

    A list item whose first non-empty child is a template named
    "*exemple" is delegated to ``process_exemple_template``.  Any other
    item becomes an ``Example`` whose ``ref`` comes from a "source"
    template, whose ``translation`` comes from the items of a nested list,
    and whose ``text`` is the cleaned remainder of the item.
    """
    for list_item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(list_item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes already consumed as ref/translation, excluded from the text
        consumed = []
        for child in list_item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                example.ref = clean_node(wxr, None, child).strip("— ()")
                consumed.append(child)
            elif child.kind == NodeKind.LIST:
                # the last list item wins when there are several
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)

        example.text = clean_node(
            wxr, None, [child for child in children if child not in consumed]
        )
        gloss_data.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    time: str = "",
) -> Example:
    """Build an ``Example`` from an "exemple" style template.

    Text is read from argument 1, the translation from argument 2 (or
    "sens"), the romanization from argument 3 (or "tr") and the reference
    from "source".  Bold offsets are computed for the three text fields.
    When ``gloss_data`` is a ``Sense`` and the example text is not empty the
    example is appended to it; when ``gloss_data`` is not ``None`` the
    template is also cleaned with it as the data collector.  The example is
    always returned.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters

    def parse_argument(value):
        # serialise the argument to wikitext and parse it, giving a node
        # tree that can be passed to `calculate_bold_offsets`
        return wxr.wtp.parse(wxr.wtp.node_to_wikitext(value))

    text_root = parse_argument(params.get(1, ""))
    text = clean_node(wxr, None, text_root)
    translation_root = parse_argument(params.get(2, params.get("sens", "")))
    translation = clean_node(wxr, None, translation_root)
    roman_root = parse_argument(params.get(3, params.get("tr", "")))
    roman = clean_node(wxr, None, roman_root)
    ref = clean_node(wxr, None, params.get("source", ""))

    example_data = Example(
        text=text,
        translation=translation,
        roman=roman,
        ref=ref,
        time=time,
    )
    for root, cleaned, field_name in (
        (text_root, text, "bold_text_offsets"),
        (translation_root, translation, "bold_translation_offsets"),
        (roman_root, roman, "bold_roman_offsets"),
    ):
        calculate_bold_offsets(wxr, root, cleaned, example_data, field_name)

    if isinstance(gloss_data, Sense) and len(example_data.text) > 0:
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        clean_node(wxr, gloss_data, node)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return gloss text, remove tag template expanded from "variante *" templates.

    Side effects: "variante *" templates add an ``AltForm`` and the
    "alt-of" tag to ``gloss_data``; "désuet" templates found in their
    expansion become raw tags; "équiv-pour" templates add forms to
    ``word_entry``.  For a "typographic variant" entry without a
    "variante *" template, the last link found in the gloss is used as the
    alt-of word.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    kept_nodes = []
    for gloss_node in gloss_nodes:
        is_template = isinstance(gloss_node, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and gloss_node.template_name.startswith("variante "):
            params = gloss_node.template_parameters
            # "dif" takes precedence over the first positional argument
            alt_of = clean_node(wxr, None, params.get("dif", ""))
            if alt_of == "":
                alt_of = clean_node(wxr, None, params.get(1, ""))
            if alt_of != "":
                gloss_data.alt_of.append(AltForm(word=alt_of))
                gloss_data.tags.append("alt-of")
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for child in expanded.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                ):
                    gloss_data.raw_tags.append(
                        clean_node(wxr, gloss_data, child).strip(" ()")
                    )
                else:
                    kept_nodes.append(child)
        elif is_template and gloss_node.template_name == "équiv-pour":
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # the sense being built has not been appended yet
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            kept_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for wiki_node in gloss_nodes:
            if not isinstance(wiki_node, WikiNode):
                continue
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, wiki_node)
            link_root = wiki_node
            if isinstance(wiki_node, TemplateNode):
                link_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
            for link in link_root.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if alt_of != "":
            gloss_data.alt_of.append(AltForm(word=alt_of))

    gloss_text = re.sub(
        r"\s+\.$", ".", clean_node(wxr, gloss_data, kept_nodes)
    )
    # unbalanced parentheses: drop the stray ones at both ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Add the base word of a form-of gloss to ``gloss_data.form_of``.

    The word is the cleaned text of the last link node, or the first
    argument of the last "mutation de"/"lien" template, whichever comes
    later in ``gloss_nodes``.  Nothing is added when no word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    base_word = ""
    for node in gloss_nodes:
        is_link = isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, node)
        elif isinstance(node, TemplateNode) and node.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            base_word = clean_node(
                wxr, None, node.template_parameters.get(1, "")
            )
    if base_word:
        gloss_data.form_of.append(AltForm(word=base_word))