Coverage for src/wiktextract/extractor/fr/gloss.py: 96%
137 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1import re
2from collections import defaultdict
4from wikitextprocessor import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import AltForm, Example, Sense, WordEntry
9from .tags import translate_raw_tags
def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Build one ``Sense`` per list item of ``list_node``.

    Each sense is appended to ``page_data[-1].senses``.  Nested lists whose
    marker ends with "#" are processed recursively as child senses; nested
    lists whose marker ends with "*" are handed to ``extract_examples``.

    :param wxr: extraction context passed through to ``clean_node``.
    :param page_data: word entries of the page; only the last one is updated.
    :param list_node: list node whose ``LIST_ITEM`` children are the glosses.
    :param parent_sense: enclosing sense, if any; its glosses, tags, raw tags
        and topics are copied into every sense created here.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = Sense()
        if parent_sense is not None:
            gloss_data.glosses.extend(parent_sense.glosses)
            gloss_data.tags.extend(parent_sense.tags)
            gloss_data.raw_tags.extend(parent_sense.raw_tags)
            gloss_data.topics.extend(parent_sense.topics)

        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        last_index = len(gloss_nodes) - 1
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # a template that expands to a single "(...)" group is a tag
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # pieces explicitly instead of testing the list length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # require real neighbours on both sides: without the lower
                # bound, `index - 1` would wrap around to the last node
                and 0 < index < last_index
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note; when several "note" templates exist the last one wins
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        # raw tags are translated only after the nested lists were processed,
        # so child senses copy this sense's raw tags before translation
        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) == 0:
            gloss_data.tags.append("no-gloss")
def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Append the examples found in ``example_list_node`` to ``gloss_data``.

    A list item whose first non-empty child is a template named
    "...exemple" is delegated to ``process_exemple_template``.  Any other
    item is parsed by hand: a "source" template becomes the reference, a
    nested list provides the translation, and the remaining nodes form the
    example text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes consumed as reference/translation, excluded from the text
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            is_source_template = (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            )
            if is_source_template:
                example.ref = clean_node(wxr, None, child).strip("— ()")
                consumed.append(child)
            elif child.kind == NodeKind.LIST:
                # the last list item wins when there are several
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)
def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    time: str = "",
) -> Example:
    """Convert an "exemple" style template into an ``Example``.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:exemple
    https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    https://fr.wiktionary.org/wiki/Modèle:zh-exemple

    :param wxr: extraction context passed through to ``clean_node``.
    :param node: the template node to read the parameters from.
    :param gloss_data: sense that receives the example (only when its text is
        not empty) and the data collected by cleaning the whole template;
        may be ``None``.
    :param time: value stored unchanged in the ``time`` field of the example.
    :return: the created ``Example``, even when it was not added to a sense.
    """
    params = node.template_parameters
    # every field is cleaned exactly once; cleaning the already cleaned plain
    # text again would re-interpret it as wikitext
    example_data = Example(
        text=clean_node(wxr, None, params.get(1, "")),
        # positional argument 2 takes precedence over the named "sens"
        translation=clean_node(
            wxr, None, params.get(2, params.get("sens", ""))
        ),
        # positional argument 3 takes precedence over the named "tr"
        roman=clean_node(wxr, None, params.get(3, params.get("tr", ""))),
        ref=clean_node(wxr, None, params.get("source", "")),
        time=time,
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded: the call is made only for what `clean_node`
        # records on `gloss_data` while cleaning the template
        clean_node(wxr, gloss_data, node)
    return example_data
def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text and record alternative-form data on `gloss_data`.

    "variante *" templates provide the `alt_of` word and are replaced by
    their expansion, from which "désuet" tag templates are removed and stored
    as raw tags. "équiv-pour" templates are turned into forms of `word_entry`
    and left out of the gloss text.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    kept_nodes = []
    for gloss_node in gloss_nodes:
        is_template = isinstance(gloss_node, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and gloss_node.template_name.startswith("variante "):
            params = gloss_node.template_parameters
            # the "dif" argument is preferred, argument 1 is the fallback
            alt_of = clean_node(wxr, None, params.get("dif", ""))
            if alt_of == "":
                alt_of = clean_node(wxr, None, params.get(1, ""))
            if alt_of != "":
                gloss_data.alt_of.append(AltForm(word=alt_of))
                gloss_data.tags.append("alt-of")
            # expand only this template so the templates it emits stay
            # visible as nodes and can be filtered below
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for child in expanded_template.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                ):
                    gloss_data.raw_tags.append(
                        clean_node(wxr, gloss_data, child).strip(" ()")
                    )
                else:
                    kept_nodes.append(child)
        elif is_template and gloss_node.template_name == "équiv-pour":
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # 1-based index of the sense currently being built
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            kept_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for gloss_node in gloss_nodes:
            if not isinstance(gloss_node, WikiNode):
                continue
            # use the last link
            if gloss_node.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, gloss_node)
            search_root = gloss_node
            if isinstance(gloss_node, TemplateNode):
                # links produced by a template only exist after expansion
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
                )
            for link in search_root.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if alt_of != "":
            gloss_data.alt_of.append(AltForm(word=alt_of))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    # remove whitespace left in front of the final full stop
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unequal bracket counts mean a stray parenthesis was left behind by a
    # removed node; strip parentheses and spaces from both ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text
def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Record on ``gloss_data`` the word this sense is a form of.

    The word is the text of the last link node, or the first argument of the
    last "mutation de"/"lien" template, whichever comes later in
    ``gloss_nodes``.  Nothing is recorded when no such word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    # https://fr.wiktionary.org/wiki/Modèle:mutation_de
    link_templates = ("mutation de", "lien")
    target = ""
    for candidate in gloss_nodes:
        if isinstance(candidate, WikiNode) and candidate.kind == NodeKind.LINK:
            target = clean_node(wxr, None, candidate)
        elif (
            isinstance(candidate, TemplateNode)
            and candidate.template_name in link_templates
        ):
            target = clean_node(
                wxr, None, candidate.template_parameters.get(1, "")
            )
    if target != "":
        gloss_data.form_of.append(AltForm(word=target))