Coverage for src/wiktextract/extractor/fr/conjugation.py: 94%
180 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor.parser import (
2 LEVEL_KIND_FLAGS,
3 HTMLNode,
4 NodeKind,
5 TemplateNode,
6 WikiNode,
7)
9from ...page import clean_node
10from ...wxr_context import WiktextractContext
11from .models import Form, WordEntry
12from .tags import translate_raw_tags
def extract_conjugation(
    wxr: WiktextractContext,
    entry: WordEntry,
    conj_page_title: str,
    select_tab: str = "1",
) -> None:
    """
    Load a page from the "Conjugaison" namespace and extract its templates.

    Each top-level template of the page is dispatched to the matching
    handler according to its name; a transcluded "Conjugaison:" page is
    followed recursively. Nothing happens when the page body is missing.

    https://fr.wiktionary.org/wiki/Conjugaison:français
    https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Français/Conjugaison
    https://fr.wiktionary.org/wiki/Aide:Conjugaisons
    """
    namespace_id = wxr.wtp.NAMESPACE_DATA["Conjugaison"]["id"]
    page_body = wxr.wtp.get_page_body(conj_page_title, namespace_id)
    if page_body is None:
        return

    page_root = wxr.wtp.parse(page_body)
    for template in page_root.find_child(NodeKind.TEMPLATE):
        name = template.template_name
        if name.endswith("-intro"):
            continue

        if "-conj" in name:
            process_conj_template(wxr, entry, template, conj_page_title)
            continue

        if name == "Onglets conjugaison":
            process_onglets_conjugaison_template(
                wxr, entry, template, conj_page_title, select_tab
            )
            continue

        transcluded_title = name.removeprefix(":")
        if transcluded_title.startswith("Conjugaison:"):
            # Another conjugation page is transcluded: follow it, passing
            # the "sél" argument as the tab to select ("2" when absent).
            tab_arg = template.template_parameters.get("sél", "2")
            extract_conjugation(
                wxr,
                entry,
                transcluded_title,
                clean_node(wxr, None, tab_arg),
            )
            continue

        # "ja-flx-adj" must be tested before the generic "ja-" prefix.
        if name.startswith("ja-flx-adj"):
            proces_ja_flx_adj_template(wxr, entry, template, conj_page_title)
        elif name.startswith("ja-"):
            proces_ja_conj_template(wxr, entry, template, conj_page_title)
def process_onglets_conjugaison_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: TemplateNode,
    conj_page_title: str,
    select_tab: str,
) -> None:
    """
    Extract the "-conj" templates held in the tabs of "Onglets conjugaison".

    Only the tab `select_tab` is read when a tab other than "1" is
    requested, or when the first tab ("onglet1") is titled
    "Conjugaison active"; otherwise tabs 1 to 6 are read in order. Reading
    stops at the first missing "contenuN" argument.

    https://fr.wiktionary.org/wiki/Modèle:Onglets_conjugaison
    """
    # this template expands to two tabs of tables
    params = node.template_parameters
    restrict_to_selected = (
        select_tab != "1"
        or clean_node(wxr, None, params.get("onglet1", ""))
        == "Conjugaison active"
    )
    if restrict_to_selected:
        tab_numbers = [select_tab]
    else:
        tab_numbers = list(map(str, range(1, 7)))

    for tab_number in tab_numbers:
        content_key = f"contenu{tab_number}"
        if content_key not in params:
            break
        content = params[content_key]
        # A tab's content is either a single template node or a list of
        # nodes; any other value is ignored.
        if isinstance(content, TemplateNode):
            candidates = [content]
        elif isinstance(content, list):
            candidates = content
        else:
            candidates = []
        for candidate in candidates:
            if not isinstance(candidate, TemplateNode):
                continue
            if "-conj" in candidate.template_name:
                process_conj_template(wxr, entry, candidate, conj_page_title)
def process_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "-conj" template and extract forms from the expanded tree.

    https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_conjugaison_en_français
    https://fr.wiktionary.org/wiki/Modèle:fr-conj-1-ger
    """
    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    process_expanded_conj_template(wxr, entry, expanded_root, conj_page_title)
def process_expanded_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    node: WikiNode,
    conj_page_title: str,
) -> None:
    """
    Walk an expanded conjugation template and process its table blocks.

    Level (heading) nodes are descended into recursively. Among HTML
    children, an <h3> sets the current section title and each following
    <div> is processed as a table block belonging to that section.
    """
    section_title = ""
    for child in node.find_child(NodeKind.HTML | LEVEL_KIND_FLAGS):
        if child.kind in LEVEL_KIND_FLAGS:
            process_expanded_conj_template(wxr, entry, child, conj_page_title)
            continue
        if child.kind != NodeKind.HTML:
            continue

        if child.tag == "h3":
            section_title = clean_node(wxr, None, child)
        elif child.tag == "div":
            # The "Modes impersonnels" section has its own table layout.
            if section_title == "Modes impersonnels":
                process_fr_conj_modes_table(
                    wxr, entry, child, conj_page_title
                )
            else:
                process_fr_conj_table(
                    wxr, entry, child, section_title, conj_page_title
                )
def process_fr_conj_modes_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    conj_page_title: str,
) -> None:
    """
    Extract forms from the first "Modes impersonnels" table.

    The first row of every table is skipped as a header. In each other row,
    cell 0 gives a raw tag, every later cell whose index is a multiple of 3
    gives the IPA and closes the form being built, and the remaining cells
    are joined into the form text. A form whose text already exists in
    `entry.forms` is not added again.
    """
    seen_forms = {existing.form for existing in entry.forms}

    for table in div_node.find_child(NodeKind.TABLE):
        rows = iter(table.find_child(NodeKind.TABLE_ROW))
        next(rows, None)  # skip header
        for row in rows:
            pending_text = ""
            row_tags = []
            for position, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                if position == 0:
                    row_tags.append(clean_node(wxr, None, cell))
                    continue

                if position % 3 != 0:
                    # Pieces are separated by a space unless the text so
                    # far ends with an apostrophe.
                    if pending_text and not pending_text.endswith("’"):
                        pending_text += " "
                    pending_text += clean_node(wxr, None, cell)
                    continue

                # Cell 3 closes the "Présent" form, later ones "Passé".
                tense = "Présent" if position == 3 else "Passé"
                form = Form(
                    form=pending_text,
                    raw_tags=[*row_tags, tense],
                    ipas=[clean_node(wxr, None, cell)],
                    source=conj_page_title,
                )
                translate_raw_tags(form)
                if form.form not in seen_forms:
                    entry.forms.append(form)
                    seen_forms.add(form.form)
                pending_text = ""
def process_fr_conj_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    div_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
) -> None:
    """
    Find the tables nested in the cells of a section's outer table.

    Every cell of every row of each wiki table under `div_node` is scanned;
    a nested HTML <table> and a nested wiki table are each handed to their
    own extraction function together with the section title `h3_text`.
    """
    for outer_table in div_node.find_child(NodeKind.TABLE):
        for row in outer_table.find_child(NodeKind.TABLE_ROW):
            for cell in row.find_child(NodeKind.TABLE_CELL):
                for inner in cell.children:
                    if not isinstance(inner, WikiNode):
                        continue
                    if inner.kind == NodeKind.HTML and inner.tag == "table":
                        process_fr_conj_html_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
                    elif inner.kind == NodeKind.TABLE:
                        process_fr_conj_wiki_table(
                            wxr, entry, inner, h3_text, conj_page_title
                        )
def process_fr_conj_html_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: HTMLNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per <tr> of a nested HTML conjugation table.

    The text of the first <tr> becomes a raw tag next to `h3_text`. In each
    following <tr> the first two <td> cells are concatenated into the form
    text and all later cells are concatenated into a single IPA string.
    """
    tr_nodes = iter(table_node.find_html_recursively("tr"))
    header_tr = next(tr_nodes, None)
    if header_tr is None:
        return
    tags = [h3_text, clean_node(wxr, None, header_tr.children)]

    for tr_node in tr_nodes:
        form = Form(raw_tags=tags, source=conj_page_title)
        for position, td_node in enumerate(
            tr_node.find_html_recursively("td")
        ):
            text = clean_node(wxr, None, td_node)
            if position >= 2:
                if form.ipas:
                    form.ipas[0] += text
                else:
                    # First IPA piece: add a space after it unless it
                    # ends with the "‿" mark.
                    form.ipas.append(
                        text if text.endswith("‿") else text + " "
                    )
                continue

            form.form += text
            # Space after the first cell unless it ends with an apostrophe.
            if position == 0 and not text.endswith("’"):
                form.form += " "

        translate_raw_tags(form)
        entry.forms.append(form)
def process_fr_conj_wiki_table(
    wxr: WiktextractContext,
    entry: WordEntry,
    table_node: WikiNode,
    h3_text: str,
    conj_page_title: str,
):
    """
    Extract one form per row of a nested wiki-syntax conjugation table.

    The text of the first row becomes a raw tag next to `h3_text`. In each
    following row the first two cells build the form text ("—" cells are
    ignored) and every later cell is appended as an IPA. Rows that yield
    an empty form text are dropped.
    """
    rows = iter(table_node.find_child(NodeKind.TABLE_ROW))
    header_row = next(rows, None)
    if header_row is None:
        return
    tags = [h3_text, clean_node(wxr, None, header_row.children)]

    for row in rows:
        form = Form(raw_tags=tags, source=conj_page_title)
        for column, cell in enumerate(row.find_child(NodeKind.TABLE_CELL)):
            text = clean_node(wxr, None, cell)
            if column >= 2:
                form.ipas.append(text)
                continue
            if text == "—":
                continue
            # Text starting with "-" attaches directly to what precedes
            # it, so drop the separating space added earlier.
            if text.startswith("-"):
                form.form = form.form.strip()
            form.form += text
            if column == 0 and text:
                form.form += " "

        if form.form:
            translate_raw_tags(form)
            entry.forms.append(form)
def proces_ja_flx_adj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a "ja-flx-adj" template and extract forms from its tables.

    A header cell sets the tag used for the rows after it. In a data row,
    the lines of cell 0 are extra raw tags, and lines of cells 1, 2 and 3
    fill `form`, `hiragana` and `roman` of the form at the same line index.

    https://fr.wiktionary.org/wiki/Modèle:ja-adj
    https://fr.wiktionary.org/wiki/Modèle:ja-flx-adj-な
    """
    column_fields = {1: "form", 2: "hiragana", 3: "roman"}
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL

    template_wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_root = wxr.wtp.parse(template_wikitext, expand_all=True)
    for table in expanded_root.find_child(NodeKind.TABLE):
        section_tag = ""
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_forms = []
            # Captured before the row's cells are read, so a header cell
            # in this row only affects the rows after it.
            row_tags = [section_tag]
            for column, cell in enumerate(row.find_child(cell_kinds)):
                cell_text = clean_node(wxr, None, cell)
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    section_tag = cell_text
                    continue

                lines = cell_text.splitlines()
                if column == 0:
                    row_tags.extend(lines)
                    continue

                field_name = column_fields.get(column)
                for line_number, line in enumerate(lines):
                    # One form per line; create it on first encounter.
                    if line_number >= len(row_forms):
                        row_forms.append(
                            translate_raw_tags(
                                Form(raw_tags=row_tags, source=conj_page_title)
                            )
                        )
                    if field_name is not None:
                        setattr(row_forms[line_number], field_name, line)

            entry.forms.extend(row_forms)
def proces_ja_conj_template(
    wxr: WiktextractContext,
    entry: WordEntry,
    template_node: TemplateNode,
    conj_page_title: str,
) -> None:
    """
    Expand a Japanese verb conjugation template and extract its forms.

    For every table of the expanded template:

    * a row made only of header cells, with more than one child, is
      skipped (header row of the "Clefs de constructions" table);
    * a header cell that is the only child of its row sets the tag used
      for the rows after it;
    * any other header cell is a row header, applied as a raw tag to as
      many rows as its ``rowspan`` attribute says;
    * data cells 0, 1 and 2 fill ``form``, ``hiragana`` and ``roman``.

    Rows that yield an empty ``form`` are not added to ``entry.forms``.

    https://fr.wiktionary.org/wiki/Modèle:ja-verbe-conj
    Modèle:ja-在る
    """
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_template.find_child(NodeKind.TABLE):
        first_tag = ""
        # row header text -> number of rows (current one included) that
        # the header still applies to
        row_headers = {}
        for row in table_node.find_child(NodeKind.TABLE_ROW):
            if (
                all(
                    isinstance(c, WikiNode)
                    and c.kind == NodeKind.TABLE_HEADER_CELL
                    for c in row.children
                )
                and len(row.children) > 1
            ):
                # skip header row of the "Clefs de constructions" table
                continue

            for header in row.find_child(NodeKind.TABLE_HEADER_CELL):
                header_text = clean_node(wxr, None, header)
                if len(row.children) == 1:
                    first_tag = header_text
                else:
                    # The attribute comes from wiki markup and may be
                    # malformed; treat an unparsable or negative value as
                    # a span of one row instead of aborting extraction.
                    try:
                        rowspan = int(header.attrs.get("rowspan", "1"))
                    except ValueError:
                        rowspan = 1
                    if rowspan < 0:
                        rowspan = 1
                    row_headers[header_text] = rowspan

            tags = [first_tag]
            # Iterate over a snapshot: expired headers are deleted below.
            for tag, rowspan in list(row_headers.items()):
                tags.append(tag)
                if rowspan == 1:
                    del row_headers[tag]
                else:
                    row_headers[tag] = rowspan - 1

            form = Form(raw_tags=tags, source=conj_page_title)
            for cell_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_index == 0:
                    form.form = cell_text
                elif cell_index == 1:
                    form.hiragana = cell_text
                elif cell_index == 2:
                    form.roman = cell_text
            if form.form:
                translate_raw_tags(form)
                entry.forms.append(form)