Coverage for src/wiktextract/extractor/fr/inflection.py: 90%
180 statements
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
« prev ^ index » next coverage.py v7.10.6, created at 2025-09-12 08:27 +0000
1import re
2from dataclasses import dataclass
4from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .pronunciation import is_ipa_text
10from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Route an inflection template to the extractor that handles it.

    Templates whose name starts with "en-adj" and the "fro-adj" template
    have dedicated extractors that receive the last entry of ``page_data``;
    every other template is handed to the generic table parser together
    with the whole ``page_data`` list.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    template_name = t_node.template_name
    if template_name.startswith("en-adj"):
        process_en_adj_table(wxr, page_data[-1], t_node)
        return
    if template_name == "fro-adj":
        extract_fro_adj_template(wxr, page_data[-1], t_node)
        return
    process_inflection_table(wxr, page_data, t_node)
# Header texts that are skipped instead of being used as form tags.
# Header text is lower-cased before it is looked up in this set.
# The trailing comment names a template where the header occurs.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
        "temps",  # en-conj-rég
        "cas",  # lt_décl_as, ro-nom-tab (lower case)
        "commun",  # sv-nom-c-ar
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "mutation",  # br-nom
        "nombre",  # ca-accord-mixte2
        "nature",  # de-adj
        "genre",  # es-accord-oa
        "conjugaison présent indicatif",  # avk-tab-conjug
        "mode",  # eo-conj
        "avec suffixes possessifs",  # fi-décl-valo
        "en kurmandji",  # flex-ku-nomf
    )
)
# Lower-cased header texts starting with one of these prefixes are skipped.
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)
# Cell lines that are never stored as a word form (exact match).
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)
# Lower-cased cell lines starting with one of these prefixes are skipped.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)
@dataclass
class TableHeader:
    """Text of a table header cell plus where it starts and how far it spans.

    Used with a column index and a colspan for column headers, and with a
    row index and a rowspan for row headers.
    """

    # cleaned text of the header cell
    text: str
    # column or row index at which the header starts
    index: int
    # number of columns or rows the header covers
    span: int
def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether an ordinary table data cell is really used as a header.

    Only the first non-empty child of the cell is examined. The cell counts
    as a header when that child is a bold node whose cleaned text is not
    empty, starts with an upper-case character and differs from
    ``page_title``. Nodes that are not ``TABLE_CELL`` and cells without any
    non-empty child are never headers.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False
    for first_child in cell_node.filter_empty_str_child():
        # the verdict depends on the first child only, so every path below
        # leaves the loop during its first iteration
        text = clean_node(wxr, None, first_child)
        if not isinstance(first_child, WikiNode):
            return False
        if first_child.kind != NodeKind.BOLD:
            return False
        if len(text) == 0 or not text[0].isupper():
            return False
        return text != page_title
    return False
def _cell_span(table_cell: WikiNode, attr_name: str) -> int:
    """Return the integer value of a "colspan" or "rowspan" cell attribute.

    A missing attribute, or a value that is not a plain decimal number
    after stripping whitespace, yields 1 instead of raising ``ValueError``.
    """
    span_text = str(table_cell.attrs.get(attr_name, "1")).strip()
    return int(span_text) if span_text.isdecimal() else 1


def process_inflection_table(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
) -> None:
    """Extract word forms from the first table of an expanded template.

    The template is expanded and parsed, then the first table is walked row
    by row. Rows without any real data cell provide column headers; header
    cells in other rows provide row headers, which are carried over to the
    following rows when they have a "rowspan" greater than 1. Every line of
    a data cell is either IPA text (stored on the form) or a word form;
    forms are tagged with the matching column and row headers and appended
    to ``page_data[-1].forms``.

    Args:
        wxr: extraction context used to expand and clean wikitext.
        page_data: entries of the current page; the last one receives the
            extracted forms.
        t_node: the inflection template node.
    """
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers = []
    # (header text, number of following rows still covered)
    rowspan_headers = []
    colspan_headers = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # keep header cells and non-empty data cells, drop hidden cells
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        # every element of table_row_nodes is a WikiNode (filtered above)
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header row replaces the previous column headers
            column_headers.clear()
        # start with the headers inherited from earlier rows via rowspan
        row_headers = []
        new_rowspan_headers = []
        for rowspan_text, rowspan_count in rowspan_headers:
            row_headers.append(rowspan_text)
            if rowspan_count - 1 > 0:
                new_rowspan_headers.append((rowspan_text, rowspan_count - 1))
        rowspan_headers = new_rowspan_headers

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(
                    wxr, None, table_cell
                ).replace("\n", " ")
                if (
                    table_header_text.lower() in IGNORE_TABLE_HEADERS
                    or table_header_text.lower().startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    colspan = _cell_span(table_cell, "colspan")
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            TableHeader(
                                table_header_text,
                                column_cell_index,
                                colspan,
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += colspan
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    # only a header spanning more than one row applies to
                    # the rows below; rowspan="1" must not leak downwards
                    remaining_rows = _cell_span(table_cell, "rowspan") - 1
                    if remaining_rows > 0:
                        rowspan_headers.append(
                            (table_header_text, remaining_rows)
                        )
            elif table_cell.kind == NodeKind.TABLE_CELL:
                has_conj_link = False
                for link_node in table_cell.find_child(NodeKind.LINK):
                    if is_conj_link(wxr, link_node):
                        process_conj_link_node(wxr, link_node, page_data)
                        has_conj_link = True
                        break
                if has_conj_link:
                    continue
                form_data = Form()
                table_cell_lines = clean_node(wxr, None, table_cell)
                for table_cell_line in table_cell_lines.splitlines():
                    if is_ipa_text(table_cell_line):
                        insert_ipa(form_data, table_cell_line)
                    elif (
                        table_cell_line != page_data[-1].word
                        and table_cell_line not in IGNORE_TABLE_CELL
                        and not table_cell_line.lower().startswith(
                            IGNORE_TABLE_CELL_PREFIXES
                        )
                    ):
                        # several forms in one cell are kept one per line
                        if form_data.form == "":
                            form_data.form = table_cell_line
                        else:
                            form_data.form += "\n" + table_cell_line
                for colspan_header in colspan_headers:
                    if (
                        column_cell_index >= colspan_header.index
                        and column_cell_index
                        < colspan_header.index + colspan_header.span
                    ):
                        form_data.raw_tags.append(colspan_header.text)
                if (
                    "colspan" not in table_cell.attrs
                    and len(column_headers) > column_cell_index
                    and column_headers[column_cell_index].lower()
                    not in IGNORE_TABLE_HEADERS
                ):
                    form_data.raw_tags.append(
                        column_headers[column_cell_index]
                    )

                if len(row_headers) > 0:
                    form_data.raw_tags.extend(row_headers)
                if form_data.form != "":
                    for form in form_data.form.splitlines():
                        if form.startswith("(") and form.endswith(")"):
                            # a parenthesised line is a tag for the forms
                            # that follow it, not a form of its own
                            form_data.raw_tags.append(form.strip("()"))
                            continue
                        new_form_data = form_data.model_copy(deep=True)
                        new_form_data.form = form.removeprefix("ou ")
                        translate_raw_tags(
                            new_form_data, t_node.template_name
                        )
                        if len(new_form_data.form.strip()) > 0:
                            page_data[-1].forms.append(new_form_data)

                # a data cell always advances the column position
                column_cell_index += _cell_span(table_cell, "colspan")
def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into the individual pronunciations.

    "ou" (French for "or") separates alternatives: a line containing
    " ou " is split on it, and a leading "ou " is removed. A line ending
    with "Prononciation ?\\" yields an empty list; any other line is
    returned unchanged as a single-element list.
    """
    separator = " ou "
    if separator in text:
        # two ipa texts in the same line: "en-conj-rég" template
        return text.split(separator)
    leading_or = "ou "
    if text.startswith(leading_or):
        return [text[len(leading_or):]]
    # inflection table templates use an edit link when the ipa data is
    # missing, and the link usually ends with "Prononciation ?"
    if text.endswith("Prononciation ?\\"):
        return []
    return [text]
def insert_ipa(form: Form, ipa_text: str) -> None:
    """Append the pronunciations found in ``ipa_text`` to ``form.ipas``.

    The text is split with ``split_ipa``; nothing is added when the split
    yields no pronunciation.
    """
    pronunciations = split_ipa(ipa_text)
    if pronunciations:
        form.ipas.extend(pronunciations)
def process_en_adj_table(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: WikiNode
) -> None:
    """Extract forms from the table of an "en-adj*" template.

    The first row of the first table is skipped as the header row. In each
    later row with more than one child, the first child gives the raw tag
    and the second child gives the form and/or IPA lines. A form is added
    to ``word_entry.forms`` only when it is not empty and differs from the
    entry word.
    """
    # https://fr.wiktionary.org/wiki/Modèle:en-adj
    # and other en-adj* templates
    # these templates use normal table cell for column table header
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return
    rows = list(tables[0].find_child(NodeKind.TABLE_ROW))
    # rows[0] is the header row
    for row in rows[1:]:
        cells = row.children
        if len(cells) <= 1:
            continue
        entry_form = Form()
        entry_form.raw_tags.append(clean_node(wxr, None, cells[0]))
        for line in clean_node(wxr, None, cells[1]).splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry_form, line)
            else:
                # when several non-IPA lines exist the last one is kept
                entry_form.form = line
        if len(entry_form.form) > 0 and entry_form.form != word_entry.word:
            translate_raw_tags(entry_form)
            word_entry.forms.append(entry_form)
def extract_fro_adj_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from the table of the "fro-adj" template.

    Each table of the expanded template is read in two passes. The first
    pass collects column headers (header cells of rows without data cells)
    and row headers (header cells of rows with data cells, together with
    their row index and rowspan). The second pass turns every non-empty
    data cell that differs from the page title into a form, tagged with
    the column header at the same position and with every row header whose
    row range overlaps the cell's row range.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj

    def rowspan_of(node) -> int:
        # a "rowspan" value that is not all digits counts as one row
        raw_value = node.attrs.get("rowspan", "1")
        if re.fullmatch(r"\d+", raw_value) is not None:
            return int(raw_value)
        return 1

    root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(t_node), expand_all=True)
    column_titles = []
    row_titles = []
    for table in root.find_child(NodeKind.TABLE):
        # first pass: gather the headers
        for row_no, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            is_data_row = row.contain_node(NodeKind.TABLE_CELL)
            for header_cell in row.find_child(NodeKind.TABLE_HEADER_CELL):
                title = clean_node(wxr, None, header_cell)
                if title == "" or title.lower() in IGNORE_TABLE_HEADERS:
                    continue
                if is_data_row:
                    row_titles.append(
                        TableHeader(title, row_no, rowspan_of(header_cell))
                    )
                else:
                    column_titles.append(title)

        # second pass: build a form from each data cell
        for row_no, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            for col_no, data_cell in enumerate(
                row.find_child(NodeKind.TABLE_CELL)
            ):
                text = clean_node(wxr, None, data_cell)
                if text in ["", wxr.wtp.title]:
                    continue
                form = Form(form=text)
                if col_no < len(column_titles):
                    form.raw_tags.append(column_titles[col_no])
                cell_end_row = row_no + rowspan_of(data_cell)
                for header in row_titles:
                    rows_overlap = (
                        header.index < cell_end_row
                        and row_no < header.index + header.span
                    )
                    if rows_overlap and header.text not in form.raw_tags:
                        form.raw_tags.append(header.text)
                translate_raw_tags(form)
                word_entry.forms.append(form)