Coverage for src/wiktextract/extractor/fr/inflection.py: 89%
142 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from dataclasses import dataclass
3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .pronunciation import is_ipa_text
9from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Dispatch an inflection template to the matching table processor.

    Inflection templates:
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français

    Templates whose name starts with ``en-adj`` are handled by
    ``process_en_adj_table``; every other template goes through
    ``process_inflection_table``.  Both processors receive the same three
    arguments and append to ``page_data[-1].forms``.
    """
    is_en_adj = template_node.template_name.startswith("en-adj")
    handler = process_en_adj_table if is_en_adj else process_inflection_table
    handler(wxr, page_data, template_node)
# Header texts that carry no grammatical information.  They are compared
# against the *lower-cased* header text, so every entry must be lower case.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "avec suffixes possessifs",  # fi-décl-valo
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "conjugaison présent indicatif",  # avk-tab-conjug
        "en kurmandji",  # flex-ku-nomf
        "forme",  # br-flex-adj
        "genre",  # es-accord-oa
        "mode",  # eo-conj
        "mutation",  # br-nom
        "nature",  # de-adj
        "nombre",  # ca-accord-mixte2
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "temps",  # en-conj-rég,
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
    )
)

# Lower-case prefixes of header texts to skip; kept as a tuple because it is
# passed directly to ``str.startswith``.
IGNORE_TABLE_HEADER_PREFIXES = (
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
)

# Data cell lines that are not word forms (compared case-sensitively).
IGNORE_TABLE_CELL = frozenset(
    (
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    )
)

# Lower-case prefixes of data cell lines to skip; a tuple for
# ``str.startswith``.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)
@dataclass
class ColspanHeader:
    """A column header cell that carries a ``colspan`` attribute.

    Constructed positionally as ``ColspanHeader(text, index, span)``, so the
    field order below must not change.
    """

    # cleaned header text; appended to a form's raw tags when it applies
    text: str
    # column index at which the header starts
    index: int
    # number of columns covered, starting at ``index``
    span: int
def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether an ordinary table cell is really being used as a header.

    Only the first non-empty child of the cell is inspected.  The cell counts
    as a header when that child is a bold node whose cleaned text is
    non-empty, starts with an upper-case character and differs from
    ``page_title``.  Nodes that are not ``TABLE_CELL`` and cells without any
    non-empty child yield ``False``.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False
    for first_child in cell_node.filter_empty_str_child():
        # the loop body always returns: only the first child decides
        text = clean_node(wxr, None, first_child)
        if not isinstance(first_child, WikiNode):
            return False
        if first_child.kind != NodeKind.BOLD:
            return False
        return bool(text) and text[0].isupper() and text != page_title
    return False
def _cell_span(cell: WikiNode, attr_name: str) -> int:
    """Return the integer value of a cell's ``colspan``/``rowspan`` attribute.

    A missing or non-numeric value counts as 1 (a cell always occupies at
    least its own row/column), so malformed template output cannot raise
    ``ValueError``.
    """
    raw_value = str(cell.attrs.get(attr_name, "1")).strip()
    return int(raw_value) if raw_value.isdecimal() else 1


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract word forms from the first table of an expanded template.

    The template is expanded and re-parsed, then the first table is walked
    row by row:

    * a row without any real data cell is a header row; its cells become
      column headers (or ``ColspanHeader`` entries when they carry a
      ``colspan`` attribute);
    * header cells in other rows become row headers, carried over to the
      following rows while their ``rowspan`` lasts;
    * each data cell is split into lines; IPA lines are stored as
      pronunciations and the remaining lines become ``Form`` objects tagged
      with the applicable headers and appended to ``page_data[-1].forms``.

    Nothing is returned; ``page_data`` is modified in place.  Returns early
    when the expanded template contains no table.
    """
    # function-level import as in the original code
    # NOTE(review): presumably avoids a circular import with form_line
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if not table_nodes:
        return
    table_node = table_nodes[0]

    column_headers: list[str] = []
    # (header text, number of *following* rows the header still covers)
    rowspan_headers: list[tuple[str, int]] = []
    # never cleared: colspan headers of earlier header rows stay active
    colspan_headers: list[ColspanHeader] = []

    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # keep header cells and non-empty data cells, drop hidden cells
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header row replaces the previous plain column headers
            column_headers.clear()

        # start the row with the headers inherited through rowspan and
        # decrement their remaining row counts
        row_headers = []
        new_rowspan_headers = []
        for rowspan_text, rowspan_count in rowspan_headers:
            row_headers.append(rowspan_text)
            if rowspan_count - 1 > 0:
                new_rowspan_headers.append((rowspan_text, rowspan_count - 1))
        rowspan_headers = new_rowspan_headers

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(
                    wxr, None, table_cell
                ).replace("\n", " ")
                lower_header_text = table_header_text.lower()
                if (
                    lower_header_text in IGNORE_TABLE_HEADERS
                    or lower_header_text.startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    colspan = _cell_span(table_cell, "colspan")
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            ColspanHeader(
                                table_header_text, column_cell_index, colspan
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += colspan
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    # only remember the header when it really reaches into
                    # a following row; rowspan="1" must not leak downwards
                    remaining_rows = _cell_span(table_cell, "rowspan") - 1
                    if remaining_rows > 0:
                        rowspan_headers.append(
                            (table_header_text, remaining_rows)
                        )
                continue

            if table_cell.kind != NodeKind.TABLE_CELL:
                continue

            # a conjugation link is handled by form_line and the cell
            # yields no form of its own
            has_conj_link = False
            for link_node in table_cell.find_child(NodeKind.LINK):
                if is_conj_link(wxr, link_node):
                    process_conj_link_node(wxr, link_node, page_data)
                    has_conj_link = True
                    break
            if has_conj_link:
                continue

            form_data = Form()
            table_cell_lines = clean_node(wxr, None, table_cell)
            for table_cell_line in table_cell_lines.splitlines():
                if is_ipa_text(table_cell_line):
                    insert_ipa(form_data, table_cell_line)
                elif (
                    table_cell_line != page_data[-1].word
                    and table_cell_line not in IGNORE_TABLE_CELL
                    and not table_cell_line.lower().startswith(
                        IGNORE_TABLE_CELL_PREFIXES
                    )
                ):
                    # collect candidate form lines, one per line
                    if form_data.form == "":
                        form_data.form = table_cell_line
                    else:
                        form_data.form += "\n" + table_cell_line

            for colspan_header in colspan_headers:
                header_end = colspan_header.index + colspan_header.span
                if colspan_header.index <= column_cell_index < header_end:
                    form_data.raw_tags.append(colspan_header.text)
            if (
                "colspan" not in table_cell.attrs
                and len(column_headers) > column_cell_index
                and column_headers[column_cell_index].lower()
                not in IGNORE_TABLE_HEADERS
            ):
                form_data.raw_tags.append(column_headers[column_cell_index])

            if len(row_headers) > 0:
                form_data.raw_tags.extend(row_headers)
            if form_data.form != "":
                for form in form_data.form.splitlines():
                    if form.startswith("(") and form.endswith(")"):
                        # a parenthesised line is a tag, not a form; it is
                        # added to the shared data and therefore applies
                        # to the forms copied after it
                        form_data.raw_tags.append(form.strip("()"))
                        continue
                    new_form_data = form_data.model_copy(deep=True)
                    new_form_data.form = form.removeprefix("ou ")
                    translate_raw_tags(
                        new_form_data, table_template.template_name
                    )
                    if len(new_form_data.form.strip()) > 0:
                        page_data[-1].forms.append(new_form_data)

            # only data cells that reach this point advance the column index
            column_cell_index += _cell_span(table_cell, "colspan")
def split_ipa(text: str) -> list[str]:
    """Break one line of IPA text into its individual pronunciations.

    * ``" ou "`` (French "or") separates several pronunciations on the same
      line, as produced by the "en-conj-rég" template;
    * a leading ``"ou "`` is dropped;
    * inflection table templates use an edit link when the IPA data is
      missing, and that link usually ends with "Prononciation ?", so such
      text yields an empty list;
    * anything else is returned unchanged as a single-item list.
    """
    separator = " ou "
    if separator in text:
        return text.split(separator)
    leading_or = "ou "
    if text.startswith(leading_or):
        return [text[len(leading_or):]]
    return [] if text.endswith("Prononciation ?\\") else [text]
def insert_ipa(form: Form, ipa_text: str) -> None:
    """Append the pronunciations found in ``ipa_text`` to ``form.ipas``.

    The text is split with ``split_ipa``; when that yields nothing (missing
    pronunciation) ``form`` is left untouched.
    """
    # extending with an empty list is a no-op, so no explicit guard is needed
    form.ipas.extend(split_ipa(ipa_text))
def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an ``en-adj*`` template.

    https://fr.wiktionary.org/wiki/Modèle:en-adj and other en-adj* templates.
    These templates use normal table cells for the column headers, so the
    first row is skipped.  In every other row with more than one child the
    first child supplies the raw tag and the second one the form and/or IPA
    lines.  A form is appended to ``page_data[-1].forms`` unless it is empty
    or equal to the page word.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(expanded_node.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = list(tables[0].find_child(NodeKind.TABLE_ROW))
    for table_row in rows[1:]:  # rows[0] is the header row
        cells = table_row.children
        if len(cells) <= 1:
            continue
        form_data = Form()
        form_data.raw_tags.append(clean_node(wxr, None, cells[0]))
        for form_line in clean_node(wxr, None, cells[1]).splitlines():
            if form_line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(form_line):
                insert_ipa(form_data, form_line)
            else:
                # a later non-IPA line replaces an earlier one
                form_data.form = form_line
        if len(form_data.form) > 0 and form_data.form != page_data[-1].word:
            translate_raw_tags(form_data)
            page_data[-1].forms.append(form_data)