Coverage for src/wiktextract/extractor/fr/inflection.py: 91%
133 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from dataclasses import dataclass
3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .models import Form, WordEntry
8from .pronunciation import is_ipa_text
9from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Extract forms from an inflection table template.

    Templates whose name starts with ``en-adj`` are handled by
    ``process_en_adj_table``; every other template is handled by
    ``process_inflection_table``. The extracted forms are added to
    the last entry of ``page_data`` by the chosen handler.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    is_en_adj = template_node.template_name.startswith("en-adj")
    handler = process_en_adj_table if is_en_adj else process_inflection_table
    handler(wxr, page_data, template_node)
# Header texts that carry no grammatical information. Header text is
# lower-cased before it is looked up here. The trailing comment names a
# template that produces the header.
IGNORE_TABLE_HEADERS = frozenset(
    {
        "avec suffixes possessifs",  # fi-décl-valo
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "conjugaison présent indicatif",  # avk-tab-conjug
        "forme",  # br-flex-adj
        "genre",  # es-accord-oa
        "mode",  # eo-conj
        "mutation",  # br-nom
        "nature",  # de-adj
        "nombre",  # ca-accord-mixte2
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "temps",  # en-conj-rég,
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
    }
)
# Lower-cased header texts starting with one of these prefixes are skipped.
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)
# Data cell lines that are never a word form (exact, case-sensitive match).
IGNORE_TABLE_CELL = frozenset(
    {
        "Déclinaisons",  # de-adj
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
    }
)
# Lower-cased data cell lines starting with one of these prefixes are skipped.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)
@dataclass
class ColspanHeader:
    """A column header cell that has a ``colspan`` attribute.

    A data cell receives ``text`` as a raw tag when its column index
    lies in the half-open range ``[index, index + span)``.
    """

    # cleaned header text
    text: str
    # column index at which the header cell starts
    index: int
    # number of columns the header covers
    span: int
def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a normal table cell is actually used as a header.

    Only the first child yielded by ``filter_empty_str_child()`` is
    inspected. The cell is treated as a header when that child is a
    bold node whose cleaned text is not empty, starts with an upper
    case character and differs from ``page_title``.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False
    for first_child in cell_node.filter_empty_str_child():
        text = clean_node(wxr, None, first_child)
        if not isinstance(first_child, WikiNode):
            return False
        if first_child.kind != NodeKind.BOLD:
            return False
        if len(text) == 0 or not text[0].isupper():
            return False
        # a bold copy of the page title is a word form, not a header
        return text != page_title
    # the cell has no child to inspect
    return False
def _parse_span(value: object, default: int = 1) -> int:
    """Return a ``colspan``/``rowspan`` attribute value as an ``int``.

    Table attributes come from wikitext and may be missing or malformed.
    Return ``default`` instead of raising when the value is not a plain
    decimal number.
    """
    if isinstance(value, int):
        return value
    if value is None:
        return default
    text = str(value).strip()
    return int(text) if text.isdecimal() else default


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract forms from the first table of an expanded template.

    The template is expanded and parsed, then each row of its first
    table is visited. Header cells are collected as column headers (rows
    without any data cell) or row headers (rows with data cells). Each
    non-empty data cell becomes one ``Form`` per text line, tagged with
    the headers that apply to it, and is appended to
    ``page_data[-1].forms``. Nothing is done when the expansion contains
    no table.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_nodes = list(expanded_node.find_child(NodeKind.TABLE))
    if len(table_nodes) == 0:
        return
    table_node = table_nodes[0]
    column_headers: list[str] = []
    # (header text, number of following rows the header still covers)
    rowspan_headers: list[tuple[str, int]] = []
    colspan_headers: list[ColspanHeader] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # filter empty table cells
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            # a new header row replaces the previous column headers
            column_headers.clear()
        # headers carried over from earlier rows through "rowspan"
        row_headers = []
        new_rowspan_headers = []
        for rowspan_text, rowspan_count in rowspan_headers:
            row_headers.append(rowspan_text)
            if rowspan_count - 1 > 0:
                new_rowspan_headers.append((rowspan_text, rowspan_count - 1))
        rowspan_headers = new_rowspan_headers

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(
                    wxr, None, table_cell
                ).replace("\n", " ")
                if (
                    table_header_text.lower() in IGNORE_TABLE_HEADERS
                    or table_header_text.lower().startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    column_span = _parse_span(table_cell.attrs.get("colspan"))
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            ColspanHeader(
                                table_header_text,
                                column_cell_index,
                                column_span,
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += column_span
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    # Only remember the header when it really covers
                    # following rows; rowspan="1" must not leak into
                    # the next row.
                    remaining_rows = (
                        _parse_span(table_cell.attrs.get("rowspan")) - 1
                    )
                    if remaining_rows > 0:
                        rowspan_headers.append(
                            (table_header_text, remaining_rows)
                        )
            elif table_cell.kind == NodeKind.TABLE_CELL:
                form_data = Form()
                table_cell_lines = clean_node(wxr, None, table_cell)
                for table_cell_line in table_cell_lines.splitlines():
                    if is_ipa_text(table_cell_line):
                        insert_ipa(form_data, table_cell_line)
                    elif (
                        table_cell_line != page_data[-1].word
                        and table_cell_line not in IGNORE_TABLE_CELL
                        and not table_cell_line.lower().startswith(
                            IGNORE_TABLE_CELL_PREFIXES
                        )
                    ):
                        if form_data.form == "":
                            form_data.form = table_cell_line
                        else:
                            form_data.form += "\n" + table_cell_line
                for colspan_header in colspan_headers:
                    if (
                        column_cell_index >= colspan_header.index
                        and column_cell_index
                        < colspan_header.index + colspan_header.span
                    ):
                        form_data.raw_tags.append(colspan_header.text)
                if (
                    "colspan" not in table_cell.attrs
                    and len(column_headers) > column_cell_index
                    and column_headers[column_cell_index].lower()
                    not in IGNORE_TABLE_HEADERS
                ):
                    form_data.raw_tags.append(
                        column_headers[column_cell_index]
                    )

                if len(row_headers) > 0:
                    form_data.raw_tags.extend(row_headers)
                if form_data.form != "":
                    for form in form_data.form.splitlines():
                        if form.startswith("(") and form.endswith(")"):
                            # a parenthesized line is a tag for the
                            # forms on the following lines of the cell
                            form_data.raw_tags.append(form.strip("()"))
                            continue
                        new_form_data = form_data.model_copy(deep=True)
                        new_form_data.form = form.removeprefix("ou ")
                        translate_raw_tags(
                            new_form_data, table_template.template_name
                        )
                        if len(new_form_data.form.strip()) > 0:
                            page_data[-1].forms.append(new_form_data)

                # a malformed "colspan" counts as one column, same as
                # for header cells
                column_cell_index += _parse_span(
                    table_cell.attrs.get("colspan")
                )
247def split_ipa(text: str) -> list[str]:
248 # break IPA text if it contains "ou"(or)
249 if " ou " in text:
250 # two ipa texts in the same line: "en-conj-rég" template
251 return text.split(" ou ")
252 if text.startswith("ou "):
253 return [text.removeprefix("ou ")]
254 if text.endswith("Prononciation ?\\"):
255 # inflection table templates use a edit link when the ipa data is
256 # missing, and the link usually ends with "Prononciation ?"
257 return []
258 return [text]
def insert_ipa(form: Form, ipa_text: str) -> None:
    """Add the pronunciations found in ``ipa_text`` to ``form.ipas``.

    ``form`` is left untouched when ``split_ipa`` returns nothing.
    """
    if pronunciations := split_ipa(ipa_text):
        form.ipas.extend(pronunciations)
def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an ``en-adj*`` template.

    The first row of the first table is skipped as the header row. In
    each remaining row with more than one child, the first child gives
    the raw tag and the second child gives the form and IPA lines. A
    form that is empty or equal to the page word is not added.
    """
    # https://fr.wiktionary.org/wiki/Modèle:en-adj
    # and other en-adj* templates
    # these templates use normal table cell for column table header
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded_node = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(expanded_node.find_child(NodeKind.TABLE))
    if not tables:
        return
    rows = list(tables[0].find_child(NodeKind.TABLE_ROW))
    # rows[0] is the header row
    for table_row in rows[1:]:
        if len(table_row.children) <= 1:
            continue
        tag_cell, form_cell = table_row.children[:2]
        form_data = Form()
        form_data.raw_tags.append(clean_node(wxr, None, tag_cell))
        for form_line in clean_node(wxr, None, form_cell).splitlines():
            if form_line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(form_line):
                insert_ipa(form_data, form_line)
            else:
                # a later non-IPA line replaces an earlier one
                form_data.form = form_line
        if form_data.form in (page_data[-1].word, ""):
            continue
        translate_raw_tags(form_data)
        page_data[-1].forms.append(form_data)