Coverage for src / wiktextract / extractor / fr / inflection.py: 96%
152 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 10:39 +0000
« prev ^ index » next coverage.py v7.12.0, created at 2025-12-03 10:39 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .pronunciation import is_ipa_text
10from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Extract the forms of an inflection template into the latest entry.

    The forms are added to ``page_data[-1]``. The ``avk-tab-conjug``
    template has a dedicated extractor; every other template is handled by
    the generic inflection table extractor.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    extractor = (
        extract_avk_tab_conjug
        if t_node.template_name == "avk-tab-conjug"
        else extract_inf_table_template
    )
    extractor(wxr, page_data[-1], t_node)
# Header cell texts that are never turned into raw tags of a form.
# Header texts are lower-cased before being looked up in this set.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
    )
)
def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into individual pronunciations.

    - Text containing " ou " ("or") is split on that separator; the
      "en-conj-rég" template puts two IPA texts on the same line.
    - Text starting with "ou " is returned without that prefix.
    - Text ending with "\\Prononciation ?\\" yields an empty list:
      inflection table templates use an edit link when the IPA data is
      missing, and that link usually ends with this text.
    - Any other text is returned unchanged as a single-item list.
    """
    separator = " ou "
    if separator in text:
        return text.split(separator)

    leading_or = "ou "
    if text.startswith(leading_or):
        return [text[len(leading_or):]]

    is_missing_ipa_link = text.endswith("\\Prononciation ?\\")
    return [] if is_missing_ipa_link else [text]
@dataclass
class TableSpanHeader:
    """A table header cell together with the rows and columns it spans."""

    # cleaned text of the header cell
    text: str
    # index of the first column covered by the header
    col_index: int = 0
    # number of columns covered by the header
    colspan: int = 1
    # index of the table row that contains the header cell
    row_index: int = 0
    # number of rows covered by the header
    rowspan: int = 1
def _cell_span(cell_node, attr_name: str) -> int:
    """Return the value of a cell's ``colspan``/``rowspan`` attribute.

    Falls back to 1 when the attribute is missing, is not an integer or is
    not positive, so that a malformed attribute can neither abort the
    extraction with ``ValueError`` nor stop the column cursor from advancing.
    """
    try:
        span = int(cell_node.attrs.get(attr_name, "1"))
    except ValueError:
        return 1
    return max(span, 1)


def extract_inf_table_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    source: str = "",
):
    """Extract forms from the tables of an expanded inflection template.

    The template is expanded and every table found in the result is read in
    two passes: the first collects the header cells with the grid area they
    span, the second turns each line of each data cell into a ``Form`` that
    is tagged with the texts of the matching headers and appended to
    ``word_entry.forms``. Lines recognised as IPA text are added to the
    ``ipas`` of the most recently added form instead.

    Args:
        wxr: extraction context used to expand the template and clean nodes.
        word_entry: entry that receives the extracted forms.
        t_node: the inflection template node.
        source: value stored in the ``source`` field of every created form.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj
    # NOTE(review): imported here instead of at module level, presumably to
    # avoid a circular import with ``form_line`` - confirm before moving it.
    from .form_line import is_conj_link, process_conj_link_node

    def consume_conj_link(cell_node) -> bool:
        """Tell whether the cell holds a conjugation link.

        The first such link is passed to ``process_conj_link_node`` unless
        the entry is already tagged "form-of".
        """
        for link_node in cell_node.find_child_recursively(NodeKind.LINK):
            if is_conj_link(wxr, link_node):
                if "form-of" not in word_entry.tags:
                    # Template:fr-verbe-flexion
                    process_conj_link_node(wxr, link_node, [word_entry])
                return True
        return False

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child_recursively(NodeKind.TABLE):
        col_headers: list[TableSpanHeader] = []
        row_headers: list[TableSpanHeader] = []

        # First pass: collect header cells. Header cells of rows without any
        # data cell are column headers, the others are row headers.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # skip the columns taken by headers of previous rows whose
            # rowspan reaches into this row
            for header in chain(col_headers, row_headers):
                if (
                    header.row_index
                    < row_index
                    < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                if consume_conj_link(cell_node):
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                if row_has_data:
                    row_headers.append(span_header)
                else:
                    col_headers.append(span_header)
                col_index += colspan

        # Second pass: create forms from the data cells.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # row of the closest column header located above this row
            last_col_header_row = 0
            for col_header in reversed(col_headers):
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # data cells start after the row headers covering this row
            for row_header in row_headers:
                if (
                    row_header.row_index
                    <= row_index
                    < row_header.row_index + row_header.rowspan
                    and row_header.col_index <= col_index
                ):
                    col_index += row_header.colspan
            article = ""
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                has_conj_link = consume_conj_link(cell_node)
                # ignore note in Template:fi-décl-ihminen
                has_collapsible_div = any(
                    "mw-collapsible" in div_tag.attrs.get("class", "").split()
                    for div_tag in cell_node.find_html("div")
                )
                if has_conj_link or has_collapsible_div:
                    continue
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                cell_classes = cell_node.attrs.get("class", "").split()
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if isinstance(cell_child, HTMLNode):
                        if cell_child.tag == "small":
                            # Modèle:fr-verbe-flexion
                            # parenthesised <small> text is a tag of the
                            # cell, not a part of the form
                            raw_tag = clean_node(wxr, None, cell_child)
                            if raw_tag.startswith("(") and raw_tag.endswith(
                                ")"
                            ):
                                cell_tags.append(raw_tag.strip("() "))
                                continue
                        elif cell_child.tag == "span" and cell_child.attrs.get(
                            "id", ""
                        ).startswith("ref-"):
                            # note ref number in Template:fi-décl-ihminen
                            continue
                    filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                # Template:grc-décl-nomf-1-α-ης
                if "article" in cell_classes:
                    # the article is attached to the forms of the next cell
                    article = cell_text
                    col_index += colspan
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        if word_entry.forms:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    form = Form(
                        form=line,
                        # pass a copy so that the forms created from the
                        # lines of one cell never share a tag list
                        raw_tags=list(cell_tags),
                        article=article,
                        source=source,
                    )
                    use_col_tags = []
                    for col_header in reversed(col_headers):
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in (last_col_header_row, last_col_header_row + 1)
                        ):
                            use_col_tags.append(col_header.text)
                    # headers were visited bottom-up, restore table order
                    form.raw_tags.extend(use_col_tags[::-1])
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
                article = ""
def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract the forms of the "avk-tab-conjug" conjugation table.

    Every non-empty data cell becomes a form tagged "present" and
    "indicative", with the column header found at the cell's position and
    the header of its row as raw tags. The forms are appended to
    ``word_entry.forms``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    any_cell = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in root.find_child(NodeKind.TABLE):
        column_titles = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            # header cells of a row that also holds data cells are row headers
            row_has_data = row_node.contain_node(NodeKind.TABLE_CELL)
            for position, cell_node in enumerate(
                row_node.find_child(any_cell)
            ):
                text = clean_node(wxr, None, cell_node)
                if text == "":
                    continue
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_title = text
                    elif text != "Conjugaison Présent Indicatif":
                        # the table title is not a column header
                        column_titles.append(text)
                    continue
                form = Form(form=text, tags=["present", "indicative"])
                if position < len(column_titles):
                    form.raw_tags.append(column_titles[position])
                if row_title != "":
                    form.raw_tags.append(row_title)
                translate_raw_tags(form)
                word_entry.forms.append(form)