Coverage for src/wiktextract/extractor/fr/inflection.py: 96%
142 statements
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
« prev ^ index » next coverage.py v7.11.0, created at 2025-11-03 05:44 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .pronunciation import is_ipa_text
10from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Route an inflection template to the extractor written for it.

    The last entry of ``page_data`` is the one handed to the extractor.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    extractor = (
        extract_avk_tab_conjug
        if t_node.template_name == "avk-tab-conjug"
        else extract_inf_table_template
    )
    extractor(wxr, page_data[-1], t_node)
# Header texts, compared in lower case, that are never attached to a form
# as a raw tag.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
    )
)
def split_ipa(text: str) -> list[str]:
    r"""Split one line of IPA text into the pronunciations it contains.

    Several pronunciations may share a line, separated by "ou" (French for
    "or"), e.g. in the "en-conj-rég" template.

    Args:
        text: A line of text already recognised as IPA.

    Returns:
        The pronunciations in their original order, stripped of surrounding
        whitespace.  Empty pieces are dropped, and so are pieces ending with
        "\Prononciation ?\": inflection table templates use an edit link
        with that text when the IPA data is missing.
    """
    missing_marker = "\\Prononciation ?\\"
    # A leading "ou " is a dangling separator, not part of the first
    # pronunciation, so drop it before splitting on the real separators.
    pieces = text.removeprefix("ou ").split(" ou ")
    return [
        piece
        for piece in map(str.strip, pieces)
        # the placeholder is checked per piece so that it is also dropped
        # when it shares a line with a real pronunciation
        if piece and not piece.endswith(missing_marker)
    ]
@dataclass
class TableSpanHeader:
    """A table header cell together with the grid area it covers."""

    text: str  # cleaned text of the header cell
    col_index: int = 0  # column at which the cell starts
    colspan: int = 1  # number of columns the cell covers
    row_index: int = 0  # row at which the cell starts
    rowspan: int = 1  # number of rows the cell covers
def _span_attr(cell_node, attr_name: str) -> int:
    """Return the ``colspan``/``rowspan`` attribute of a table cell as int.

    A missing attribute counts as 1.  A value ``int()`` cannot parse (the
    attribute comes from hand-written wikitext) also counts as 1 instead of
    aborting the extraction of the whole table.
    """
    try:
        return int(cell_node.attrs.get(attr_name, "1"))
    except (TypeError, ValueError):
        return 1


def _process_conj_link(
    wxr: WiktextractContext, word_entry: WordEntry, cell_node
) -> bool:
    """Handle the first conjugation link inside a cell, if there is one.

    Returns True when the cell contains such a link; the caller then skips
    the cell.  The link is only processed when the entry is not tagged
    "form-of".
    """
    # imported inside the function, like the code this helper was extracted
    # from (presumably to avoid a circular import; confirm before moving)
    from .form_line import is_conj_link, process_conj_link_node

    for link_node in cell_node.find_child_recursively(NodeKind.LINK):
        if is_conj_link(wxr, link_node):
            if "form-of" not in word_entry.tags:
                # Template:fr-verbe-flexion
                process_conj_link_node(wxr, link_node, [word_entry])
            return True
    return False


def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from the table(s) an inflection template expands to.

    Every table is walked twice: the first pass records the header cells
    with their position and span, the second pass turns each data cell into
    ``Form`` objects tagged with the headers that overlap the cell.  Forms
    are appended to ``word_entry.forms``; IPA lines are attached to the
    form added last.

    Args:
        wxr: Extraction context, used to expand and clean wikitext.
        word_entry: Entry that receives the extracted forms.
        t_node: The inflection template node.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        # first pass: collect header cells
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # skip the columns taken by headers of earlier rows that span
            # down into this row
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                if _process_conj_link(wxr, word_entry, cell_node):
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _span_attr(cell_node, "colspan")
                rowspan = _span_attr(cell_node, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                # a header in a row without data cells heads columns,
                # otherwise it heads the row
                if not row_has_data:
                    col_headers.append(span_header)
                else:
                    row_headers.append(span_header)
                col_index += colspan

        # second pass: build forms from data cells
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # row of the closest column header located above this row
            last_col_header_row = 0
            for col_header in col_headers[::-1]:
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # data cells start after the row headers covering this row
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.col_index <= col_index
                ):
                    col_index += row_header.colspan
            article = ""
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                if _process_conj_link(wxr, word_entry, cell_node):
                    continue
                colspan = _span_attr(cell_node, "colspan")
                rowspan = _span_attr(cell_node, "rowspan")
                cell_classes = cell_node.attrs.get("class", "").split()
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "small"
                    ):
                        # Modèle:fr-verbe-flexion
                        # a parenthesised <small> text is a tag for the
                        # cell, not part of the form
                        raw_tag = clean_node(wxr, None, cell_child)
                        if raw_tag.startswith("(") and raw_tag.endswith(")"):
                            cell_tags.append(raw_tag.strip("() "))
                        else:
                            filtered_cell.append(cell_child)
                    else:
                        filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                # Template:grc-décl-nomf-1-α-ης
                if "article" in cell_classes:
                    # remember the article for the next cell of the row
                    article = cell_text
                    col_index += colspan
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        # IPA belongs to the form added last
                        if word_entry.forms:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    # copy the tags so that forms built from the same cell
                    # never share one list
                    form = Form(
                        form=line, raw_tags=list(cell_tags), article=article
                    )
                    use_col_tags = []
                    for col_header in col_headers[::-1]:
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in [last_col_header_row, last_col_header_row + 1]
                        ):
                            use_col_tags.append(col_header.text)
                    # headers were visited bottom-up, restore table order
                    form.raw_tags.extend(use_col_tags[::-1])
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    # skip placeholders and the page title itself
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
                article = ""
def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from the expanded "avk-tab-conjug" template.

    Each non-empty data cell becomes a ``Form`` tagged "present" and
    "indicative".  Its raw tags are the column header stored at the cell's
    position (when there is one) and the header of its row.  Forms are
    appended to ``word_entry.forms``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in root.find_child(NodeKind.TABLE):
        column_titles = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            # a header cell heads the row only when the row also has data
            has_data_cell = row_node.contain_node(NodeKind.TABLE_CELL)
            for position, cell_node in enumerate(
                row_node.find_child(cell_kinds)
            ):
                text = clean_node(wxr, None, cell_node)
                if not text:
                    continue
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if has_data_cell:
                        row_title = text
                    elif text != "Conjugaison Présent Indicatif":
                        # the table caption is not a column header
                        column_titles.append(text)
                    continue
                form = Form(form=text, tags=["present", "indicative"])
                if position < len(column_titles):
                    form.raw_tags.append(column_titles[position])
                if row_title:
                    form.raw_tags.append(row_title)
                translate_raw_tags(form)
                word_entry.forms.append(form)