Coverage for src/wiktextract/extractor/fr/inflection.py: 95%
133 statements
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
« prev ^ index » next coverage.py v7.10.7, created at 2025-10-13 10:14 +0000
1from dataclasses import dataclass
2from itertools import chain
4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode
6from ...page import clean_node
7from ...wxr_context import WiktextractContext
8from .models import Form, WordEntry
9from .pronunciation import is_ipa_text
10from .tags import translate_raw_tags
def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
) -> None:
    """Extract forms from an inflection table template.

    The template named "avk-tab-conjug" is handled by
    ``extract_avk_tab_conjug``; every other template is handled by
    ``extract_inf_table_template``.  In both cases the last entry of
    ``page_data`` is the word entry passed on to the extractor.

    Raises ``IndexError`` if ``page_data`` is empty.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    if t_node.template_name == "avk-tab-conjug":
        extract_avk_tab_conjug(wxr, page_data[-1], t_node)
    else:
        extract_inf_table_template(wxr, page_data[-1], t_node)
# Header texts that extract_inf_table_template never copies into a form's
# raw tags; header text is lower-cased before it is looked up here.
IGNORE_TABLE_HEADERS = frozenset(
    {
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
    }
)
def split_ipa(text: str) -> list[str]:
    """Split the IPA text of one table line into separate pronunciations.

    Returns the pieces around " ou " when the text contains it, the text
    without a leading "ou " when it starts with one, an empty list when
    the text ends with the missing-pronunciation placeholder, and a
    one-item list holding the unchanged text otherwise.
    """
    # break IPA text if it contains "ou"(or)
    if " ou " in text:
        # two ipa texts in the same line: "en-conj-rég" template
        return text.split(" ou ")
    if text.startswith("ou "):
        return [text.removeprefix("ou ")]
    if text.endswith("\\Prononciation ?\\"):
        # inflection table templates use an edit link when the ipa data is
        # missing, and the link usually ends with "\Prononciation ?\"
        return []
    return [text]
@dataclass
class TableSpanHeader:
    """Text of one table header cell together with its position and span."""

    text: str  # cleaned text of the header cell
    col_index: int = 0  # column index at which the header starts
    colspan: int = 1  # number of columns the header covers
    row_index: int = 0  # row index at which the header starts
    rowspan: int = 1  # number of rows the header covers
def _cell_span(cell_node, attr_name: str) -> int:
    """Return a table cell's ``colspan``/``rowspan`` attribute as an int.

    A missing attribute counts as 1.  An attribute whose text is not a
    valid integer also counts as 1, so that a single malformed cell does
    not abort the extraction of the whole table with a ``ValueError``.
    """
    try:
        return int(cell_node.attrs.get(attr_name, "1"))
    except (TypeError, ValueError):
        return 1


def _process_conj_link(
    wxr: WiktextractContext, word_entry: WordEntry, cell_node
) -> bool:
    """Handle the first conjugation link found inside a table cell.

    Returns True when the cell contains a link accepted by
    ``is_conj_link``; the caller then skips the cell.  The link is passed
    to ``process_conj_link_node`` unless the entry is tagged "form-of".
    """
    # Function-level import, as in the original code; presumably needed to
    # avoid a circular import with the form_line module -- TODO confirm.
    from .form_line import is_conj_link, process_conj_link_node

    for link_node in cell_node.find_child_recursively(NodeKind.LINK):
        if is_conj_link(wxr, link_node):
            if "form-of" not in word_entry.tags:
                # Template:fr-verbe-flexion
                process_conj_link_node(wxr, link_node, [word_entry])
            return True
    return False


def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the tables produced by an inflection template.

    The template is expanded and re-parsed, then every table is read in
    two passes:

    1. Header cells are collected with their position and span.  Headers
       in a row without any data cell become column headers; headers in a
       row that also has data cells become row headers.
    2. Each non-empty data cell is split into lines.  A line recognised by
       ``is_ipa_text`` is added to the ``ipas`` of the last form already in
       ``word_entry.forms``; any other line becomes a ``Form`` whose raw
       tags come from parenthesised ``<small>`` elements of the cell, the
       overlapping column headers and the overlapping row headers.

    Forms whose text is empty, "—", "non comparable" or the page title are
    not added.  Cells containing a conjugation link are skipped in both
    passes.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers: list[TableSpanHeader] = []
        row_headers: list[TableSpanHeader] = []

        # First pass: collect the header cells.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            # start after the headers of earlier rows that span into this row
            col_index = 0
            for header in chain(col_headers, row_headers):
                if (
                    header.row_index
                    <= row_index
                    < header.row_index + header.rowspan
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if _process_conj_link(wxr, word_entry, cell_node):
                    # NOTE(review): a skipped cell does not advance
                    # col_index -- confirm this is intended.
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                if not row_has_data:
                    col_headers.append(span_header)
                else:
                    row_headers.append(span_header)
                col_index += colspan

        # Second pass: turn the data cells into forms.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # row index of the most recently collected column header that
            # lies above this row (0 when there is none)
            last_col_header_row = 0
            for col_header in reversed(col_headers):
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # data cells start after the row headers covering this row
            for row_header in row_headers:
                if (
                    row_header.row_index
                    <= row_index
                    < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if _process_conj_link(wxr, word_entry, cell_node):
                    # NOTE(review): a skipped cell does not advance
                    # col_index -- confirm this is intended.
                    continue
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "small"
                    ):
                        # Modèle:fr-verbe-flexion
                        raw_tag = clean_node(wxr, None, cell_child)
                        if raw_tag.startswith("(") and raw_tag.endswith(")"):
                            # "(…)" in <small> is a tag, not part of the form
                            cell_tags.append(raw_tag.strip("() "))
                        else:
                            filtered_cell.append(cell_child)
                    else:
                        filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                if cell_text == "":
                    # NOTE(review): an empty cell is skipped without
                    # advancing col_index, so the cells after it in this
                    # row keep its column index -- confirm this is intended.
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        # IPA lines belong to the form extracted before them
                        if len(word_entry.forms) > 0:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    form = Form(form=line, raw_tags=cell_tags)
                    # headers are visited bottom-up, then restored to
                    # top-down order when added to the form
                    use_col_tags = []
                    for col_header in reversed(col_headers):
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in [last_col_header_row, last_col_header_row + 1]
                        ):
                            use_col_tags.append(col_header.text)
                    form.raw_tags.extend(use_col_tags[::-1])
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
) -> None:
    """Extract forms from the "avk-tab-conjug" conjugation table template.

    Every non-empty data cell of the expanded table becomes a ``Form``
    tagged "present" and "indicative".  Its raw tags are the column header
    at the same cell index, when there is one, and the header cell that
    precedes it in its row.  The forms are appended to ``word_entry.forms``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            # header cells of a row that also holds data cells label the row
            is_row_header = row.contain_node(NodeKind.TABLE_CELL)
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                elif cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if is_row_header:
                        row_header = cell_text
                    elif cell_text != "Conjugaison Présent Indicatif":
                        # the table title is not used as a column header
                        col_headers.append(cell_text)
                else:
                    form = Form(form=cell_text, tags=["present", "indicative"])
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)