Coverage for src/wiktextract/extractor/fr/inflection.py: 89%

1from dataclasses import dataclass

3from wikitextprocessor.parser import NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .models import Form, WordEntry

8from .pronunciation import is_ipa_text

9from .tags import translate_raw_tags

def extract_inflection(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
) -> None:
    """Send an inflection table template to the parser that handles it.

    Templates whose name starts with ``en-adj`` go to
    ``process_en_adj_table``; every other template goes to
    ``process_inflection_table``. Both receive the same three arguments.

    Inflection templates are listed at
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    """
    is_en_adj = template_node.template_name.startswith("en-adj")
    table_parser = (
        process_en_adj_table if is_en_adj else process_inflection_table
    )
    table_parser(wxr, page_data, template_node)

# Header texts that are skipped instead of being used as tags. They are
# matched against the lower-cased header text; the trailing comment names a
# template in which the header occurs.
IGNORE_TABLE_HEADERS = frozenset(
    (
        "avec suffixes possessifs",  # fi-décl-valo
        "cas",  # lt_décl_as, ro-nom-tab(lower case)
        "commun",  # sv-nom-c-ar
        "conjugaison présent indicatif",  # avk-tab-conjug
        "en kurmandji",  # flex-ku-nomf
        "forme",  # br-flex-adj
        "genre",  # es-accord-oa
        "mode",  # eo-conj
        "mutation",  # br-nom
        "nature",  # de-adj
        "nombre",  # ca-accord-mixte2
        "personne",  # hu-pos-otok
        "pronom personnel",  # it-enclise
        "temps",  # en-conj-rég
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
    )
)

# Lower-case prefixes of header texts that are skipped; passed as a tuple to
# str.startswith().
IGNORE_TABLE_HEADER_PREFIXES = (
    "voir la conjugaison du verbe ",  # Modèle:fr-verbe-flexion
    "conjugaison de ",  # sv-conj-ar
    "déclinaison de ",  # da-adj
)

# Cell text lines that are never stored as a form (exact match).
IGNORE_TABLE_CELL = frozenset(
    (
        "—",  # https://fr.wiktionary.org/wiki/Modèle:vls-nom
        "Déclinaisons",  # de-adj
    )
)

# Lower-case prefixes of cell text lines that are never stored as a form.
IGNORE_TABLE_CELL_PREFIXES = (
    "voir conjugaison ",  # en-conj, avk-conj
)

@dataclass
class ColspanHeader:
    """A column header cell that carries a ``colspan`` attribute."""

    # cleaned header text; added to the raw tags of the cells it covers
    text: str
    # column index at which the header starts
    index: int
    # number of columns covered, read from the "colspan" attribute
    span: int

def table_data_cell_is_header(
    wxr: WiktextractContext, cell_node: WikiNode, page_title: str
) -> bool:
    """Tell whether a plain table cell is really used as a header.

    Only the first child yielded by ``filter_empty_str_child()`` is looked
    at. The cell counts as a header when that child is a bold node whose
    cleaned text is not empty, starts with an upper-case character and is
    different from ``page_title``. Anything that is not a ``TABLE_CELL``
    node, or has no such child, is not a header.
    """
    if cell_node.kind != NodeKind.TABLE_CELL:
        return False

    first_child = next(iter(cell_node.filter_empty_str_child()), None)
    if first_child is None:
        return False

    text = clean_node(wxr, None, first_child)
    is_bold_node = (
        isinstance(first_child, WikiNode)
        and first_child.kind == NodeKind.BOLD
    )
    if not is_bold_node:
        return False
    return len(text) > 0 and text[0].isupper() and text != page_title

def _parse_span(attrs: dict, name: str, default: int = 1) -> int:
    """Read a ``colspan``/``rowspan`` style attribute as an integer.

    Returns ``default`` when the attribute is missing or is not a plain
    decimal number, so a malformed value cannot raise ``ValueError``.
    """
    value = str(attrs.get(name, "")).strip()
    return int(value) if value.isdecimal() else default


def process_inflection_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    table_template: TemplateNode,
) -> None:
    """Extract word forms from an inflection table template.

    The template is expanded and parsed again, and the first table of the
    result is read row by row:

    * header cells (``TABLE_HEADER_CELL`` nodes, or data cells accepted by
      ``table_data_cell_is_header``) become column headers when the row has
      no data cell, and row headers otherwise;
    * every other cell produces one ``Form`` per text line, tagged with the
      headers that apply to it, and appended to ``page_data[-1].forms``.
      Lines accepted by ``is_ipa_text`` are stored as IPA of the form.

    Nothing is returned; ``page_data`` is updated in place. The function
    returns early when the expanded template contains no table.
    """
    # imported inside the function, as in the original code
    # NOTE(review): presumably avoids a circular import - confirm
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(table_template), expand_all=True
    )
    table_node = next(iter(expanded_node.find_child(NodeKind.TABLE)), None)
    if table_node is None:
        return

    # header texts of the latest all-header row (cells without "colspan")
    column_headers: list[str] = []
    # (header text, number of following rows the header still covers)
    rowspan_headers: list[tuple[str, int]] = []
    # headers with a "colspan" attribute; never reset while reading the table
    colspan_headers: list[ColspanHeader] = []
    for table_row in table_node.find_child(NodeKind.TABLE_ROW):
        # keep header cells and non-empty data cells, drop hidden cells
        table_row_nodes = [
            row_node_child
            for row_node_child in table_row.children
            if isinstance(row_node_child, WikiNode)
            and (
                row_node_child.kind == NodeKind.TABLE_HEADER_CELL
                or (
                    row_node_child.kind == NodeKind.TABLE_CELL
                    and len(list(row_node_child.filter_empty_str_child())) > 0
                )
            )
            and row_node_child.attrs.get("style") != "display:none"
            and "invisible" not in row_node_child.attrs.get("class", "")
        ]
        current_row_has_data_cell = any(
            cell.kind == NodeKind.TABLE_CELL
            and not table_data_cell_is_header(wxr, cell, page_data[-1].word)
            for cell in table_row_nodes
        )
        if not current_row_has_data_cell:
            column_headers.clear()

        # row headers inherited from earlier rows through "rowspan"
        row_headers = [text for text, _ in rowspan_headers]
        rowspan_headers = [
            (text, remaining - 1)
            for text, remaining in rowspan_headers
            if remaining > 1
        ]

        column_cell_index = 0
        for table_cell in table_row_nodes:
            if (
                table_cell.kind == NodeKind.TABLE_HEADER_CELL
                or table_data_cell_is_header(
                    wxr, table_cell, page_data[-1].word
                )
            ):
                if any(
                    table_cell.find_html(
                        "span",
                        attr_name="class",
                        attr_value="ligne-de-forme",
                    )
                ):
                    # ignore gender header in template "ro-nom-tab"
                    continue
                table_header_text = clean_node(
                    wxr, None, table_cell
                ).replace("\n", " ")
                lower_header_text = table_header_text.lower()
                if (
                    lower_header_text in IGNORE_TABLE_HEADERS
                    or lower_header_text.startswith(
                        IGNORE_TABLE_HEADER_PREFIXES
                    )
                    or len(table_header_text.strip()) == 0
                ):
                    continue
                rsplit_header = table_header_text.rsplit(maxsplit=1)
                if len(rsplit_header) > 1 and rsplit_header[-1].isdecimal():
                    # "Pluriel 1" in template "br-nom"
                    table_header_text = rsplit_header[0]

                if not current_row_has_data_cell:
                    # if all cells of the row are header cells
                    # then the header cells are column headers
                    colspan = _parse_span(table_cell.attrs, "colspan")
                    if "colspan" in table_cell.attrs:
                        colspan_headers.append(
                            ColspanHeader(
                                table_header_text,
                                column_cell_index,
                                colspan,
                            )
                        )
                    else:
                        column_headers.append(table_header_text)
                    column_cell_index += colspan
                else:
                    if table_header_text not in row_headers:
                        row_headers.append(table_header_text)
                    # remember the header only when it really covers rows
                    # below this one; rowspan="1" must not leak downwards
                    remaining_rows = (
                        _parse_span(table_cell.attrs, "rowspan") - 1
                    )
                    if remaining_rows > 0:
                        rowspan_headers.append(
                            (table_header_text, remaining_rows)
                        )
            elif table_cell.kind == NodeKind.TABLE_CELL:
                conj_link = next(
                    (
                        link_node
                        for link_node in table_cell.find_child(NodeKind.LINK)
                        if is_conj_link(wxr, link_node)
                    ),
                    None,
                )
                if conj_link is not None:
                    # the cell only links to the conjugation page
                    process_conj_link_node(wxr, conj_link, page_data)
                    continue

                form_data = Form()
                table_cell_lines = clean_node(wxr, None, table_cell)
                for table_cell_line in table_cell_lines.splitlines():
                    if is_ipa_text(table_cell_line):
                        insert_ipa(form_data, table_cell_line)
                    elif (
                        table_cell_line != page_data[-1].word
                        and table_cell_line not in IGNORE_TABLE_CELL
                        and not table_cell_line.lower().startswith(
                            IGNORE_TABLE_CELL_PREFIXES
                        )
                    ):
                        if form_data.form == "":
                            form_data.form = table_cell_line
                        else:
                            form_data.form += "\n" + table_cell_line

                # tags from spanning headers that cover this column
                for colspan_header in colspan_headers:
                    if (
                        colspan_header.index
                        <= column_cell_index
                        < colspan_header.index + colspan_header.span
                    ):
                        form_data.raw_tags.append(colspan_header.text)
                # tag from the plain column header above this cell
                if (
                    "colspan" not in table_cell.attrs
                    and len(column_headers) > column_cell_index
                    and column_headers[column_cell_index].lower()
                    not in IGNORE_TABLE_HEADERS
                ):
                    form_data.raw_tags.append(
                        column_headers[column_cell_index]
                    )

                if len(row_headers) > 0:
                    form_data.raw_tags.extend(row_headers)
                if form_data.form != "":
                    for form in form_data.form.splitlines():
                        if form.startswith("(") and form.endswith(")"):
                            # a parenthesised line is a tag, not a form
                            form_data.raw_tags.append(form.strip("()"))
                            continue
                        new_form_data = form_data.model_copy(deep=True)
                        new_form_data.form = form.removeprefix("ou ")
                        translate_raw_tags(
                            new_form_data, table_template.template_name
                        )
                        if len(new_form_data.form.strip()) > 0:
                            page_data[-1].forms.append(new_form_data)

                # a malformed "colspan" counts as one column
                column_cell_index += _parse_span(table_cell.attrs, "colspan")

256

257

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into its individual transcriptions.

    Alternatives are separated by " ou " (French for "or"), as in the
    "en-conj-rég" template, and a line may itself start with "ou ". Every
    alternative is handled the same way: a leading "ou " is removed, and
    the alternative is dropped when it ends with "Prononciation ?\\".
    Inflection table templates use an edit link when the IPA data is
    missing, and that link usually ends with "Prononciation ?".

    Returns the remaining transcriptions in their original order; the list
    is empty when only placeholders were found.
    """
    ipas = []
    for alternative in text.split(" ou "):
        alternative = alternative.removeprefix("ou ")
        if alternative.endswith("Prononciation ?\\"):
            # missing pronunciation placeholder, not a transcription
            continue
        ipas.append(alternative)
    return ipas

270

271

def insert_ipa(form: Form, ipa_text: str) -> None:
    """Add the transcriptions found in ``ipa_text`` to ``form.ipas``.

    The text is split with ``split_ipa``; ``form`` is left untouched when
    that returns an empty list.
    """
    if ipas := split_ipa(ipa_text):
        form.ipas.extend(ipas)

277

278

def process_en_adj_table(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: WikiNode,
) -> None:
    """Extract forms from the table of an ``en-adj`` style template.

    https://fr.wiktionary.org/wiki/Modèle:en-adj and the other en-adj*
    templates use normal table cells for the column headers, so the first
    row is skipped. In each later row with more than one child, the first
    child gives the raw tag and the second child gives the form text, which
    may also contain IPA lines. The resulting ``Form`` is appended to
    ``page_data[-1].forms`` unless its form is empty or equals the word of
    the entry.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    tables = list(root.find_child(NodeKind.TABLE))
    if not tables:
        return

    rows = tables[0].find_child(NodeKind.TABLE_ROW)
    for row_index, row in enumerate(rows):
        if row_index == 0 or len(row.children) <= 1:
            # row 0 is the header; other rows need a tag and a form child
            continue

        entry = Form()
        entry.raw_tags.append(clean_node(wxr, None, row.children[0]))
        for line in clean_node(wxr, None, row.children[1]).splitlines():
            if line in IGNORE_TABLE_CELL:
                continue
            if is_ipa_text(line):
                insert_ipa(entry, line)
            else:
                # a later non-IPA line replaces an earlier one
                entry.form = line

        if entry.form != page_data[-1].word and len(entry.form) > 0:
            translate_raw_tags(entry)
            page_data[-1].forms.append(entry)