Coverage for src / wiktextract / extractor / fr / inflection.py: 96%

152 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-03 10:39 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .pronunciation import is_ipa_text 

10from .tags import translate_raw_tags 

11 

12 

def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Extract inflected forms from an inflection template node.

    The forms are added to the last entry of ``page_data``. The
    "avk-tab-conjug" template is handled by ``extract_avk_tab_conjug``;
    every other template goes through ``extract_inf_table_template``.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    if t_node.template_name == "avk-tab-conjug":
        handler = extract_avk_tab_conjug
    else:
        handler = extract_inf_table_template
    handler(wxr, page_data[-1], t_node)

22 

23 

# Lower-cased header texts that are never added to a form as raw tags.
IGNORE_TABLE_HEADERS = frozenset(
    (
        # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "terme",
        # br-flex-adj
        "forme",
    )
)

30 

31 

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into individual pronunciations.

    - Text containing " ou " (French "or") is split on that separator.
    - A leading "ou " is dropped and the remainder returned alone.
    - Text ending with the "\\Prononciation ?\\" placeholder yields an
      empty list.
    - Anything else is returned unchanged as a single-item list.
    """
    separator = " ou "
    if separator in text:
        # two ipa texts in the same line: "en-conj-rég" template
        return text.split(separator)

    leading = "ou "
    if text.startswith(leading):
        return [text[len(leading) :]]

    # inflection table templates use an edit link when the ipa data is
    # missing, and the link usually ends with "\Prononciation ?\"
    if text.endswith("\\Prononciation ?\\"):
        return []

    return [text]

44 

45 

@dataclass
class TableSpanHeader:
    """A table header cell with its text, grid position and span."""

    # cleaned text of the header cell
    text: str
    # first column covered by the header
    col_index: int = 0
    # number of columns covered
    colspan: int = 1
    # first row covered by the header
    row_index: int = 0
    # number of rows covered
    rowspan: int = 1

53 

54 

def _parse_span(attrs: dict, name: str) -> int:
    """Return the ``colspan``/``rowspan`` value stored in *attrs*.

    A missing attribute counts as ``1``. A value that ``int()`` rejects
    (for example ``""`` or ``"2;"``) also falls back to ``1`` so that one
    malformed cell does not abort the extraction of the whole template.
    """
    try:
        return int(attrs.get(name, "1"))
    except ValueError:
        return 1


def extract_inf_table_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    source: str = "",
):
    """Extract forms from the tables of an expanded inflection template.

    The template is expanded and every table in the result is read in two
    passes. The first pass records the header cells with their grid
    position and span. The second pass turns each line of each data cell
    into a ``Form`` tagged with the matching column and row headers.

    Args:
        wxr: extraction context used to expand and clean nodes.
        word_entry: entry whose ``forms`` list receives the new forms.
            Conjugation links found in cells are passed to
            ``process_conj_link_node`` unless the entry has the
            "form-of" tag.
        t_node: the inflection template node.
        source: value stored in the ``source`` field of every new form.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj
    # NOTE(review): imported locally, presumably to avoid a circular import
    from .form_line import is_conj_link, process_conj_link_node

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child_recursively(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        # First pass: collect header cells with their position and span.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Move past columns still covered by headers that started in
            # an earlier row and span down into this one.
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                has_conj_link = False
                for link_node in cell_node.find_child_recursively(
                    NodeKind.LINK
                ):
                    if is_conj_link(wxr, link_node):
                        if "form-of" not in word_entry.tags:
                            # Template:fr-verbe-flexion
                            process_conj_link_node(wxr, link_node, [word_entry])
                        has_conj_link = True
                        break
                if has_conj_link:
                    # a conjugation-link cell is not a header
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _parse_span(cell_node.attrs, "colspan")
                rowspan = _parse_span(cell_node.attrs, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                # Header cells in a row without data cells are column
                # headers; header cells next to data cells are row headers.
                if not row_has_data:
                    col_headers.append(span_header)
                else:
                    row_headers.append(span_header)
                col_index += colspan

        # Second pass: build forms from the data cells.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            last_col_header_row = 0
            # row index of the most recently added column header that
            # starts above this row
            for col_header in col_headers[::-1]:
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # data cells start after the columns taken by row headers
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.col_index <= col_index
                ):
                    col_index += row_header.colspan
            article = ""
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                has_collapsible_div = False
                has_conj_link = False
                for link_node in cell_node.find_child_recursively(
                    NodeKind.LINK
                ):
                    if is_conj_link(wxr, link_node):
                        if "form-of" not in word_entry.tags:
                            process_conj_link_node(wxr, link_node, [word_entry])
                        has_conj_link = True
                        break
                # ignore note in Template:fi-décl-ihminen
                for div_tag in cell_node.find_html("div"):
                    div_class = div_tag.attrs.get("class", "").split()
                    if "mw-collapsible" in div_class:
                        has_collapsible_div = True
                        break
                if has_conj_link or has_collapsible_div:
                    continue
                colspan = _parse_span(cell_node.attrs, "colspan")
                rowspan = _parse_span(cell_node.attrs, "rowspan")
                cell_classes = cell_node.attrs.get("class", "").split()
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "small"
                    ):
                        # Modèle:fr-verbe-flexion
                        # parenthesised <small> text is a tag, not form text
                        raw_tag = clean_node(wxr, None, cell_child)
                        if raw_tag.startswith("(") and raw_tag.endswith(")"):
                            cell_tags.append(raw_tag.strip("() "))
                        else:
                            filtered_cell.append(cell_child)
                    elif (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "span"
                    ):
                        # note ref number in Template:fi-décl-ihminen
                        span_id = cell_child.attrs.get("id", "")
                        if not span_id.startswith("ref-"):
                            filtered_cell.append(cell_child)
                    else:
                        filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                # Template:grc-décl-nomf-1-α-ης
                if "article" in cell_classes:
                    # remember the article for the next cell in this row
                    article = cell_text
                    col_index += colspan
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        # IPA lines belong to the most recently added form
                        if len(word_entry.forms) > 0:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    form = Form(
                        form=line,
                        raw_tags=cell_tags,
                        article=article,
                        source=source,
                    )
                    use_col_tags = []
                    for col_header in col_headers[::-1]:
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in [last_col_header_row, last_col_header_row + 1]
                        ):
                            use_col_tags.append(col_header.text)
                    # headers were visited in reverse; restore their order
                    form.raw_tags.extend(use_col_tags[::-1])
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    # skip empty cells, placeholders and the page title
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
                article = ""

230 

231 

def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract present indicative forms from "avk-tab-conjug" tables.

    Each non-empty data cell becomes a ``Form`` tagged "present" and
    "indicative". The column header at the cell's position and the row
    header, when present, are added as raw tags. The forms are appended
    to ``word_entry.forms``.
    """
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table in root.find_child(NodeKind.TABLE):
        column_titles = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            # a header cell in a row that also has data cells labels the row
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            for position, cell in enumerate(row.find_child(cell_kinds)):
                text = clean_node(wxr, None, cell)
                if text == "":
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_title = text
                    elif text != "Conjugaison Présent Indicatif":
                        # the table title is not a column header
                        column_titles.append(text)
                    continue
                form = Form(form=text, tags=["present", "indicative"])
                if position < len(column_titles):
                    form.raw_tags.append(column_titles[position])
                if row_title != "":
                    form.raw_tags.append(row_title)
                translate_raw_tags(form)
                word_entry.forms.append(form)