Coverage for src/wiktextract/extractor/fr/inflection.py: 96%

142 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-11-03 05:44 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .pronunciation import is_ipa_text 

10from .tags import translate_raw_tags 

11 

12 

def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
):
    """Extract forms from an inflection template into the last word entry.

    The "avk-tab-conjug" template is handled by `extract_avk_tab_conjug`;
    every other template is handled by `extract_inf_table_template`. In both
    cases the forms are added to `page_data[-1]`.
    """
    # inflection templates
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français
    handler = (
        extract_avk_tab_conjug
        if t_node.template_name == "avk-tab-conjug"
        else extract_inf_table_template
    )
    handler(wxr, page_data[-1], t_node)

22 

23 

# Lower-cased table header texts that are never added to a form as raw tags.
IGNORE_TABLE_HEADERS = frozenset(
    (
        # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "terme",
        # br-flex-adj
        "forme",
    )
)

30 

31 

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into the individual IPA strings.

    A leading "ou " (or) is dropped, the remainder is split on " ou ", and
    each part is stripped of surrounding whitespace. Empty parts and parts
    that are the "missing pronunciation" edit-link placeholder are discarded,
    so the result may be an empty list.

    The three rules are applied together rather than as alternatives: a line
    such as "ou A ou B" yields ["A", "B"], and a placeholder that appears
    next to a real IPA is dropped instead of being returned as data.
    """
    ipas = []
    # two ipa texts in the same line: "en-conj-rég" template
    for part in text.removeprefix("ou ").split(" ou "):
        part = part.strip()
        # inflection table templates use an edit link when the ipa data is
        # missing, and the link usually ends with "\Prononciation ?\"
        if part == "" or part.endswith("\\Prononciation ?\\"):
            continue
        ipas.append(part)
    return ipas

44 

45 

@dataclass
class TableSpanHeader:
    """A table header cell together with the grid area it spans.

    Instances are built positionally by `extract_inf_table_template`, so the
    field order below is part of the contract.
    """

    # cleaned text of the header cell
    text: str
    # index of the first column covered by the header
    col_index: int = 0
    # number of columns covered (the cell's "colspan" attribute)
    colspan: int = 1
    # index of the first table row covered by the header
    row_index: int = 0
    # number of rows covered (the cell's "rowspan" attribute)
    rowspan: int = 1

53 

54 

def _span_attr(cell_node, attr_name: str) -> int:
    """Return the integer value of a "colspan"/"rowspan" cell attribute.

    A missing attribute counts as 1. A value that `int()` can't parse (for
    example an empty string) also counts as 1, so that one malformed table
    cell doesn't abort the extraction.
    """
    try:
        return int(cell_node.attrs.get(attr_name, "1"))
    except ValueError:
        return 1


def _consume_conj_link(
    wxr: WiktextractContext, word_entry: WordEntry, cell_node
) -> bool:
    """Return True if `cell_node` contains a conjugation page link.

    Only the first link accepted by `is_conj_link` is considered. It is
    passed to `process_conj_link_node` unless the entry is tagged "form-of".
    """
    # imported here rather than at module level, as the original code did
    from .form_line import is_conj_link, process_conj_link_node

    for link_node in cell_node.find_child_recursively(NodeKind.LINK):
        if is_conj_link(wxr, link_node):
            if "form-of" not in word_entry.tags:
                # Template:fr-verbe-flexion
                process_conj_link_node(wxr, link_node, [word_entry])
            return True
    return False


def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract forms from the tables of an expanded inflection template.

    The template is expanded and every table in it is read in two passes:

    1. Header cells are collected as `TableSpanHeader` objects. Headers in
       rows without data cells become column headers, headers in rows with
       data cells become row headers.
    2. Data cells are turned into `Form` objects appended to
       `word_entry.forms`. Each form gets the texts of the overlapping
       column and row headers as raw tags. Lines recognised by
       `is_ipa_text` are added to the `ipas` of the previous form instead.

    Cells styled "display:none" and cells containing a conjugation link are
    skipped in both passes and don't advance the column position.
    """
    # https://fr.wiktionary.org/wiki/Modèle:fro-adj
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []
        # first pass: collect header cells and their spans
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # shift the start column past headers of previous rows that
            # still span into this row
            for header in chain(col_headers, row_headers):
                if (
                    row_index > header.row_index
                    and row_index < header.row_index + header.rowspan
                    and header.col_index <= col_index
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                if _consume_conj_link(wxr, word_entry, cell_node):
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _span_attr(cell_node, "colspan")
                rowspan = _span_attr(cell_node, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                if not row_has_data:
                    col_headers.append(span_header)
                else:
                    row_headers.append(span_header)
                col_index += colspan

        # second pass: create forms from data cells
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # row of the most recently collected column header located
            # above this row
            last_col_header_row = 0
            for col_header in col_headers[::-1]:
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # data cells start after the row headers covering this row
            for row_header in row_headers:
                if (
                    row_index >= row_header.row_index
                    and row_index < row_header.row_index + row_header.rowspan
                    and row_header.col_index <= col_index
                ):
                    col_index += row_header.colspan
            article = ""
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if cell_node.attrs.get("style") == "display:none":
                    continue
                if _consume_conj_link(wxr, word_entry, cell_node):
                    continue
                colspan = _span_attr(cell_node, "colspan")
                rowspan = _span_attr(cell_node, "rowspan")
                cell_classes = cell_node.attrs.get("class", "").split()
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "small"
                    ):
                        # Modèle:fr-verbe-flexion
                        # "(...)" inside <small> is a tag, not form text
                        raw_tag = clean_node(wxr, None, cell_child)
                        if raw_tag.startswith("(") and raw_tag.endswith(")"):
                            cell_tags.append(raw_tag.strip("() "))
                        else:
                            filtered_cell.append(cell_child)
                    else:
                        filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                # Template:grc-décl-nomf-1-α-ης
                if "article" in cell_classes:
                    # keep the article for the forms of the next cell
                    article = cell_text
                    col_index += colspan
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        if len(word_entry.forms) > 0:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    # NOTE(review): copy the tag list so that forms created
                    # from different lines of one cell can't share a list,
                    # however Form stores the argument
                    form = Form(
                        form=line, raw_tags=list(cell_tags), article=article
                    )
                    use_col_tags = []
                    for col_header in col_headers[::-1]:
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in [last_col_header_row, last_col_header_row + 1]
                        ):
                            use_col_tags.append(col_header.text)
                    # headers were visited in reverse, restore table order
                    form.raw_tags.extend(use_col_tags[::-1])
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan
                article = ""

207 

208 

def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract present indicative forms from the "avk-tab-conjug" template.

    The template is expanded and every non-empty data cell of its tables is
    appended to `word_entry.forms` with the tags "present" and "indicative".
    The column header at the same cell position (if any) and the header cell
    of the same row (if any) are added as raw tags.
    """
    # https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug
    wikitext = wxr.wtp.node_to_wikitext(t_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    cell_kinds = NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
    for table_node in root.find_child(NodeKind.TABLE):
        column_titles = []
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            row_title = ""
            # a header cell in a row that also has data cells is a row header
            row_has_data = row_node.contain_node(NodeKind.TABLE_CELL)
            for position, cell_node in enumerate(
                row_node.find_child(cell_kinds)
            ):
                text = clean_node(wxr, None, cell_node)
                if text == "":
                    continue
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_title = text
                    elif text != "Conjugaison Présent Indicatif":
                        # the table title is not a column header
                        column_titles.append(text)
                    continue
                form = Form(form=text, tags=["present", "indicative"])
                if position < len(column_titles):
                    form.raw_tags.append(column_titles[position])
                if row_title != "":
                    form.raw_tags.append(row_title)
                translate_raw_tags(form)
                word_entry.forms.append(form)