Coverage for src/wiktextract/extractor/fr/inflection.py: 95%

133 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1from dataclasses import dataclass 

2from itertools import chain 

3 

4from wikitextprocessor import HTMLNode, NodeKind, TemplateNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import Form, WordEntry 

9from .pronunciation import is_ipa_text 

10from .tags import translate_raw_tags 

11 

12 

def extract_inflection(
    wxr: WiktextractContext, page_data: list[WordEntry], t_node: TemplateNode
) -> None:
    """Extract forms from an inflection template into the latest word entry.

    Inflection templates:
    https://fr.wiktionary.org/wiki/Catégorie:Modèles_d’accord_en_français

    Args:
        wxr: Extraction context, passed through to the table extractors.
        page_data: Word entries collected so far; only the last one
            (``page_data[-1]``) is handed to the extractor.
        t_node: The inflection template node to process.
    """
    # "avk-tab-conjug" has its own extractor; every other template name goes
    # through the generic table extractor.
    if t_node.template_name == "avk-tab-conjug":
        extract_avk_tab_conjug(wxr, page_data[-1], t_node)
    else:
        extract_inf_table_template(wxr, page_data[-1], t_node)

22 

23 

# Lower-cased table header texts that are never added to a form's raw tags.
IGNORE_TABLE_HEADERS = frozenset(
    {
        "terme",  # https://fr.wiktionary.org/wiki/Modèle:de-adj
        "forme",  # br-flex-adj
    }
)

30 

31 

def split_ipa(text: str) -> list[str]:
    """Split one line of IPA text into its individual pronunciations.

    Alternatives on the same line are separated by " ou " ("or"), as produced
    by the "en-conj-rég" template. A dangling leading "ou " is removed before
    splitting, so it cannot survive on the first alternative.

    Inflection table templates use an edit link when the IPA data is missing,
    and that link usually ends with "\\Prononciation ?\\"; such placeholders
    are dropped, whether they are the whole text or just one alternative.

    Args:
        text: A single line of IPA text taken from a table cell.

    Returns:
        The pronunciations found, stripped of surrounding whitespace; empty
        when the line holds nothing but placeholders.
    """
    placeholder_suffix = "\\Prononciation ?\\"
    ipas = []
    for part in text.removeprefix("ou ").split(" ou "):
        part = part.strip()
        if part == "" or part.endswith(placeholder_suffix):
            continue
        ipas.append(part)
    return ipas

44 

45 

@dataclass
class TableSpanHeader:
    """A table header cell together with the grid area it spans."""

    # Cleaned text of the header cell.
    text: str
    # Column index at which the header starts.
    col_index: int = 0
    # Number of columns the header spans.
    colspan: int = 1
    # Index of the table row that contains the header.
    row_index: int = 0
    # Number of rows the header spans.
    rowspan: int = 1

53 

54 

def _cell_span(cell_node, attr_name: str) -> int:
    """Return a cell's ``colspan``/``rowspan`` attribute as an integer.

    Falls back to 1 (the value also used when the attribute is absent) when
    the attribute text is not a valid integer, instead of raising.
    """
    try:
        return int(cell_node.attrs.get(attr_name, "1"))
    except (TypeError, ValueError):
        return 1


def extract_inf_table_template(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract inflected forms from a generic inflection table template.

    Example template: https://fr.wiktionary.org/wiki/Modèle:fro-adj

    The template is expanded and each table in the result is walked twice:

    1. The first pass records every header cell with its position and span.
       Headers in rows without any data cell become column headers; headers
       in rows that also contain data cells become row headers.
    2. The second pass visits the data cells. Each text line of a cell
       becomes a ``Form`` tagged with the headers overlapping the cell; a
       line recognised as IPA is attached to the most recently added form
       instead.

    Cells containing a conjugation link are handed to
    ``process_conj_link_node`` (unless the entry is tagged "form-of") and
    are otherwise skipped.

    Args:
        wxr: Extraction context used for expansion and node cleaning.
        word_entry: Entry whose ``forms`` list receives the extracted forms.
        t_node: The inflection template node.
    """
    from .form_line import is_conj_link, process_conj_link_node

    def consume_conj_link(cell_node) -> bool:
        # Return True when the cell holds a conjugation link; only the first
        # such link is processed.
        for link_node in cell_node.find_child_recursively(NodeKind.LINK):
            if is_conj_link(wxr, link_node):
                if "form-of" not in word_entry.tags:
                    # Template:fr-verbe-flexion
                    process_conj_link_node(wxr, link_node, [word_entry])
                return True
        return False

    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        row_headers = []

        # Pass 1: collect header cells with their grid position and span.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            col_index = 0
            # Start after the columns taken by headers from earlier rows
            # whose rowspan still reaches this row.
            for header in chain(col_headers, row_headers):
                if (
                    header.row_index
                    <= row_index
                    < header.row_index + header.rowspan
                ):
                    col_index += header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_HEADER_CELL):
                if consume_conj_link(cell_node):
                    continue
                cell_text = clean_node(wxr, None, cell_node)
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                span_header = TableSpanHeader(
                    cell_text, col_index, colspan, row_index, rowspan
                )
                if row_has_data:
                    row_headers.append(span_header)
                else:
                    col_headers.append(span_header)
                col_index += colspan

        # Pass 2: turn data cells into forms.
        for row_index, row in enumerate(table.find_child(NodeKind.TABLE_ROW)):
            col_index = 0
            # Row index of the last recorded column header located above
            # this row (0 when there is none).
            last_col_header_row = 0
            for col_header in reversed(col_headers):
                if col_header.row_index < row_index:
                    last_col_header_row = col_header.row_index
                    break
            # Data cells start after the row headers covering this row.
            for row_header in row_headers:
                if (
                    row_header.row_index
                    <= row_index
                    < row_header.row_index + row_header.rowspan
                ):
                    col_index += row_header.colspan
            for cell_node in row.find_child(NodeKind.TABLE_CELL):
                if consume_conj_link(cell_node):
                    continue
                colspan = _cell_span(cell_node, "colspan")
                rowspan = _cell_span(cell_node, "rowspan")
                filtered_cell = []
                cell_tags = []
                for cell_child in cell_node.children:
                    if (
                        isinstance(cell_child, HTMLNode)
                        and cell_child.tag == "small"
                    ):
                        # Modèle:fr-verbe-flexion: a parenthesised <small>
                        # note is a tag for the cell, not part of the form.
                        raw_tag = clean_node(wxr, None, cell_child)
                        if raw_tag.startswith("(") and raw_tag.endswith(")"):
                            cell_tags.append(raw_tag.strip("() "))
                        else:
                            filtered_cell.append(cell_child)
                    else:
                        filtered_cell.append(cell_child)
                cell_text = clean_node(wxr, None, filtered_cell)
                if cell_text == "":
                    continue
                for line in cell_text.splitlines():
                    line = line.removeprefix("ou ").strip()
                    if is_ipa_text(line):
                        # IPA lines belong to the form added just before.
                        if len(word_entry.forms) > 0:
                            word_entry.forms[-1].ipas.extend(split_ipa(line))
                        continue
                    # Give each form its own tag list so header tags added
                    # below cannot leak into other forms from the same cell.
                    form = Form(form=line, raw_tags=list(cell_tags))
                    use_col_tags = []
                    for col_header in reversed(col_headers):
                        if (
                            col_header.col_index < col_index + colspan
                            and col_index
                            < col_header.col_index + col_header.colspan
                            and col_header.text not in form.raw_tags
                            and col_header.text not in use_col_tags
                            and col_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                            # column header above cell and above last header
                            # don't use headers for other top sections
                            # Modèle:eo-conj
                            and col_header.row_index + col_header.rowspan
                            in (last_col_header_row, last_col_header_row + 1)
                        ):
                            use_col_tags.append(col_header.text)
                    # Headers were scanned from last to first; restore
                    # their original order.
                    form.raw_tags.extend(reversed(use_col_tags))
                    for row_header in row_headers:
                        if (
                            row_header.row_index < row_index + rowspan
                            and row_index
                            < row_header.row_index + row_header.rowspan
                            and row_header.text not in form.raw_tags
                            and row_header.text.lower()
                            not in IGNORE_TABLE_HEADERS
                        ):
                            form.raw_tags.append(row_header.text)
                    if form.form not in [
                        "",
                        "—",
                        "non comparable",  # Template:de-adj
                        wxr.wtp.title,
                    ]:
                        translate_raw_tags(form)
                        word_entry.forms.append(form)
                col_index += colspan

195 

196 

def extract_avk_tab_conjug(
    wxr: WiktextractContext, word_entry: WordEntry, t_node: TemplateNode
):
    """Extract present indicative forms from the "avk-tab-conjug" template.

    https://fr.wiktionary.org/wiki/Modèle:avk-tab-conjug

    Header cells in rows without data cells are collected as column headers
    (the table title "Conjugaison Présent Indicatif" is skipped); a header
    cell in a row that has data cells is that row's header. Every non-empty
    data cell becomes a ``Form`` tagged "present" and "indicative", plus the
    column header at the cell's position and the row header as raw tags.

    Args:
        wxr: Extraction context used for expansion and node cleaning.
        word_entry: Entry whose ``forms`` list receives the extracted forms.
        t_node: The "avk-tab-conjug" template node.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for table in expanded_node.find_child(NodeKind.TABLE):
        col_headers = []
        for row in table.find_child(NodeKind.TABLE_ROW):
            row_header = ""
            row_has_data = row.contain_node(NodeKind.TABLE_CELL)
            # col_index counts header and data cells alike, including the
            # empty ones skipped below.
            for col_index, cell in enumerate(
                row.find_child(NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL)
            ):
                cell_text = clean_node(wxr, None, cell)
                if cell_text == "":
                    continue
                if cell.kind == NodeKind.TABLE_HEADER_CELL:
                    if row_has_data:
                        row_header = cell_text
                    elif cell_text != "Conjugaison Présent Indicatif":
                        col_headers.append(cell_text)
                else:
                    form = Form(form=cell_text, tags=["present", "indicative"])
                    if col_index < len(col_headers):
                        form.raw_tags.append(col_headers[col_index])
                    if row_header != "":
                        form.raw_tags.append(row_header)
                    translate_raw_tags(form)
                    word_entry.forms.append(form)