Coverage for src/wiktextract/extractor/fr/gloss.py: 95%

156 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import calculate_bold_offsets 

9from .etymology import ATTESTATION_TEMPLATES, extract_date_template 

10from .models import AltForm, AttestationData, Example, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

13 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list into the last entry of `page_data`.

    Every list item of `list_node` yields one `Sense`.  When `parent_sense`
    is given the new sense starts as a deep copy of it (with the examples
    cleared), so data already collected on the parent is inherited.  The
    sense is appended to `page_data[-1].senses` only if it ends up with at
    least one gloss.

    Nested lists are handled per list marker: a marker ending with "#" is
    processed recursively with the current sense as parent, a marker ending
    with "*" is processed as a list of examples of the current sense.

    :param wxr: extraction context passed through to the helpers.
    :param page_data: word entries of the page; only the last one is used.
    :param list_node: the gloss list node to walk.
    :param parent_sense: sense of the enclosing list item, if any.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        # everything in the list item except nested lists
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        # indexes of nodes that are tags/attestations, not gloss text
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # a template that expands to a single "(...)" group is
                # treated as a list of tags separated by ", "
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    tags = expanded_text.strip("() \n").split(", ")
                    if len(tags) > 0:
                        gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # the first node has no predecessor; without this guard
                # `index - 1` would wrap around to the last node
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note text; the loop has no break so the last "note" wins
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        gloss_text = gloss_text.strip("— \n")
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                for e_list_item in nest_gloss_list.find_child(
                    NodeKind.LIST_ITEM
                ):
                    extract_example_list_item(wxr, gloss_data, e_list_item)

        translate_raw_tags(gloss_data)

113 

114 

def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
):
    """Extract one example from an example list item and add it to `sense`.

    Templates whose name ends with "exemple" are delegated to
    `process_exemple_template`; in that case nothing else from the list item
    is added here.  Otherwise the example text is built from the remaining
    nodes, "source" templates give the reference, templates expanding to
    "(...)" give raw tags, and nested list items give the translation.
    """
    example = Example()
    text_nodes = []
    pending_tags = []
    used_exemple_template = False

    for child in list_item.children:
        if not isinstance(child, TemplateNode):
            if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
                # nested list items hold the translation; the last one wins
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
            else:
                text_nodes.append(child)
            continue

        name = child.template_name
        if name.endswith("exemple"):
            process_exemple_template(
                wxr, child, sense, raw_tags=pending_tags
            )
            used_exemple_template = True
        elif name == "source":
            example.ref = clean_node(wxr, sense, child).strip("— ()")
        else:
            expanded = clean_node(wxr, sense, child)
            if expanded.startswith("(") and expanded.endswith(")"):
                pending_tags.append(expanded.strip("() "))
            else:
                text_nodes.append(child)

    if used_exemple_template:
        return

    example.text = clean_node(wxr, sense, text_nodes)
    if example.text == "":
        return

    example.raw_tags.extend(pending_tags)
    translate_raw_tags(example)
    reparsed_text = wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_nodes))
    calculate_bold_offsets(
        wxr, reparsed_text, example.text, example, "bold_text_offsets"
    )
    sense.examples.append(example)

154 

155 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
    raw_tags: list[str] | None = None,
) -> Example:
    """Build an `Example` from an "exemple"-style template.

    Template arguments used: `1` is the example text, `2` (or "sens") the
    translation, `3` (or "tr") the romanization and "source" the reference.

    :param wxr: extraction context.
    :param node: the example template node.
    :param gloss_data: sense that receives the example; the example is
        appended only when its text is not empty.  When not ``None`` the
        whole template is also passed to `clean_node` with this sense
        (presumably to collect categories - confirm against `clean_node`).
    :param attestations: attestation data stored on the example; defaults
        to an empty list.
    :param raw_tags: raw tags stored on the example; defaults to an empty
        list.
    :return: the created example, after `translate_raw_tags` was applied.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple

    # `None` sentinels instead of `[]` defaults: a default list object is
    # created once and shared by every call that omits the argument
    if attestations is None:
        attestations = []
    if raw_tags is None:
        raw_tags = []

    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
        raw_tags=raw_tags,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        clean_node(wxr, gloss_data, node)
    translate_raw_tags(example_data)
    return example_data

210 

211 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text without the tag template that "variante *"
    templates expand to, recording alt-of words on `gloss_data` and forms
    from "équiv-pour" templates on `word_entry` along the way.
    """
    from .form_line import process_equiv_pour_template

    alt_word = ""
    kept_nodes = []
    for current in gloss_nodes:
        is_template = isinstance(current, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and current.template_name.startswith("variante "):
            params = current.template_parameters
            # the "dif" argument takes precedence over the first argument
            alt_word = clean_node(wxr, None, params.get("dif", ""))
            if not alt_word:
                alt_word = clean_node(wxr, None, params.get(1, ""))
            if alt_word:
                gloss_data.alt_of.append(AltForm(word=alt_word))
                gloss_data.tags.append("alt-of")
            # expand only this template so the templates it emits stay
            # visible as nodes and the "désuet" tag can be pulled out
            reparsed = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(current),
                pre_expand=True,
                additional_expand={current.template_name},
            )
            for child in reparsed.children:
                is_obsolete_tag = (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                )
                if is_obsolete_tag:
                    tag_text = clean_node(wxr, gloss_data, child)
                    gloss_data.raw_tags.append(tag_text.strip(" ()"))
                else:
                    kept_nodes.append(child)
        elif is_template and current.template_name == "équiv-pour":
            for form in process_equiv_pour_template(wxr, current, []):
                form.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form)
        else:
            kept_nodes.append(current)

    if alt_word == "" and word_entry.pos == "typographic variant":
        for current in gloss_nodes:
            if not isinstance(current, WikiNode):
                continue
            # use the last link
            if current.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, current)
            if isinstance(current, TemplateNode):
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(current), expand_all=True
                )
                for link in expanded.find_child_recursively(NodeKind.LINK):
                    alt_word = clean_node(wxr, None, link)
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))
            if "alt-of" not in gloss_data.tags:
                gloss_data.tags.append("alt-of")

    text = clean_node(wxr, gloss_data, kept_nodes)
    text = re.sub(r"\s+\.$", ".", text)
    # unbalanced parentheses are leftovers of removed tags: trim them
    if text.count("(") != text.count(")"):
        text = text.strip(" ()")
    return text

294 

295 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Record the word this gloss is a form of on `gloss_data`.

    The word is taken from the last link node, or from the first argument
    of the last "mutation de"/"lien" template, whichever comes later in
    `gloss_nodes`.  Nothing is recorded when no such word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target = ""
    for item in gloss_nodes:
        if isinstance(item, WikiNode) and item.kind == NodeKind.LINK:
            target = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_arg = item.template_parameters.get(1, "")
            target = clean_node(wxr, None, first_arg)

    if not target:
        return
    gloss_data.form_of.append(AltForm(word=target))
    if "form-of" not in gloss_data.tags:
        gloss_data.tags.append("form-of")