Coverage for src/wiktextract/extractor/fr/gloss.py: 95%

168 statements  

« prev     ^ index     » next       coverage.py v7.11.0, created at 2025-10-29 08:06 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import calculate_bold_offsets 

9from .etymology import ATTESTATION_TEMPLATES, extract_date_template 

10from .models import AltForm, AttestationData, Example, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

13 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract one sense per list item of `list_node` into `page_data[-1]`.

    Every sense starts as a deep copy of `parent_sense` (with its examples
    removed) or as an empty `Sense`. Tag-like nodes are pulled out of the
    list item first, the remaining nodes become the gloss text, and whatever
    follows the last "note" template becomes the sense note. A sense is
    appended to `page_data[-1].senses` only if it ends up with a gloss.

    Nested lists whose marker ends with "#" are handled recursively with the
    current sense as parent; nested lists whose marker ends with "*" are read
    as example lists.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # examples belong to the parent sense only
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # only a template that expands to a single "(...)" group is
                # a tag; any other template stays part of the gloss text
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # pieces explicitly instead of testing the list length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this lower bound `index - 1` is -1 for the first
                # node and would silently look at the last node instead
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # position of the last "note" template, or the list length if none
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        gloss_text = gloss_text.strip("— \n")
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                for e_list_item in nest_gloss_list.find_child(
                    NodeKind.LIST_ITEM
                ):
                    extract_example_list_item(wxr, gloss_data, e_list_item)

        # runs after the nested lists, so child senses copied the raw tags
        # before this call
        translate_raw_tags(gloss_data)

113 

114 

def extract_example_list_item(
    wxr: WiktextractContext, sense: Sense, list_item: WikiNode
):
    """Read one example list item and add the resulting example to `sense`.

    Templates whose name ends with "exemple" and "lang" templates produce the
    example themselves. Otherwise the example text is built from the
    remaining child nodes, and templates that expand to "(...)" are kept as
    raw tags for it. A nested list supplies the translation.
    """
    example = Example()
    text_nodes = []
    pending_tags = []
    template_made_example = False

    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name.endswith("exemple"):
                process_exemple_template(
                    wxr, child, sense, raw_tags=pending_tags
                )
                template_made_example = True
            elif name == "source":
                example.ref = clean_node(wxr, sense, child).strip("— ()")
            elif name.lower() == "lang":
                example = extract_lang_example_template(wxr, child)
                if example.text != "":
                    sense.examples.append(example)
                template_made_example = True
            else:
                expanded = clean_node(wxr, sense, child)
                if expanded.startswith("(") and expanded.endswith(")"):
                    pending_tags.append(expanded.strip("() "))
                else:
                    text_nodes.append(child)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            # each item of a nested list overwrites the translation
            for tr_item in child.find_child(NodeKind.LIST_ITEM):
                example.translation = clean_node(wxr, None, tr_item.children)
                tr_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(tr_item.children)
                )
                calculate_bold_offsets(
                    wxr,
                    tr_root,
                    example.translation,
                    example,
                    "bold_translation_offsets",
                )
            continue

        text_nodes.append(child)

    if template_made_example:
        return

    example.text = clean_node(wxr, sense, text_nodes)
    if example.text == "":
        return

    example.raw_tags.extend(pending_tags)
    translate_raw_tags(example)
    text_root = wxr.wtp.parse(wxr.wtp.node_to_wikitext(text_nodes))
    calculate_bold_offsets(
        wxr, text_root, example.text, example, "bold_text_offsets"
    )
    sense.examples.append(example)

166 

167 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
    raw_tags: list[str] | None = None,
) -> Example:
    """Build an `Example` from an "exemple"-style template.

    Parameters read from the template: 1 is the example text, 2 (or "sens")
    the translation, 3 (or "tr") the transcription, and "source" the
    reference.

    Args:
        wxr: extraction context.
        node: the template node to read.
        gloss_data: sense that receives the example when the example text is
            not empty; may be `None` to only build and return the example.
        attestations: attestation data stored on the example; defaults to an
            empty list.
        raw_tags: raw tags stored on the example; defaults to an empty list.

    Returns:
        The example, after `translate_raw_tags` has been applied to it.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    # `None` defaults replace the former `[]` defaults so that no list object
    # is shared between calls
    if attestations is None:
        attestations = []
    if raw_tags is None:
        raw_tags = []

    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
        raw_tags=raw_tags,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded; presumably called so that the template's
        # categories are collected on the sense (confirm in `clean_node`)
        clean_node(wxr, gloss_data, node)
    translate_raw_tags(example_data)
    return example_data

222 

223 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return gloss text, remove tag template expanded from "variante *" templates.

    Side effects while the nodes are scanned:

    - a "variante ..." template adds its "dif" (or first) argument to
      `gloss_data.alt_of` and tags the sense "alt-of"; a "désuet" template
      found in its expansion becomes a raw tag instead of gloss text;
    - an "équiv-pour" template adds forms to `word_entry.forms` and is left
      out of the gloss text;
    - for a "typographic variant" entry with no "variante" template, the
      last link found in the nodes is used as the alt-of word.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    filtered_gloss_nodes = []
    for gloss_node in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if isinstance(
            gloss_node, TemplateNode
        ) and gloss_node.template_name.startswith("variante "):
            alt_of = clean_node(
                wxr, None, gloss_node.template_parameters.get("dif", "")
            )
            if len(alt_of) == 0:
                alt_of = clean_node(
                    wxr, None, gloss_node.template_parameters.get(1, "")
                )
            if len(alt_of) > 0:
                gloss_data.alt_of.append(AltForm(word=alt_of))
                # the sense may already carry the tag (e.g. copied from a
                # parent sense); same check as the typographic branch below
                if "alt-of" not in gloss_data.tags:
                    gloss_data.tags.append("alt-of")
            # expand only this template so that templates nested in its
            # output are still visible as template nodes
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for node in expanded_template.children:
                if (
                    isinstance(node, TemplateNode)
                    and node.template_name == "désuet"
                ):
                    raw_tag = clean_node(wxr, gloss_data, node).strip(" ()")
                    gloss_data.raw_tags.append(raw_tag)
                else:
                    filtered_gloss_nodes.append(node)
        elif (
            isinstance(gloss_node, TemplateNode)
            and gloss_node.template_name == "équiv-pour"
        ):
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # the current sense has not been appended yet, hence `+ 1`
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            filtered_gloss_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for gloss_node in filter(
            lambda n: isinstance(n, WikiNode), gloss_nodes
        ):
            # use the last link
            if gloss_node.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, gloss_node)
            if isinstance(gloss_node, TemplateNode):
                gloss_node = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
                )
            # NOTE(review): this search is assumed to run for every wiki
            # node, not only for expanded templates - confirm the nesting
            for link in gloss_node.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if len(alt_of) > 0:
            gloss_data.alt_of.append(AltForm(word=alt_of))
            if "alt-of" not in gloss_data.tags:
                gloss_data.tags.append("alt-of")

    gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unbalanced parentheses are taken as leftovers of a removed tag: trim
    # parentheses and spaces from both ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

306 

307 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Record the word that a form-of gloss refers to on `gloss_data`.

    The word comes from the last link node, or from the first argument of
    the last "mutation de"/"lien" template, whichever appears later in
    `gloss_nodes`. Nothing is changed when no word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            target = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_arg = item.template_parameters.get(1, "")
            target = clean_node(wxr, None, first_arg)

    if len(target) == 0:
        return
    gloss_data.form_of.append(AltForm(word=target))
    if "form-of" not in gloss_data.tags:
        gloss_data.tags.append("form-of")

328 

329 

def extract_lang_example_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> Example:
    """Build an `Example` whose text is the template's second argument.

    The argument is re-parsed so that bold spans can be located in the
    cleaned text and stored under "bold_text_offsets".
    """
    raw_text = t_node.template_parameters.get(2, "")
    parsed_text = wxr.wtp.parse(wxr.wtp.node_to_wikitext(raw_text))
    cleaned_text = clean_node(wxr, None, parsed_text)
    example = Example(text=cleaned_text)
    calculate_bold_offsets(
        wxr, parsed_text, cleaned_text, example, "bold_text_offsets"
    )
    return example