Coverage for src/wiktextract/extractor/fr/gloss.py: 94%

148 statements  

« prev     ^ index     » next       coverage.py v7.10.6, created at 2025-09-12 08:27 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import calculate_bold_offsets 

9from .etymology import ATTESTATION_TEMPLATES, extract_date_template 

10from .models import AltForm, AttestationData, Example, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

13 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list into the last entry of `page_data`.

    For every list item of `list_node` a `Sense` is built: a deep copy of
    `parent_sense` (with its examples removed) when one is given, otherwise
    an empty `Sense`. Leading tag templates and parenthesised italic nodes
    are moved to `raw_tags`, the remaining nodes become the gloss text, and
    everything after a "note" template becomes the sense note. The sense is
    appended to `page_data[-1].senses` only when it has at least one gloss.

    Nested lists are then handled: a list whose marker ends with "#" is
    processed recursively with the current sense as parent, a list whose
    marker ends with "*" is passed to `extract_examples`.

    :param wxr: extraction context passed through to the helper functions.
    :param page_data: word entries of the page; only the last one is changed.
    :param list_node: the list node that holds the gloss list items.
    :param parent_sense: sense of the enclosing list item for nested glosses.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # examples belong to the parent sense only, don't inherit them
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # only a template that expands to a single "(...)" group is
                # treated as a tag template
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # items explicitly instead of checking the list length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would wrap around to the
                # last node when the italic node is the first one
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template are the gloss, nodes after it are
        # the note; the loop doesn't stop at the first match, so the last
        # "note" template wins
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        gloss_text = gloss_text.strip("— \n")
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)

110 

111 

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Add the examples found in `example_list_node` to `gloss_data`.

    A list item that starts with a template whose name ends with "exemple"
    is delegated to `process_exemple_template`. Any other non-empty item is
    turned into an `Example` directly: a "source" template gives the
    reference, a nested list gives the translation and the remaining
    children give the example text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if len(children) == 0:
            continue

        head = children[0]
        starts_with_exemple = isinstance(
            head, TemplateNode
        ) and head.template_name.endswith("exemple")
        if starts_with_exemple:
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes already consumed as reference or translation
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            is_source = (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            )
            if is_source:
                ref_text = clean_node(wxr, None, child)
                example.ref = ref_text.strip("— ()")
                consumed.append(child)
            elif child.kind == NodeKind.LIST:
                # every list item overwrites the previous translation
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)

151 

152 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
) -> Example:
    """Build an `Example` from an "exemple" template node.

    Template parameters used:
    - `1`: example text
    - `2`, or `sens` when `2` is missing: translation
    - `3`, or `tr` when `3` is missing: romanization
    - `source`: reference

    Bold offsets are calculated for the text, translation and romanization.
    The example is appended to `gloss_data.examples` when `gloss_data` is a
    `Sense` and the text is not empty.

    :param wxr: extraction context.
    :param node: the example template node.
    :param gloss_data: sense that receives the example, or `None`.
    :param attestations: attestation data stored in the example; defaults to
        an empty list.
    :return: the created `Example`, also when it was not added to a sense.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    if attestations is None:
        # don't use a mutable default value, it's shared between calls
        attestations = []
    # the arguments are parsed again so the resulting nodes can be passed to
    # `calculate_bold_offsets`
    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # the text result is not used here; presumably this call collects
        # the categories added by the template into `gloss_data`
        clean_node(wxr, gloss_data, node)
    return example_data

204 

205 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return gloss text, remove tag template expanded from "variante *" templates.

    Side effects:
    - "variante *" templates: the `dif` argument, or the first argument when
      `dif` is empty, is added to `gloss_data.alt_of` with the "alt-of" tag.
      "désuet" templates found in the expanded template are added to
      `gloss_data.raw_tags` and left out of the gloss text.
    - "équiv-pour" templates: the forms returned by
      `process_equiv_pour_template` are added to `word_entry.forms` and the
      template is left out of the gloss text.
    - when no "variante *" word was found and the entry POS is
      "typographic variant", the last link in `gloss_nodes` is used as the
      "alt-of" word.

    :param wxr: extraction context.
    :param gloss_nodes: nodes of the gloss, without tag and note nodes.
    :param word_entry: the entry that receives "équiv-pour" forms.
    :param gloss_data: the sense that receives alt-of data and tags.
    :return: the cleaned gloss text.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    filtered_gloss_nodes = []
    for gloss_node in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if isinstance(
            gloss_node, TemplateNode
        ) and gloss_node.template_name.startswith("variante "):
            alt_of = clean_node(
                wxr, None, gloss_node.template_parameters.get("dif", "")
            )
            if len(alt_of) == 0:
                alt_of = clean_node(
                    wxr, None, gloss_node.template_parameters.get(1, "")
                )
            if len(alt_of) > 0:
                gloss_data.alt_of.append(AltForm(word=alt_of))
                # the sense could be a copy of the parent sense and already
                # have this tag, don't add it twice
                if "alt-of" not in gloss_data.tags:
                    gloss_data.tags.append("alt-of")
            # expand only this template, so the templates it uses are still
            # template nodes and "désuet" can be found by name
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for node in expanded_template.children:
                if (
                    isinstance(node, TemplateNode)
                    and node.template_name == "désuet"
                ):
                    raw_tag = clean_node(wxr, gloss_data, node).strip(" ()")
                    gloss_data.raw_tags.append(raw_tag)
                else:
                    filtered_gloss_nodes.append(node)
        elif (
            isinstance(gloss_node, TemplateNode)
            and gloss_node.template_name == "équiv-pour"
        ):
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # the current sense is not in `word_entry.senses` yet
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            filtered_gloss_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for gloss_node in gloss_nodes:
            if not isinstance(gloss_node, WikiNode):
                continue
            # use the last link
            if gloss_node.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, gloss_node)
            link_root = gloss_node
            if isinstance(gloss_node, TemplateNode):
                # expand the template to find the links it creates
                link_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(gloss_node), expand_all=True
                )
            for link in link_root.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if len(alt_of) > 0:
            gloss_data.alt_of.append(AltForm(word=alt_of))
            if "alt-of" not in gloss_data.tags:
                gloss_data.tags.append("alt-of")

    gloss_text = clean_node(wxr, gloss_data, filtered_gloss_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unbalanced parentheses are leftovers of removed nodes, strip them from
    # both ends of the text
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

288 

289 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Add the word this sense is a form of to `gloss_data.form_of`.

    The word is the text of a link node or the first argument of a
    "mutation de" or "lien" template. The last matching node is used. When a
    word is found the "form-of" tag is added too, unless it's already there.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    base_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_arg = item.template_parameters.get(1, "")
            base_word = clean_node(wxr, None, first_arg)

    if len(base_word) == 0:
        return
    gloss_data.form_of.append(AltForm(word=base_word))
    if "form-of" not in gloss_data.tags:
        gloss_data.tags.append("form-of")