Coverage for src/wiktextract/extractor/fr/gloss.py: 96%

143 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import calculate_bold_offsets 

9from .etymology import ATTESTATION_TEMPLATES, extract_date_template 

10from .models import AltForm, AttestationData, Example, Sense, WordEntry 

11from .tags import translate_raw_tags 

12 

13 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Extract senses from a gloss list into ``page_data[-1].senses``.

    Each list item of ``list_node`` produces one ``Sense``. When
    ``parent_sense`` is given, the new sense starts as a deep copy of it
    (with the copied examples cleared). Nested lists whose marker ends
    with ``#`` are processed recursively as child glosses, and nested
    lists whose marker ends with ``*`` are processed as examples.

    Args:
        wxr: Extraction context passed through to the helper functions.
        page_data: Word entries of the page; only the last one is updated.
        list_node: The gloss list node to walk.
        parent_sense: Sense of the enclosing list item, if any.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name in ATTESTATION_TEMPLATES
            ):
                gloss_data.attestations = extract_date_template(
                    wxr, gloss_data, gloss_node
                )
                tag_indexes.add(index)
            elif (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # A template that expands to a single "(...)" group is
                # treated as a list of tags rather than gloss text.
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` always returns at least one item, so
                    # drop empty strings explicitly (e.g. from "()").
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would wrap around to the
                # last node when the italic node comes first
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # Nodes before the (last) "note" template form the gloss text,
        # nodes after it form the note.
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)

109 

110 

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Collect the examples found in an example list into ``gloss_data``.

    A list item whose first non-empty child is a template with a name
    ending in "exemple" is delegated to ``process_exemple_template``.
    Any other item is treated as a plain example: "source" templates
    supply the reference, nested lists supply the translation, and the
    remaining children supply the example text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes consumed as reference/translation, excluded from the text
        skipped = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                example.ref = clean_node(wxr, None, child).strip("— ()")
                skipped.append(child)
            elif child.kind == NodeKind.LIST:
                # the last list item wins when there are several
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                skipped.append(child)

        text_nodes = [child for child in children if child not in skipped]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)

150 

151 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    attestations: list[AttestationData] | None = None,
) -> Example:
    """Build an ``Example`` from an "exemple"-style template.

    The example text is read from parameter 1, the translation from
    parameter 2 (falling back to "sens"), the romanization from
    parameter 3 (falling back to "tr") and the reference from "source".
    Bold offsets are computed for text, translation and romanization.

    Args:
        wxr: Extraction context.
        node: The template node to read.
        gloss_data: Object that receives the example when it is a
            ``Sense`` and the example text is not empty. When it is not
            ``None``, the whole template is also passed to ``clean_node``
            together with it.
        attestations: Attestation data stored on the example; defaults
            to a new empty list on every call.

    Returns:
        The created example, whether or not it was appended.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    if attestations is None:
        # avoid a mutable default argument shared between calls
        attestations = []
    text_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node.template_parameters.get(1, ""))
    )
    text = clean_node(wxr, None, text_arg)
    trans_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                2, node.template_parameters.get("sens", "")
            )
        )
    )
    translation = clean_node(wxr, None, trans_arg)
    roman_arg = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(
            node.template_parameters.get(
                3, node.template_parameters.get("tr", "")
            )
        )
    )
    transcription = clean_node(wxr, None, roman_arg)
    source = clean_node(wxr, None, node.template_parameters.get("source", ""))
    example_data = Example(
        text=text,
        translation=translation,
        roman=transcription,
        ref=source,
        attestations=attestations,
    )
    calculate_bold_offsets(
        wxr, text_arg, text, example_data, "bold_text_offsets"
    )
    calculate_bold_offsets(
        wxr, trans_arg, translation, example_data, "bold_translation_offsets"
    )
    calculate_bold_offsets(
        wxr, roman_arg, transcription, example_data, "bold_roman_offsets"
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded; the call is made for whatever clean_node
        # records on `gloss_data` (presumably categories - TODO confirm)
        clean_node(wxr, gloss_data, node)
    return example_data

203 

204 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Build the gloss text and record alternative-form data on the way.

    "variante *" templates add an ``alt_of`` entry and the "alt-of" tag
    to `gloss_data`, and any "désuet" template in their expansion becomes
    a raw tag instead of gloss text. "équiv-pour" templates add forms to
    `word_entry`. For "typographic variant" entries without such a
    template, the last link found in the gloss is used as the alt form.
    """
    from .form_line import process_equiv_pour_template

    alt_word = ""
    kept_nodes = []
    for item in gloss_nodes:
        is_template = isinstance(item, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and item.template_name.startswith("variante "):
            alt_word = clean_node(
                wxr, None, item.template_parameters.get("dif", "")
            )
            if not alt_word:
                alt_word = clean_node(
                    wxr, None, item.template_parameters.get(1, "")
                )
            if alt_word:
                gloss_data.alt_of.append(AltForm(word=alt_word))
                gloss_data.tags.append("alt-of")
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(item),
                pre_expand=True,
                additional_expand={item.template_name},
            )
            for child in expanded.children:
                is_obsolete_tag = (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                )
                if is_obsolete_tag:
                    raw_tag = clean_node(wxr, gloss_data, child).strip(" ()")
                    gloss_data.raw_tags.append(raw_tag)
                else:
                    kept_nodes.append(child)
        elif is_template and item.template_name == "équiv-pour":
            for form in process_equiv_pour_template(wxr, item, []):
                form.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form)
        else:
            kept_nodes.append(item)

    if not alt_word and word_entry.pos == "typographic variant":
        for item in gloss_nodes:
            if not isinstance(item, WikiNode):
                continue
            # use the last link
            if item.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, item)
            link_root = item
            if isinstance(item, TemplateNode):
                link_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(item), expand_all=True
                )
            for link in link_root.find_child_recursively(NodeKind.LINK):
                alt_word = clean_node(wxr, None, link)
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unequal numbers of "(" and ")": trim stray parentheses at the ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

285 

286 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Add the base word of a form-of gloss to ``gloss_data.form_of``.

    The word is the cleaned text of the last link node, or the first
    parameter of the last "mutation de"/"lien" template, whichever comes
    later in ``gloss_nodes``. Nothing is added when no word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    form_templates = ("mutation de", "lien")
    base_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, item)
        elif (
            isinstance(item, TemplateNode)
            and item.template_name in form_templates
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            base_word = clean_node(
                wxr, None, item.template_parameters.get(1, "")
            )
    if base_word:
        gloss_data.form_of.append(AltForm(word=base_word))