Coverage for src/wiktextract/extractor/fr/gloss.py: 96%

139 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from ..share import calculate_bold_offsets 

9from .models import AltForm, Example, Sense, WordEntry 

10from .tags import translate_raw_tags 

11 

12 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """
    Extract one sense per list item of `list_node` into `page_data[-1]`.

    For every `LIST_ITEM` child a `Sense` is built (a deep copy of
    `parent_sense` with its examples cleared when a parent is given).
    Tag-like nodes are moved to `raw_tags`, the remaining nodes become the
    gloss text and, after a "note" template, the sense note. Nested lists
    whose `sarg` ends with "#" are processed recursively as child senses,
    those ending with "*" are processed as example lists.

    `page_data` must not be empty: the last entry receives the senses.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = (
            parent_sense.model_copy(deep=True)
            if parent_sense is not None
            else Sense()
        )
        # examples belong to the parent sense only, don't inherit them
        gloss_data.examples.clear()
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # only a single parenthesized group counts as a tag template
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop empty
                    # strings explicitly instead of testing the list length
                    gloss_data.raw_tags.extend(
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    )
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` is -1 for the first node and
                # silently reads the last node of the list
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # position of the last "note" template, or the list length if none:
        # nodes before it form the gloss, nodes after it form the note
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        # glosses copied from the parent sense also count here
        if len(gloss_data.glosses) > 0:
            page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        # runs after the child senses were copied from `gloss_data`
        translate_raw_tags(gloss_data)

100 

101 

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """
    Add the examples found in `example_list_node` to `gloss_data.examples`.

    A list item that starts with a template whose name ends with "exemple"
    is handed to `process_exemple_template`. Any other non-empty item is
    turned into an `Example`: a "source" template gives the reference, a
    nested list gives the translation and the remaining nodes give the text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes consumed as reference or translation, kept out of the text
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if child.kind == NodeKind.LIST:
                # the last list item wins when there are several
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)
            elif (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                example.ref = clean_node(wxr, None, child).strip("— ()")
                consumed.append(child)

        text_nodes = [n for n in children if n not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)

141 

142 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    time: str = "",
) -> Example:
    """
    Build an `Example` from an "exemple" style template.

    Text is read from parameter 1, translation from 2 (or "sens"),
    transcription from 3 (or "tr") and the reference from "source".
    When `gloss_data` is a `Sense` and the text is not empty the example is
    appended to it; when `gloss_data` is not None the template is also
    cleaned with it as the data argument. The example is always returned.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters

    def parse_value(value):
        # re-parse the argument so bold offsets can be computed from nodes
        return wxr.wtp.parse(wxr.wtp.node_to_wikitext(value))

    text_root = parse_value(params.get(1, ""))
    text_str = clean_node(wxr, None, text_root)
    trans_root = parse_value(params.get(2, params.get("sens", "")))
    trans_str = clean_node(wxr, None, trans_root)
    roman_root = parse_value(params.get(3, params.get("tr", "")))
    roman_str = clean_node(wxr, None, roman_root)
    ref_str = clean_node(wxr, None, params.get("source", ""))

    example_data = Example(
        text=text_str,
        translation=trans_str,
        roman=roman_str,
        ref=ref_str,
        time=time,
    )
    for root, cleaned, field_name in (
        (text_root, text_str, "bold_text_offsets"),
        (trans_root, trans_str, "bold_translation_offsets"),
        (roman_root, roman_str, "bold_roman_offsets"),
    ):
        calculate_bold_offsets(wxr, root, cleaned, example_data, field_name)

    if isinstance(gloss_data, Sense) and len(example_data.text) > 0:
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded, the call is made for its effect on gloss_data
        clean_node(wxr, gloss_data, node)
    return example_data

194 

195 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text built from `gloss_nodes`.

    "variante *" templates add an alt-of form and the "alt-of" tag to
    `gloss_data`, and the "désuet" tag template they expand to is moved to
    the raw tags. "équiv-pour" templates add forms to `word_entry` and are
    left out of the text. For a "typographic variant" entry without such a
    template, the last link found in the nodes is used as the alt-of word.
    """
    from .form_line import process_equiv_pour_template

    variant_word = ""
    kept_nodes = []
    for item in gloss_nodes:
        is_template = isinstance(item, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and item.template_name.startswith("variante "):
            variant_word = clean_node(
                wxr, None, item.template_parameters.get("dif", "")
            )
            if variant_word == "":
                variant_word = clean_node(
                    wxr, None, item.template_parameters.get(1, "")
                )
            if variant_word != "":
                gloss_data.alt_of.append(AltForm(word=variant_word))
                gloss_data.tags.append("alt-of")
            # expand only this template so inner templates stay as nodes
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(item),
                pre_expand=True,
                additional_expand={item.template_name},
            )
            for child in expanded.children:
                is_obsolete_tag = (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                )
                if not is_obsolete_tag:
                    kept_nodes.append(child)
                    continue
                gloss_data.raw_tags.append(
                    clean_node(wxr, gloss_data, child).strip(" ()")
                )
        elif is_template and item.template_name == "équiv-pour":
            for form_data in process_equiv_pour_template(wxr, item, []):
                # the sense being built is not in `word_entry.senses` yet
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            kept_nodes.append(item)

    if variant_word == "" and word_entry.pos == "typographic variant":
        for wiki_node in (n for n in gloss_nodes if isinstance(n, WikiNode)):
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                variant_word = clean_node(wxr, None, wiki_node)
            search_root = wiki_node
            if isinstance(wiki_node, TemplateNode):
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
            for link in search_root.find_child_recursively(NodeKind.LINK):
                variant_word = clean_node(wxr, None, link)
        if variant_word != "":
            gloss_data.alt_of.append(AltForm(word=variant_word))

    gloss_text = re.sub(
        r"\s+\.$", ".", clean_node(wxr, gloss_data, kept_nodes)
    )
    # unbalanced parentheses: strip the stray ones from both ends
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

276 

277 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """
    Append the form-of word found in `gloss_nodes` to `gloss_data.form_of`.

    The word comes from the last link node, or from the first parameter of
    the last "mutation de" or "lien" template, whichever appears later in
    the list. Nothing is appended when the resulting word is empty.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    # https://fr.wiktionary.org/wiki/Modèle:mutation_de
    word_templates = ("mutation de", "lien")
    base_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            base_word = clean_node(wxr, None, item)
        elif (
            isinstance(item, TemplateNode)
            and item.template_name in word_templates
        ):
            base_word = clean_node(
                wxr, None, item.template_parameters.get(1, "")
            )
    if base_word != "":
        gloss_data.form_of.append(AltForm(word=base_word))