Coverage for src/wiktextract/extractor/fr/gloss.py: 96%

137 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1import re 

2from collections import defaultdict 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import AltForm, Example, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Sense | None = None,
) -> None:
    """Create one `Sense` for every list item of `list_node`.

    Each sense is appended to `page_data[-1].senses`. Nested lists whose
    marker ends with "#" are processed recursively as sub-senses, nested
    lists whose marker ends with "*" are passed to `extract_examples`.

    :param wxr: extraction context handed to `clean_node` and the helpers.
    :param page_data: word entries of the page; only the last one is used.
    :param list_node: the gloss list node whose items are processed.
    :param parent_sense: sense of the enclosing list item, if any; its
        glosses, tags, raw tags and topics are copied into each new sense.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = Sense()
        if parent_sense is not None:
            gloss_data.glosses.extend(parent_sense.glosses)
            gloss_data.tags.extend(parent_sense.tags)
            gloss_data.raw_tags.extend(parent_sense.raw_tags)
            gloss_data.topics.extend(parent_sense.topics)
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if (
                isinstance(gloss_node, TemplateNode)
                and gloss_node.template_name != "équiv-pour"
            ):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # a template that expands to a single "(...)" group is a tag
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so drop the
                    # empty strings instead of testing the list length
                    tags = [
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    ]
                    gloss_data.raw_tags.extend(tags)
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` is -1 for the first node
                # and would silently look at the last node instead
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # nodes before the "note" template form the gloss, nodes after it
        # form the note; when several "note" templates exist the last one
        # is used as the split point
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1], gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) == 0:
            gloss_data.tags.append("no-gloss")

100 

101 

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Add the examples found in `example_list_node` to `gloss_data`.

    A list item that starts with a template whose name ends with "exemple"
    is delegated to `process_exemple_template`. Any other non-empty item
    becomes an `Example` built from its nodes: a "source" template gives
    the reference, a nested list gives the translation, and everything
    else is the example text.
    """
    for example_node in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(example_node.filter_empty_str_child())
        if len(children) == 0:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example_data = Example()
        # nodes consumed as reference/translation, excluded from the text
        consumed = []
        for child in example_node.find_child(
            NodeKind.TEMPLATE | NodeKind.LIST
        ):
            if (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                ref_text = clean_node(wxr, None, child)
                example_data.ref = ref_text.strip("— ()")
                consumed.append(child)
            elif child.kind == NodeKind.LIST:
                # every item overwrites the previous one: the last item of
                # the nested list ends up as the translation
                for tr_item in child.find_child(NodeKind.LIST_ITEM):
                    example_data.translation = clean_node(
                        wxr, None, tr_item.children
                    )
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example_data.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example_data)

141 

142 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Sense | None,
    time: str = "",
) -> Example:
    """Build an `Example` from an "exemple"-style template node.

    Parameters read from the template: `1` is the text, `2` (or "sens")
    the translation, `3` (or "tr") the transcription and "source" the
    reference. Each value is cleaned exactly once with `clean_node`.

    :param wxr: extraction context handed to `clean_node`.
    :param node: the template node to read.
    :param gloss_data: sense receiving the example; when it is a `Sense`
        and the example text is not empty the example is appended to its
        `examples`. May be `None`, then nothing is attached.
    :param time: value stored unchanged in the `time` field of the example.
    :return: the created `Example`, also when it was not attached.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters
    example_data = Example(
        text=clean_node(wxr, None, params.get(1, "")),
        translation=clean_node(
            wxr, None, params.get(2, params.get("sens", ""))
        ),
        roman=clean_node(wxr, None, params.get(3, params.get("tr", ""))),
        ref=clean_node(wxr, None, params.get("source", "")),
        time=time,
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # clean the whole template with `gloss_data` as the second argument
        # so `clean_node` can record data on the sense (presumably the
        # categories added by the template); the returned text is not used
        clean_node(wxr, gloss_data, node)
    return example_data

178 

179 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    word_entry: WordEntry,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text built from `gloss_nodes`.

    While walking the nodes, "variante *" templates add an alt-of form and
    the "alt-of" tag to `gloss_data` (their "désuet" tag template becomes a
    raw tag and is removed from the text), and "équiv-pour" templates add
    forms to `word_entry` and are removed from the text as well.
    """
    from .form_line import process_equiv_pour_template

    alt_of = ""
    kept_nodes = []
    for gloss_node in gloss_nodes:
        is_template = isinstance(gloss_node, TemplateNode)
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        if is_template and gloss_node.template_name.startswith("variante "):
            params = gloss_node.template_parameters
            alt_of = clean_node(wxr, None, params.get("dif", ""))
            if alt_of == "":
                alt_of = clean_node(wxr, None, params.get(1, ""))
            if alt_of != "":
                gloss_data.alt_of.append(AltForm(word=alt_of))
                gloss_data.tags.append("alt-of")
            # expand only this template to look at what it is made of
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(gloss_node),
                pre_expand=True,
                additional_expand={gloss_node.template_name},
            )
            for child in expanded.children:
                if (
                    isinstance(child, TemplateNode)
                    and child.template_name == "désuet"
                ):
                    tag_text = clean_node(wxr, gloss_data, child)
                    gloss_data.raw_tags.append(tag_text.strip(" ()"))
                else:
                    kept_nodes.append(child)
        elif is_template and gloss_node.template_name == "équiv-pour":
            for form_data in process_equiv_pour_template(wxr, gloss_node, []):
                # the sense is appended to `word_entry.senses` by the
                # caller after this function returns, hence the + 1
                form_data.sense_index = len(word_entry.senses) + 1
                word_entry.forms.append(form_data)
        else:
            kept_nodes.append(gloss_node)

    if alt_of == "" and word_entry.pos == "typographic variant":
        for candidate in gloss_nodes:
            if not isinstance(candidate, WikiNode):
                continue
            # use the last link
            if candidate.kind == NodeKind.LINK:
                alt_of = clean_node(wxr, None, candidate)
            search_root = candidate
            if isinstance(candidate, TemplateNode):
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(candidate), expand_all=True
                )
            for link in search_root.find_child_recursively(NodeKind.LINK):
                alt_of = clean_node(wxr, None, link)
        if alt_of != "":
            gloss_data.alt_of.append(AltForm(word=alt_of))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    gloss_text = re.sub(r"\s+\.$", ".", gloss_text)
    # unbalanced parentheses are usually leftovers of a removed tag: trim
    # spaces and parentheses from both ends of the text
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

260 

261 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[str | WikiNode],
    gloss_data: Sense,
) -> None:
    """Append the word the gloss refers to, if any, to `gloss_data.form_of`.

    The word comes from link nodes and from the first parameter of the
    "mutation de" and "lien" templates; the last match in `gloss_nodes`
    is the one that is kept.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target_word = ""
    for gloss_node in gloss_nodes:
        if (
            isinstance(gloss_node, WikiNode)
            and gloss_node.kind == NodeKind.LINK
        ):
            target_word = clean_node(wxr, None, gloss_node)
            continue
        if not isinstance(gloss_node, TemplateNode):
            continue
        if gloss_node.template_name in ("mutation de", "lien"):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_param = gloss_node.template_parameters.get(1, "")
            target_word = clean_node(wxr, None, first_param)

    if target_word != "":
        gloss_data.form_of.append(AltForm(word=target_word))