Coverage for src/wiktextract/extractor/fr/gloss.py: 96%

131 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from collections import defaultdict 

2from typing import Optional, Union 

3 

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode 

5 

6from ...page import clean_node 

7from ...wxr_context import WiktextractContext 

8from .models import AltForm, Example, Sense, WordEntry 

9from .tags import translate_raw_tags 

10 

11 

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Optional[Sense] = None,
) -> None:
    """
    Create one `Sense` for every list item of `list_node` and append it to
    `page_data[-1].senses`.

    For each list item:

    * templates whose cleaned text is a single parenthesised group, and
      italic nodes wrapped in "(" ... ")" strings, are recorded as raw tags
      and removed from the gloss nodes;
    * nodes before the `note` template form the gloss text, nodes after it
      form `Sense.note`;
    * nested lists whose `sarg` ends with "#" are extracted recursively with
      the new sense as `parent_sense`, nested lists whose `sarg` ends with
      "*" are handed to `extract_examples`.

    If `parent_sense` is given, its glosses, tags, raw tags and topics are
    copied into each new sense first. A sense that ends up without any gloss
    gets the "no-gloss" tag.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = Sense()
        if parent_sense is not None:
            gloss_data.glosses.extend(parent_sense.glosses)
            gloss_data.tags.extend(parent_sense.tags)
            gloss_data.raw_tags.extend(parent_sense.raw_tags)
            gloss_data.topics.extend(parent_sense.topics)
        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if isinstance(gloss_node, TemplateNode):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` never returns an empty list, so filter out
                    # empty strings instead of testing the list length:
                    # a template expanding to "()" must not add an empty tag
                    gloss_data.raw_tags.extend(
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag != ""
                    )
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # without this guard `index - 1` would be -1 for the first
                # node and silently look at the *last* node of the list
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # position of the `note` template; there is deliberately no early
        # exit, so when several are present the last one is used (unchanged
        # behaviour); defaults to "no note" (= end of the node list)
        note_index = len(gloss_only_nodes)
        for index, node in enumerate(gloss_only_nodes):
            if isinstance(node, TemplateNode) and node.template_name == "note":
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        # everything after the `note` template is the note text
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) == 0:
            gloss_data.tags.append("no-gloss")

97 

98 

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """
    Add an `Example` to `gloss_data.examples` for the list items of
    `example_list_node`.

    A list item whose first non-empty child is a template with a name ending
    in "exemple" is delegated to `process_exemple_template`. For any other
    non-empty list item, the `source` template supplies the reference, a
    nested list supplies the translation, and the remaining children supply
    the example text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        if isinstance(head, TemplateNode) and head.template_name.endswith(
            "exemple"
        ):
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # nodes already consumed as reference/translation; they must not
        # be part of the example text
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if child.kind == NodeKind.LIST:
                # every list item overwrites the previous translation
                for translation_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, translation_item.children
                    )
                consumed.append(child)
            elif (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                source_text = clean_node(wxr, None, child)
                example.ref = source_text.strip("— ()")
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)

138 

139 

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Optional[Sense],
    time: str = "",
) -> Example:
    """
    Build an `Example` from an "exemple" template and return it.

    Template parameters used:

    * `1`: example text
    * `2`, falling back to `sens`: translation
    * `3`, falling back to `tr`: transcription, stored in `roman`
    * `source`: reference, stored in `ref`

    `time` is stored unchanged in the returned example. When `gloss_data` is
    a `Sense` and the example text is not empty, the example is also appended
    to `gloss_data.examples`.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters
    # Clean every parameter exactly once: feeding the already cleaned plain
    # text through `clean_node` a second time is redundant and would
    # re-interpret that text as wikitext.
    example_data = Example(
        text=clean_node(wxr, None, params.get(1, "")),
        translation=clean_node(
            wxr, None, params.get(2, params.get("sens", ""))
        ),
        roman=clean_node(wxr, None, params.get(3, params.get("tr", ""))),
        ref=clean_node(wxr, None, params.get("source", "")),
        time=time,
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # result is discarded on purpose; the call is made with `gloss_data`
        # as target, presumably so that data produced by expanding the whole
        # template (e.g. categories) is recorded on the sense - TODO confirm
        clean_node(wxr, gloss_data, node)
    return example_data

175 

176 

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    pos_type: str,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text of `gloss_nodes` and record alternative forms on
    `gloss_data`.

    Templates whose name starts with "variante " provide the `alt_of` word
    (parameter `dif`, else parameter `1`) and the "alt-of" tag. They are
    replaced by their expansion, from which `désuet` templates are removed
    and stored as raw tags. When no such word was found and `pos_type` is
    "typographic variant", the last link of the gloss is used as the
    `alt_of` word instead.
    """
    alt_word = ""
    kept_nodes = []
    for item in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        is_variant_template = isinstance(
            item, TemplateNode
        ) and item.template_name.startswith("variante ")
        if not is_variant_template:
            kept_nodes.append(item)
            continue

        alt_word = clean_node(wxr, None, item.template_parameters.get("dif", ""))
        if not alt_word:
            alt_word = clean_node(
                wxr, None, item.template_parameters.get(1, "")
            )
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))
            gloss_data.tags.append("alt-of")

        # expand only this template so that the templates it produces can
        # be inspected one by one
        template_wikitext = wxr.wtp.node_to_wikitext(item)
        expanded = wxr.wtp.parse(
            template_wikitext,
            pre_expand=True,
            additional_expand={item.template_name},
        )
        for child in expanded.children:
            if isinstance(child, TemplateNode) and (
                child.template_name == "désuet"
            ):
                cleaned_tag = clean_node(wxr, gloss_data, child)
                gloss_data.raw_tags.append(cleaned_tag.strip(" ()"))
            else:
                kept_nodes.append(child)

    if alt_word == "" and pos_type == "typographic variant":
        for wiki_node in gloss_nodes:
            if not isinstance(wiki_node, WikiNode):
                continue
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, wiki_node)
            if isinstance(wiki_node, TemplateNode):
                fully_expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
                for link in fully_expanded.find_child_recursively(
                    NodeKind.LINK
                ):
                    alt_word = clean_node(wxr, None, link)
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    # a different number of "(" and ")" means a parenthesis lost its partner,
    # drop the dangling ones at both ends of the text
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

248 

249 

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    gloss_data: Sense,
) -> None:
    """
    Append the word this sense is a form of to `gloss_data.form_of`.

    The word is taken from the last link node, or `mutation de`/`lien`
    template (first parameter), found in `gloss_nodes`. Nothing is appended
    when the cleaned word is empty.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    base_word = ""
    for item in gloss_nodes:
        if isinstance(item, WikiNode) and item.kind == NodeKind.LINK:
            base_word = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_argument = item.template_parameters.get(1, "")
            base_word = clean_node(wxr, None, first_argument)

    if not base_word:
        return
    gloss_data.form_of.append(AltForm(word=base_word))