Coverage for src/wiktextract/extractor/fr/etymology.py: 95%

120 statements  

coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

from collections import defaultdict
from dataclasses import dataclass, field

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from .models import Example, WordEntry

@dataclass
class EtymologyData:
    texts: list[str] = field(default_factory=list)
    categories: list[str] = field(default_factory=list)


EtymologyDict = dict[tuple[str, str], EtymologyData]
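# Keys are (pos_id, pos_title) pairs; the empty key ("", "") holds etymology
# text that is not tied to a specific POS section and is later applied to all
# entries by insert_etymology_data().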

def extract_etymology(
    wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
) -> EtymologyDict:
    etymology_dict: EtymologyDict = defaultdict(EtymologyData)
    level_node_index = len(level_node.children)
    pos_id = ""
    pos_title = ""
    for node_index, node in level_node.find_child(
        NodeKind.LIST | LEVEL_KIND_FLAGS, True
    ):
        if node.kind in LEVEL_KIND_FLAGS and node_index < level_node_index:
            level_node_index = node_index
        elif node.kind == NodeKind.LIST:  # coverage: partial branch, condition always true in tests
            for etymology_item in node.find_child(NodeKind.LIST_ITEM):
                etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
                if etymology_data is not None:
                    pos_id, pos_title, etymology_text, categories = (
                        etymology_data
                    )
                    if len(etymology_text) > 0:
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories
                        )
                else:
                    categories = {}
                    etymology_text = clean_node(
                        wxr, categories, etymology_item.children
                    )
                    if len(etymology_text) > 0:  # coverage: partial branch, condition always true in tests
                        etymology_dict[(pos_id, pos_title)].texts.append(
                            etymology_text
                        )
                        etymology_dict[(pos_id, pos_title)].categories.extend(
                            categories.get("categories", [])
                        )

    if len(etymology_dict) == 0:
        categories = {}
        etymology_text = clean_node(
            wxr, categories, level_node.children[:level_node_index]
        )
        if len(etymology_text) > 0:  # coverage: partial branch, condition always true in tests
            etymology_dict[("", "")].texts.append(etymology_text)
            etymology_dict[(pos_id, pos_title)].categories.extend(
                categories.get("categories", [])
            )

    if ("", "") in etymology_dict and etymology_dict.get(("", "")).texts == [
        " "
    ]:
        # remove "ébauche-étym" template placeholder
        del etymology_dict[("", "")]

    return etymology_dict


def find_pos_in_etymology_list(
    wxr: WiktextractContext, list_item_node: WikiNode
) -> tuple[str, str, str, list[str]] | None:
    """
    Return a tuple of (POS id, POS title, etymology text, categories) if the
    passed list item node starts with an italic POS node or a POS template;
    otherwise return `None`.
    """
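    # The "lien-ancre-étym" / "laé" templates expand to an italic link whose
    # target is a POS section anchor ("#..."); the anchor becomes the POS id
    # and the link text becomes the POS title.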

    for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
        if template_node.template_name == "ébauche-étym":
            return ("", "", " ", [])  # missing etymology

    categories = {}

    for index, node in list_item_node.find_child(
        NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
    ):
        if isinstance(node, TemplateNode) and node.template_name in (
            "lien-ancre-étym",
            "laé",
        ):
            expanded_template = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            for italic_node in expanded_template.find_child(NodeKind.ITALIC):  # coverage: partial branch, loop never ran to completion in tests
                for link_node in italic_node.find_child(NodeKind.LINK):  # coverage: partial branch, loop never ran to completion in tests
                    if isinstance(
                        link_node.largs[0][0], str
                    ) and link_node.largs[0][0].startswith("#"):  # coverage: partial branch, condition always true in tests
                        pos_id = link_node.largs[0][0].removeprefix("#")
                        return (
                            pos_id,
                            clean_node(wxr, None, link_node).strip(": "),
                            clean_node(
                                wxr,
                                categories,
                                list_item_node.children[index + 1 :],
                            ),
                            categories.get("categories", []),
                        )
        elif (
            node.kind == NodeKind.LINK
            and isinstance(node.largs[0][0], str)
            and node.largs[0][0].startswith("#")
        ):
            pos_id = node.largs[0][0].removeprefix("#")
            return (
                pos_id,
                clean_node(wxr, None, node).strip(": "),
                clean_node(
                    wxr, categories, list_item_node.children[index + 1 :]
                ),
                categories.get("categories", []),
            )
        elif node.kind == NodeKind.ITALIC:
            for link_node in node.find_child(NodeKind.LINK):
                if isinstance(link_node.largs[0][0], str) and link_node.largs[
                    0
                ][0].startswith("#"):
                    pos_id = link_node.largs[0][0].removeprefix("#")
                    return (
                        pos_id,
                        clean_node(wxr, None, link_node).strip(": "),
                        clean_node(
                            wxr,
                            categories,
                            list_item_node.children[index + 1 :],
                        ).lstrip(") "),
                        categories.get("categories", []),
                    )
            italic_text = clean_node(wxr, None, node)
            if (
                index <= 1  # first node is empty string
                and italic_text.startswith("(")
                and italic_text.endswith(")")
            ):
                return (
                    "",
                    italic_text.strip("() "),
                    clean_node(
                        wxr,
                        categories,
                        list_item_node.children[index + 1 :],
                    ),
                    categories.get("categories", []),
                )


def insert_etymology_data(
    lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
) -> None:

    """
    Insert the etymology data extracted from the level-3 node into each sense
    dictionary that matches the language and POS.
    """

    sense_dict = defaultdict(list)  # group by pos title and id
    for sense_data in page_data:
        if sense_data.lang_code == lang_code:
            sense_dict[sense_data.pos_title].append(sense_data)
            sense_dict[sense_data.pos_id].append(sense_data)
            if sense_data.pos_id.endswith("-1"):
                # extra ids for the first title
                sense_dict[sense_data.pos_title.replace(" ", "_")].append(
                    sense_data
                )
                sense_dict[sense_data.pos_id.removesuffix("-1")].append(
                    sense_data
                )

    for pos_id_title, etymology_data in etymology_dict.items():
        if pos_id_title == ("", ""):  # add to all sense dictionaries
            for sense_data_list in sense_dict.values():
                for sense_data in sense_data_list:
                    sense_data.etymology_texts = etymology_data.texts
                    sense_data.categories.extend(etymology_data.categories)
        else:
            for pos_key in pos_id_title:
                if pos_key in sense_dict:
                    for sense_data in sense_dict[pos_key]:
                        sense_data.etymology_texts = etymology_data.texts
                        sense_data.categories.extend(etymology_data.categories)
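# The two helpers below turn dated quotation lists in the etymology section
# into Example objects stored on base_data.etymology_examples.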

def extract_etymology_examples(
    wxr: WiktextractContext,
    level_node: LevelNode,
    base_data: WordEntry,
) -> None:
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            extract_etymology_example_list_item(wxr, list_item, base_data, "")


def extract_etymology_example_list_item(
    wxr: WiktextractContext,
    list_item: WikiNode,
    base_data: WordEntry,
    note: str,
) -> None:
    from .gloss import process_exemple_template

    time = ""
    source = ""
    example_nodes = []
    has_exemple_template = False
    for node in list_item.children:
        if isinstance(node, TemplateNode):
            if node.template_name in ["siècle", "circa", "date"]:
                time = clean_node(wxr, base_data, node).strip("() ")
            elif node.template_name == "exemple":
                has_exemple_template = True
                example_data = process_exemple_template(
                    wxr, node, base_data, time
                )
                if example_data.text != "":  # coverage: partial branch, condition always true in tests
                    example_data.note = note
                    base_data.etymology_examples.append(example_data)
            elif node.template_name == "source":  # coverage: partial branch, condition always true in tests
                source = clean_node(wxr, base_data, node).strip("— ()")
            else:
                example_nodes.append(node)
        else:
            example_nodes.append(node)
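    # No {{exemple}} template found: either recurse into a nested list (the
    # text outside the nested list becomes the note) or build an Example from
    # the collected nodes.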

    if not has_exemple_template:
        if time == "" and list_item.contain_node(NodeKind.LIST):
            note = clean_node(
                wxr, base_data, list(list_item.invert_find_child(NodeKind.LIST))
            )
            for next_list in list_item.find_child(NodeKind.LIST):
                for next_list_item in next_list.find_child(NodeKind.LIST_ITEM):
                    extract_etymology_example_list_item(
                        wxr, next_list_item, base_data, note
                    )
        elif len(example_nodes) > 0:  # coverage: partial branch, condition always true in tests
            example_str = clean_node(wxr, base_data, example_nodes)
            if example_str != "":  # coverage: partial branch, condition always true in tests
                example_data = Example(
                    text=example_str, time=time, ref=source, note=note
                )
                base_data.etymology_examples.append(example_data)
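For orientation, a minimal sketch of how these helpers are typically chained by the page-level extractor; the variable names etym_level_node and page_data are illustrative assumptions, not taken from this module:

    # Hypothetical call site: collect etymology text per POS, then copy it
    # onto the matching word entries for the current language.
    etymology_dict = extract_etymology(wxr, etym_level_node, base_data)
    insert_etymology_data(base_data.lang_code, page_data, etymology_dict)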