Coverage for src/wiktextract/extractor/fr/etymology.py: 96%

100 statements  

coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

  1  from collections import defaultdict
  2  from dataclasses import dataclass, field
  3  from typing import Optional
  4
  5  from wikitextprocessor.parser import (
  6      LEVEL_KIND_FLAGS,
  7      LevelNode,
  8      NodeKind,
  9      TemplateNode,
 10      WikiNode,
 11  )
 12
 13  from ...page import clean_node
 14  from ...wxr_context import WiktextractContext
 15  from .models import WordEntry
 16
 17
 18  @dataclass
 19  class EtymologyData:
 20      texts: list[str] = field(default_factory=list)
 21      categories: list[str] = field(default_factory=list)
 22
 23
 24  EtymologyDict = dict[tuple[str, str], EtymologyData]
 25
 26
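As orientation (not part of the measured file), a hedged sketch of the mapping these two definitions describe; the concrete key and text values below are invented for illustration. Keys are (POS id, POS title) pairs, and ("", "") is the page-wide fallback used when no POS anchor is found:

    # Illustrative values only.
    example_dict: EtymologyDict = {
        ("nom-1", "Nom commun"): EtymologyData(texts=["Du latin …"]),
        ("", ""): EtymologyData(texts=["Étymologie partagée par toute la page."]),
    }
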

 27  def extract_etymology(
 28      wxr: WiktextractContext, level_node: LevelNode, base_data: WordEntry
 29  ) -> EtymologyDict:
 30      etymology_dict: EtymologyDict = defaultdict(EtymologyData)
 31      level_node_index = len(level_node.children)
 32      pos_id = ""
 33      pos_title = ""
 34      for node_index, node in level_node.find_child(
 35          NodeKind.LIST | LEVEL_KIND_FLAGS, True
 36      ):
 37          if node.kind in LEVEL_KIND_FLAGS:
 38              level_node_index = node_index
 39              title_text = clean_node(wxr, None, node.largs)
 40              if title_text == "Attestations historiques":
 41                  extract_etymology_examples(wxr, node, base_data)
 42          elif node.kind == NodeKind.LIST:
             ↛ line 42 didn't jump to line 34 because the condition on line 42 was always true
 43              for etymology_item in node.find_child(NodeKind.LIST_ITEM):
 44                  etymology_data = find_pos_in_etymology_list(wxr, etymology_item)
 45                  if etymology_data is not None:
 46                      pos_id, pos_title, etymology_text, categories = (
 47                          etymology_data
 48                      )
 49                      if len(etymology_text) > 0:
 50                          etymology_dict[(pos_id, pos_title)].texts.append(
 51                              etymology_text
 52                          )
 53                          etymology_dict[(pos_id, pos_title)].categories.extend(
 54                              categories
 55                          )
 56                  else:
 57                      categories = {}
 58                      etymology_text = clean_node(
 59                          wxr, categories, etymology_item.children
 60                      )
 61                      if len(etymology_text) > 0:
                     ↛ line 61 didn't jump to line 43 because the condition on line 61 was always true
 62                          etymology_dict[(pos_id, pos_title)].texts.append(
 63                              etymology_text
 64                          )
 65                          etymology_dict[(pos_id, pos_title)].categories.extend(
 66                              categories.get("categories", [])
 67                          )
 68
 69      if len(etymology_dict) == 0:
 70          categories = {}
 71          etymology_text = clean_node(
 72              wxr, categories, level_node.children[:level_node_index]
 73          )
 74          if len(etymology_text) > 0:
 75              etymology_dict[("", "")].texts.append(etymology_text)
 76              etymology_dict[(pos_id, pos_title)].categories.extend(
 77                  categories.get("categories", [])
 78              )
 79
 80      if ("", "") in etymology_dict and etymology_dict.get(("", "")).texts == [
 81          " "
 82      ]:
 83          # remove "ébauche-étym" template placeholder
 84          del etymology_dict[("", "")]
 85
 86      return etymology_dict
 87
 88

 89  def find_pos_in_etymology_list(
 90      wxr: WiktextractContext, list_item_node: WikiNode
 91  ) -> Optional[tuple[str, str, str, list[str]]]:
 92      """
 93      Return a tuple of (POS id, POS title, etymology text, categories) if the
 94      passed list item node starts with an italic POS node or a POS template;
 95      otherwise return `None`.
 96      """
 97      for template_node in list_item_node.find_child(NodeKind.TEMPLATE):
 98          if template_node.template_name == "ébauche-étym":
 99              return ("", "", " ", [])  # missing etymology
100
101      categories = {}
102
103      for index, node in list_item_node.find_child(
104          NodeKind.TEMPLATE | NodeKind.LINK | NodeKind.ITALIC, True
105      ):
106          if isinstance(node, TemplateNode) and node.template_name in (
107              "lien-ancre-étym",
108              "laé",
109          ):
110              expanded_template = wxr.wtp.parse(
111                  wxr.wtp.node_to_wikitext(node), expand_all=True
112              )
113              for italic_node in expanded_template.find_child(NodeKind.ITALIC):
                 ↛ line 113 didn't jump to line 103 because the loop on line 113 didn't complete
114                  for link_node in italic_node.find_child(NodeKind.LINK):
                     ↛ line 114 didn't jump to line 113 because the loop on line 114 didn't complete
115                      if isinstance(
                         ↛ line 115 didn't jump to line 114 because the condition on line 115 was always true
116                          link_node.largs[0][0], str
117                      ) and link_node.largs[0][0].startswith("#"):
118                          pos_id = link_node.largs[0][0].removeprefix("#")
119                          return (
120                              pos_id,
121                              clean_node(wxr, None, link_node).strip(": "),
122                              clean_node(
123                                  wxr,
124                                  categories,
125                                  list_item_node.children[index + 1 :],
126                              ),
127                              categories.get("categories", []),
128                          )
129          elif (
130              node.kind == NodeKind.LINK
131              and isinstance(node.largs[0][0], str)
132              and node.largs[0][0].startswith("#")
133          ):
134              pos_id = node.largs[0][0].removeprefix("#")
135              return (
136                  pos_id,
137                  clean_node(wxr, None, node).strip(": "),
138                  clean_node(
139                      wxr, categories, list_item_node.children[index + 1 :]
140                  ),
141                  categories.get("categories", []),
142              )
143          elif node.kind == NodeKind.ITALIC:
144              for link_node in node.find_child(NodeKind.LINK):
145                  if isinstance(link_node.largs[0][0], str) and link_node.largs[
146                      0
147                  ][0].startswith("#"):
148                      pos_id = link_node.largs[0][0].removeprefix("#")
149                      return (
150                          pos_id,
151                          clean_node(wxr, None, link_node).strip(": "),
152                          clean_node(
153                              wxr,
154                              categories,
155                              list_item_node.children[index + 1 :],
156                          ).lstrip(") "),
157                          categories.get("categories", []),
158                      )
159              italic_text = clean_node(wxr, None, node)
160              if (
161                  index <= 1  # first node is empty string
162                  and italic_text.startswith("(")
163                  and italic_text.endswith(")")
164              ):
165                  return (
166                      "",
167                      italic_text.strip("() "),
168                      clean_node(
169                          wxr,
170                          categories,
171                          list_item_node.children[index + 1 :],
172                      ),
173                      categories.get("categories", []),
174                  )
175
176
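For reference, a hedged illustration of the tuple shape returned when a list item opens with a POS anchor; the concrete values are hypothetical:

    # (POS id, POS title, etymology text, categories gathered by clean_node)
    ("nom-1", "Nom commun", "Du latin …", [])
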

177  def insert_etymology_data(
178      lang_code: str, page_data: list[WordEntry], etymology_dict: EtymologyDict
179  ) -> None:
180      """
181      Insert the etymology data extracted from the level 3 node into each sense
182      dictionary that matches the language and POS.
183      """
184      sense_dict = defaultdict(list)  # group by pos title and id
185      for sense_data in page_data:
186          if sense_data.lang_code == lang_code:
187              sense_dict[sense_data.pos_title].append(sense_data)
188              sense_dict[sense_data.pos_id].append(sense_data)
189              if sense_data.pos_id.endswith("-1"):
190                  # extra ids for the first title
191                  sense_dict[sense_data.pos_title.replace(" ", "_")].append(
192                      sense_data
193                  )
194                  sense_dict[sense_data.pos_id.removesuffix("-1")].append(
195                      sense_data
196                  )
197
198      for pos_id_title, etymology_data in etymology_dict.items():
199          if pos_id_title == ("", ""):  # add to all sense dictionaries
200              for sense_data_list in sense_dict.values():
201                  for sense_data in sense_data_list:
202                      sense_data.etymology_texts = etymology_data.texts
203                      sense_data.categories.extend(etymology_data.categories)
204          else:
205              for pos_key in pos_id_title:
206                  if pos_key in sense_dict:
207                      for sense_data in sense_dict[pos_key]:
208                          sense_data.etymology_texts = etymology_data.texts
209                          sense_data.categories.extend(etymology_data.categories)
210
211
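The "-1" branch above fans a single entry out under several keys. A hedged, self-contained sketch of that grouping, using hypothetical pos_id and pos_title values:

    pos_id, pos_title = "nom-1", "Nom commun"
    keys = {pos_title, pos_id, pos_title.replace(" ", "_"), pos_id.removesuffix("-1")}
    # keys == {"Nom commun", "nom-1", "Nom_commun", "nom"}, so an etymology keyed
    # by either the POS id or the POS title reaches this entry.
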

212  def extract_etymology_examples(
213      wxr: WiktextractContext,
214      level_node: LevelNode,
215      base_data: WordEntry,
216  ) -> None:
217      from .gloss import process_exemple_template
218
219      for list_item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
220          time = ""
221          for template_node in list_item.find_child(NodeKind.TEMPLATE):
222              if template_node.template_name == "siècle":
223                  time = clean_node(wxr, None, template_node).strip("() ")
224              elif template_node.template_name == "exemple":
                 ↛ line 224 didn't jump to line 221 because the condition on line 224 was always true
225                  example_data = process_exemple_template(
226                      wxr, template_node, base_data, time
227                  )
228                  if example_data.text != "":
                     ↛ line 228 didn't jump to line 221 because the condition on line 228 was always true
229                      base_data.etymology_examples.append(example_data)
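
Taken together, a hedged sketch of how a caller might chain these functions (extract_etymology itself dispatches any "Attestations historiques" subsection to extract_etymology_examples); the names etymology_level_node and page_data are assumptions, not the extractor's actual call site:

    etymology_dict = extract_etymology(wxr, etymology_level_node, base_data)
    insert_etymology_data(base_data.lang_code, page_data, etymology_dict)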