Coverage for src/wiktextract/extractor/fr/page.py: 86%

140 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ...wxr_logging import logger 

13from .etymology import ( 

14 EtymologyData, 

15 extract_etymology, 

16 extract_etymology_examples, 

17 insert_etymology_data, 

18) 

19from .form_line import extract_form_line 

20from .gloss import extract_gloss, process_exemple_template 

21from .inflection import extract_inflection 

22from .linkage import extract_linkage 

23from .models import Sense, WordEntry 

24from .note import extract_note, extract_recognition_rate_section 

25from .pronunciation import extract_pronunciation 

26from .section_types import ( 

27 ETYMOLOGY_SECTIONS, 

28 IGNORED_SECTIONS, 

29 INFLECTION_SECTIONS, 

30 LINKAGE_SECTIONS, 

31 NOTES_SECTIONS, 

32 POS_SECTIONS, 

33 PRONUNCIATION_SECTIONS, 

34 TRANSLATION_SECTIONS, 

35) 

36from .translation import extract_translation 

37 

38 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> EtymologyData | None:
    """Dispatch one section heading to the matching extractor.

    The section type is read from the first parameter of each `S` template
    found in the heading of ``level_node``. After the heading is handled,
    bottom category links are collected and every child level node is
    parsed recursively (the result of the recursive calls is not used).

    Returns the value produced by ``extract_etymology`` when this heading
    is an etymology section and etymology capture is enabled, else ``None``.
    """
    etymology_data = None
    for title_template in level_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        # French Wiktionary uses a `S` template for all subtitles, so the
        # subtitle type can be found by only checking the template parameter.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        raw_type = title_template.template_parameters.get(1, "")
        if not isinstance(raw_type, str):
            continue
        section_type = raw_type.strip().lower()
        title_categories = {}
        subtitle = clean_node(wxr, title_categories, level_node.largs)
        wxr.wtp.start_subsection(subtitle)
        config = wxr.config
        # Extractors that need a list of entries fall back to the base data
        # when no POS entry has been created yet.
        entries = page_data if page_data else [base_data]

        if section_type in IGNORED_SECTIONS:
            pass
        # POS parameters:
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
        elif section_type in POS_SECTIONS:
            process_pos_block(
                wxr, page_data, base_data, level_node, section_type, subtitle
            )
            if page_data:
                page_data[-1].categories.extend(
                    title_categories.get("categories", [])
                )
        elif config.capture_etymologies and section_type in ETYMOLOGY_SECTIONS:
            etymology_data = extract_etymology(wxr, level_node, base_data)
        elif (
            config.capture_pronunciation
            and section_type in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
        elif config.capture_linkages and section_type in LINKAGE_SECTIONS:
            extract_linkage(wxr, entries, level_node, section_type)
        elif (
            config.capture_translations
            and section_type in TRANSLATION_SECTIONS
        ):
            extract_translation(wxr, entries, base_data, level_node)
        elif config.capture_inflections and section_type in INFLECTION_SECTIONS:
            # Inflection sections are recognised but not extracted here.
            pass
        elif section_type in NOTES_SECTIONS:
            extract_note(wxr, entries, level_node)
        elif section_type == "taux de reconnaissance":
            extract_recognition_rate_section(
                wxr,
                page_data[-1] if page_data else base_data,
                level_node,
            )
        elif section_type == "attestations":
            extract_etymology_examples(wxr, level_node, base_data)

    find_bottom_category_links(wxr, page_data, level_node)
    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level_node)
    return etymology_data

129 

130 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: WikiNode,
    pos_argument: str,
    pos_title: str,
):
    """Fill the current word entry from a part-of-speech section.

    A deep copy of ``base_data`` is appended to ``page_data`` when the list
    is empty or its last entry already has ``pos`` set. The entry then gets
    its POS, title and tags from ``POS_SECTIONS[pos_argument]``. The section
    children are scanned for example templates, inflection templates, the
    form line and gloss lists. If no gloss list is found, the cleaned text
    between the form line start and the first sub-level node is stored as
    a single gloss.
    """
    pos_info = POS_SECTIONS[pos_argument]
    pos_type = pos_info["pos"]
    if not page_data or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))

    # `page_data` is not handed to any callee before the child scan below,
    # so the last entry can be safely aliased for this part.
    entry = page_data[-1]
    entry.pos = pos_type
    entry.pos_title = pos_title
    entry.tags.extend(pos_info.get("tags", []))
    for title_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        if title_template.template_parameters.get(3) == "flexion":
            entry.tags.append("form-of")
        expanded_title = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(title_template), expand_all=True
        )
        # Only the first <span> of the expanded template is consulted.
        first_span = next(iter(expanded_title.find_html("span")), None)
        if first_span is not None:
            entry.pos_id = first_span.attrs.get("id", "")

    children = list(pos_title_node.filter_empty_str_child())
    child_count = len(children)
    form_line_start = 0  # index where the form line ("ligne de forme") begins
    level_node_index = child_count
    gloss_start = child_count
    inflection_prefix = f"{entry.lang_code}-"
    has_gloss_list = False
    for position, node in enumerate(children):
        if not isinstance(node, WikiNode):
            continue
        kind = node.kind
        if kind == NodeKind.TEMPLATE:
            name = node.template_name
            if name.endswith("-exemple") and len(page_data[-1].senses) > 0:
                # zh-exemple and ja-exemple expand to a list, thus they are
                # not children of a gloss list item.
                process_exemple_template(wxr, node, page_data[-1].senses[-1])
            elif name.startswith(("zh-mot", "ja-mot")):
                # skip form line templates
                form_line_start = position
            elif name.startswith(inflection_prefix):
                extract_inflection(wxr, page_data, node)
        elif kind == NodeKind.BOLD and form_line_start == 0:
            form_line_start = position + 1
        elif kind == NodeKind.LIST:
            gloss_start = min(gloss_start, position)
            extract_gloss(wxr, page_data, node)
            has_gloss_list = True
        elif kind in LEVEL_KIND_FLAGS:
            level_node_index = position
            break

    extract_form_line(wxr, page_data, children[form_line_start:gloss_start])
    if has_gloss_list:
        return
    gloss_text = clean_node(
        wxr, None, children[form_line_start:level_node_index]
    )
    if gloss_text == "":
        return
    page_data[-1].senses.append(Sense(glosses=[gloss_text]))

199 

200 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one French Wiktionary page into a list of entry dicts.

    Each level-2 node with a `langue` template starts a language section.
    Its level-3 children are passed to ``parse_section``, and the last
    non-``None`` etymology result is inserted into the page data. Entries
    left without senses get a single sense tagged ``no-gloss``.

    Page structure:
    https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if lang_template.template_name != "langue":
                continue
            categories = {}
            lang_code = lang_template.template_parameters.get(1)
            # Skip languages excluded by the configured code filter.
            if (
                wxr.config.capture_language_codes is not None
                and lang_code not in wxr.config.capture_language_codes
            ):
                continue
            lang_name = clean_node(wxr, categories, lang_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=wxr.wtp.title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=categories.get("categories", []),
            )

            # Keep the most recent etymology found in this language section.
            etymology_data: EtymologyData | None = None
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, level3_node
                )
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

248 

249 

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Add categories found directly under ``level_node`` to the entries.

    Categories are gathered by running ``clean_node`` on direct child
    templates whose name ends with " entrée" and on direct child links.
    They are appended to every entry in ``page_data`` that shares the
    language code of the last entry. Does nothing if ``page_data`` is empty.
    """
    if not page_data:
        return

    collected = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_entry_template = isinstance(
            child, TemplateNode
        ) and child.template_name.endswith(" entrée")
        if is_entry_template or (
            isinstance(child, WikiNode) and child.kind == NodeKind.LINK
        ):
            clean_node(wxr, collected, child)

    current_lang_code = page_data[-1].lang_code
    new_categories = collected.get("categories", [])
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(new_categories)