Coverage for src/wiktextract/extractor/fr/page.py: 88%

143 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-11 10:26 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ...wxr_logging import logger 

14from .etymology import ( 

15 EtymologyData, 

16 extract_etymology, 

17 extract_etymology_examples, 

18 insert_etymology_data, 

19) 

20from .form_line import extract_form_line 

21from .gloss import extract_gloss, process_exemple_template 

22from .inflection import extract_inflection 

23from .linkage import extract_linkage 

24from .models import Sense, WordEntry 

25from .note import extract_note, extract_recognition_rate_section 

26from .pronunciation import extract_homophone_section, extract_pronunciation 

27from .section_types import ( 

28 ETYMOLOGY_SECTIONS, 

29 IGNORED_SECTIONS, 

30 INFLECTION_SECTIONS, 

31 LINKAGE_SECTIONS, 

32 NOTES_SECTIONS, 

33 POS_SECTIONS, 

34 PRONUNCIATION_SECTIONS, 

35 TRANSLATION_SECTIONS, 

36) 

37from .translation import extract_translation_section 

38 

39 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    """Dispatch a section node to the extractor matching its type.

    French Wiktionary titles every subsection with the `S` template, so
    the section type is read from that template's first positional
    parameter. Returns etymology data when this section held an
    etymology (so the caller can attach it to finished entries),
    otherwise None.
    """
    etymology_data = None
    for title_template in level_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        # The `S` template carries the section type as its first
        # positional parameter.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        first_arg = title_template.template_parameters.get(1, "")
        if not isinstance(first_arg, str):
            continue
        section_type = first_arg.strip().lower()
        title_cats = {}
        subtitle = clean_node(wxr, title_cats, level_node.largs)
        wxr.wtp.start_subsection(subtitle)
        if section_type in IGNORED_SECTIONS:
            pass
        elif section_type in POS_SECTIONS:
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            process_pos_block(
                wxr, page_data, base_data, level_node, section_type, subtitle
            )
            if len(page_data) > 0:
                page_data[-1].categories.extend(
                    title_cats.get("categories", [])
                )
        elif (
            wxr.config.capture_etymologies
            and section_type in ETYMOLOGY_SECTIONS
        ):
            etymology_data = extract_etymology(wxr, level_node, base_data)
        elif (
            wxr.config.capture_pronunciation
            and section_type in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
        elif wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS:
            extract_linkage(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
                section_type,
            )
        elif (
            wxr.config.capture_translations
            and section_type in TRANSLATION_SECTIONS
        ):
            extract_translation_section(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
            )
        elif (
            wxr.config.capture_inflections
            and section_type in INFLECTION_SECTIONS
        ):
            # Inflection tables inside POS blocks are handled elsewhere.
            pass
        elif section_type in NOTES_SECTIONS:
            extract_note(
                wxr,
                page_data if len(page_data) > 0 else [base_data],
                level_node,
            )
        elif section_type == "taux de reconnaissance":
            extract_recognition_rate_section(
                wxr,
                page_data[-1] if len(page_data) > 0 else base_data,
                level_node,
            )
        elif section_type == "attestations":
            extract_etymology_examples(wxr, level_node, base_data)
        elif section_type in ["homophones", "homo"]:
            extract_homophone_section(
                wxr,
                page_data,
                base_data,
                level_node,
                title_cats.get("categories", []),
            )
        else:
            wxr.wtp.debug(
                f"Unknown section: {section_type}",
                sortid="extractor/fr/page/parse_section/127",
            )

    find_bottom_category_links(wxr, page_data, level_node)
    # Recurse into any nested subsections.
    for sub_level in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, sub_level)
    return etymology_data

142 

143 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
    """Extract a part-of-speech section into a WordEntry.

    Splits the section children into the form line ("ligne de forme"),
    numbered gloss lists and inflection-table templates, then delegates
    each range to its extractor.
    """
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    # Start a fresh entry unless the last one has no POS assigned yet.
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    entry.pos = pos_type
    entry.pos_title = pos_title
    entry.tags.extend(pos_data.get("tags", []))
    for s_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if s_template.template_name != "S":
            continue
        if s_template.template_parameters.get(3) == "flexion":
            entry.tags.append("form-of")
        # Expand the title template to recover the HTML anchor id.
        expanded_s = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(s_template), expand_all=True
        )
        for span_tag in expanded_s.find_html("span"):
            entry.pos_id = span_tag.attrs.get("id", "")
            break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # Ligne de forme
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    for idx, node in enumerate(child_nodes):
        if not isinstance(node, WikiNode):
            continue
        if node.kind == NodeKind.TEMPLATE:
            t_name = node.template_name
            if t_name.endswith("-exemple") and len(page_data[-1].senses) > 0:
                # zh-exemple and ja-exemple expand to list thus are not the
                # child of gloss list item.
                process_exemple_template(wxr, node, page_data[-1].senses[-1])
            elif t_name.startswith(("zh-mot", "ja-mot")):
                # skip form line templates
                form_line_start = idx
            elif t_name.startswith((f"{lang_code}-", "flex-ku-")):
                extract_inflection(wxr, page_data, node)
        elif node.kind == NodeKind.BOLD and form_line_start == 0:
            form_line_start = idx + 1
        elif node.kind == NodeKind.LIST and node.sarg.startswith("#"):
            if idx < gloss_start:
                gloss_start = idx
            extract_gloss(wxr, page_data, node)
            has_gloss_list = True
        elif node.kind in LEVEL_KIND_FLAGS:
            # Stop at the first nested subsection.
            level_node_index = idx
            break

    extract_form_line(wxr, page_data, child_nodes[form_line_start:gloss_start])
    if not has_gloss_list:
        # No numbered gloss list: treat the remaining text as one gloss.
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start:level_node_index]
        )
        if gloss_text != "":
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))

212 

213 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one French Wiktionary page into word-entry dicts.

    Page structure:
    https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    """
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in root.find_child(NodeKind.LEVEL2):
        for lang_template in level2_node.find_content(NodeKind.TEMPLATE):
            # Each level-2 section is a language, titled with `langue`.
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if lang_template.template_name != "langue":
                continue
            categories = {}
            lang_code = lang_template.template_parameters.get(1)
            if (
                wxr.config.capture_language_codes is not None
                and lang_code not in wxr.config.capture_language_codes
            ):
                continue
            lang_name = clean_node(wxr, categories, lang_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=wxr.wtp.title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=categories.get("categories", []),
            )
            etymology_data: EtymologyData | None = None
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                new_etymology = parse_section(
                    wxr, page_data, base_data, level3_node
                )
                if new_etymology is not None:
                    etymology_data = new_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for data in page_data:
        if len(data.senses) == 0:
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]

261 

262 

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect bottom-of-section category links and "… entrée" templates,
    then append their categories to every entry in the last entry's
    language."""
    if len(page_data) == 0:
        return
    cat_data = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        from_entry_template = isinstance(
            node, TemplateNode
        ) and node.template_name.endswith(" entrée")
        if from_entry_template:
            clean_node(wxr, cat_data, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            clean_node(wxr, cat_data, node)

    last_lang = page_data[-1].lang_code
    for data in page_data:
        if data.lang_code == last_lang:
            data.categories.extend(cat_data.get("categories", []))