Coverage for src/wiktextract/extractor/fr/page.py: 88%

148 statements  

coverage.py v7.11.0, created at 2025-10-24 07:36 +0000

from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .descendant import extract_desc_section
from .etymology import (
    EtymologyData,
    extract_etymology,
    extract_etymology_examples,
    insert_etymology_data,
)
from .form_line import extract_form_line
from .gloss import extract_gloss, process_exemple_template
from .inflection import extract_inflection
from .linkage import extract_linkage
from .models import Sense, WordEntry
from .note import extract_note, extract_recognition_rate_section
from .pronunciation import extract_homophone_section, extract_pronunciation
from .section_types import (
    ETYMOLOGY_SECTIONS,
    IGNORED_SECTIONS,
    INFLECTION_SECTIONS,
    LINKAGE_SECTIONS,
    NOTES_SECTIONS,
    POS_SECTIONS,
    PRONUNCIATION_SECTIONS,
    TRANSLATION_SECTIONS,
)
from .translation import extract_translation_section


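# parse_section() reads the "S" subtitle template of a section, dispatches the
# section to the matching extractor (POS, etymology, pronunciation, linkage,
# translation, notes, ...) and recurses into its subsections.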

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    etymology_data = None
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # 49 ↛ 48: condition on line 49 was always true
            # French Wiktionary uses the `S` template for all subtitles, so the
            # subtitle type can be determined by checking only the template
            # parameter.
            # https://fr.wiktionary.org/wiki/Modèle:S
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
            first_param = level_node_template.template_parameters.get(1, "")
            if not isinstance(first_param, str):  # 55 ↛ 56: condition on line 55 was never true
                continue
            section_type = first_param.strip().lower()
            title_categories = {}
            subtitle = clean_node(wxr, title_categories, level_node.largs)
            wxr.wtp.start_subsection(subtitle)
            if section_type in IGNORED_SECTIONS:
                pass
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            elif section_type in POS_SECTIONS:
                process_pos_block(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    section_type,
                    subtitle,
                )
                if len(page_data) > 0:  # 74 ↛ 48: condition on line 74 was always true
                    page_data[-1].categories.extend(
                        title_categories.get("categories", [])
                    )
            elif (
                wxr.config.capture_etymologies
                and section_type in ETYMOLOGY_SECTIONS
            ):
                etymology_data = extract_etymology(wxr, level_node, base_data)
            elif (
                wxr.config.capture_pronunciation
                and section_type in PRONUNCIATION_SECTIONS
            ):
                extract_pronunciation(wxr, page_data, level_node, base_data)
            elif (
                wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
            ):
                extract_linkage(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                    section_type,
                )
            elif (
                wxr.config.capture_translations
                and section_type in TRANSLATION_SECTIONS
            ):
                extract_translation_section(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif (  # 106 ↛ 110: condition on line 106 was never true
                wxr.config.capture_inflections
                and section_type in INFLECTION_SECTIONS
            ):
                pass
            elif section_type in NOTES_SECTIONS:
                extract_note(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif section_type == "taux de reconnaissance":  # 117 ↛ 118: condition on line 117 was never true
                extract_recognition_rate_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            elif section_type == "attestations":
                extract_etymology_examples(wxr, level_node, base_data)
            elif section_type in ["homophones", "homo"]:
                extract_homophone_section(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    title_categories.get("categories", []),
                )
            elif section_type == "dérivés autres langues":  # 133 ↛ 140: condition on line 133 was always true
                extract_desc_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            else:
                wxr.wtp.debug(
                    f"Unknown section: {section_type}",
                    sortid="extractor/fr/page/parse_section/127",
                )

    find_bottom_category_links(wxr, page_data, level_node)
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)
    return etymology_data


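# process_pos_block() starts a new WordEntry (copied from base_data) for a
# part-of-speech section, applies the POS_SECTIONS metadata, then walks the
# section's children to extract the form line, inflection tables and glosses.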

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:  # 161 ↛ 163: condition on line 161 was always true
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_type
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # 167 ↛ 166: condition on line 167 was always true
            if level_node_template.template_parameters.get(3) == "flexion":
                page_data[-1].tags.append("form-of")
            expanded_s = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
            )
            for span_tag in expanded_s.find_html("span"):  # 173 ↛ 174: the loop on line 173 never started
                page_data[-1].pos_id = span_tag.attrs.get("id", "")
                break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # Ligne de forme (form line)
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    is_first_bold = True
    for index, child in enumerate(child_nodes):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.TEMPLATE:
                template_name = child.template_name
                if (
                    template_name.endswith("-exemple")
                    and len(page_data[-1].senses) > 0
                ):
                    # zh-exemple and ja-exemple expand to a list, so they are
                    # not children of the gloss list item.
                    process_exemple_template(
                        wxr, child, page_data[-1].senses[-1]
                    )
                elif template_name.startswith(("zh-mot", "ja-mot")):  # 196 ↛ 198: condition on line 196 was never true
                    # skip form line templates
                    form_line_start = index
                elif template_name.startswith((f"{lang_code}-", "flex-ku-")):
                    extract_inflection(wxr, page_data, child)
            elif child.kind == NodeKind.BOLD and is_first_bold:
                if index < form_line_start:  # 202 ↛ 203: condition on line 202 was never true
                    form_line_start = index
            elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
                if index < gloss_start:  # 205 ↛ 207: condition on line 205 was always true
                    gloss_start = index
                extract_gloss(wxr, page_data, child)
                has_gloss_list = True
            elif child.kind in LEVEL_KIND_FLAGS:
                level_node_index = index
                break

    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start + 1 : level_node_index]
        )
        if gloss_text != "":  # 219 ↛ exit: condition on line 219 was always true
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))


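# parse_page() is the module entry point: it walks the level-2 "langue"
# sections of a page, builds the per-language base entry, parses each level-3
# section, and returns the collected entries as plain dicts.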

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:  # 228 ↛ 229: condition on line 228 was never true
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if subtitle_template.template_name == "langue":  # 238 ↛ 235: condition on line 238 was always true
                categories = {}
                lang_code = subtitle_template.template_parameters.get(1)
                if (  # 241 ↛ 245: condition on line 241 was never true
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                lang_name = clean_node(wxr, categories, subtitle_template)
                wxr.wtp.start_section(lang_name)
                base_data = WordEntry(
                    word=page_title,
                    lang_code=lang_code,
                    lang=lang_name,
                    pos="unknown",
                    categories=categories.get("categories", []),
                )
                etymology_data: EtymologyData | None = None
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    new_etymology_data = parse_section(
                        wxr, page_data, base_data, level3_node
                    )
                    if new_etymology_data is not None:
                        etymology_data = new_etymology_data

                if etymology_data is not None:
                    insert_etymology_data(lang_code, page_data, etymology_data)

    for data in page_data:
        if len(data.senses) == 0:  # 267 ↛ 268: condition on line 267 was never true
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]


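# find_bottom_category_links() gathers category links and templates whose names
# end in " entrée" at the bottom of a section and copies the resulting
# categories to every entry in the current language.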

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    if len(page_data) == 0:
        return
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if isinstance(node, TemplateNode) and node.template_name.endswith(  # 279 ↛ 282: condition on line 279 was never true
            " entrée"
        ):
            clean_node(wxr, categories, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            clean_node(wxr, categories, node)

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))
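
For orientation only, a minimal sketch (not part of the measured module) of how the parse_page entry point above can be driven. It assumes a WiktextractContext (wxr) has already been configured for the French edition elsewhere in wiktextract; the helper name summarize_page and the page arguments are hypothetical.

# Sketch only: `wxr` is an assumed, already-initialized WiktextractContext
# (French Wiktionary configuration is not shown in this module).
from wiktextract.extractor.fr.page import parse_page

def summarize_page(wxr, page_title: str, page_text: str) -> None:
    # parse_page returns one dict per word entry (WordEntry.model_dump with
    # defaults excluded), so unset fields may be absent from the dicts.
    for entry in parse_page(wxr, page_title, page_text):
        print(entry.get("word"), entry.get("pos"), len(entry.get("senses", [])))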