Coverage for src/wiktextract/extractor/fr/page.py: 88%

144 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ...wxr_logging import logger 

14from .etymology import ( 

15 EtymologyData, 

16 extract_etymology, 

17 extract_etymology_examples, 

18 insert_etymology_data, 

19) 

20from .form_line import extract_form_line 

21from .gloss import extract_gloss, process_exemple_template 

22from .inflection import extract_inflection 

23from .linkage import extract_linkage 

24from .models import Sense, WordEntry 

25from .note import extract_note, extract_recognition_rate_section 

26from .pronunciation import extract_homophone_section, extract_pronunciation 

27from .section_types import ( 

28 ETYMOLOGY_SECTIONS, 

29 IGNORED_SECTIONS, 

30 INFLECTION_SECTIONS, 

31 LINKAGE_SECTIONS, 

32 NOTES_SECTIONS, 

33 POS_SECTIONS, 

34 PRONUNCIATION_SECTIONS, 

35 TRANSLATION_SECTIONS, 

36) 

37from .translation import extract_translation_section 

38 

39 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    """Dispatch one heading node of a French Wiktionary page to the matching
    extractor.

    The heading title is an ``{{S|...}}`` template; its first positional
    parameter names the section type (POS, etymology, pronunciation, ...).
    The function recurses into child headings at the end.

    Args:
        wxr: extraction context (config + wikitext processor).
        page_data: accumulated word entries for the page; mutated in place.
        base_data: language-level defaults copied into new entries.
        level_node: the heading (level) node to process.

    Returns:
        The ``EtymologyData`` produced by an etymology section, or ``None``
        if this section (and its direct dispatch) yielded none.
    """
    etymology_data = None
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":
            # French Wiktionary uses a `S` template for all subtitles, we could
            # find the subtitle type by only checking the template parameter.
            # https://fr.wiktionary.org/wiki/Modèle:S
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
            first_param = level_node_template.template_parameters.get(1, "")
            if not isinstance(first_param, str):
                # Parameter is wiki markup, not plain text; can't classify.
                continue
            section_type = first_param.strip().lower()
            title_categories = {}
            subtitle = clean_node(wxr, title_categories, level_node.largs)
            wxr.wtp.start_subsection(subtitle)
            if section_type in IGNORED_SECTIONS:
                pass
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            elif section_type in POS_SECTIONS:
                process_pos_block(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    section_type,
                    subtitle,
                )
                if len(page_data) > 0:
                    # Categories emitted by the heading itself belong to the
                    # entry the POS block just created/updated.
                    page_data[-1].categories.extend(
                        title_categories.get("categories", [])
                    )
            elif (
                wxr.config.capture_etymologies
                and section_type in ETYMOLOGY_SECTIONS
            ):
                etymology_data = extract_etymology(wxr, level_node, base_data)
            elif (
                wxr.config.capture_pronunciation
                and section_type in PRONUNCIATION_SECTIONS
            ):
                extract_pronunciation(wxr, page_data, level_node, base_data)
            elif (
                wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
            ):
                # Fall back to base_data when no POS entry exists yet.
                extract_linkage(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                    section_type,
                )
            elif (
                wxr.config.capture_translations
                and section_type in TRANSLATION_SECTIONS
            ):
                extract_translation_section(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif (
                wxr.config.capture_inflections
                and section_type in INFLECTION_SECTIONS
            ):
                # Inflection tables are handled from within the POS block;
                # standalone inflection sections are deliberately skipped.
                pass
            elif section_type in NOTES_SECTIONS:
                extract_note(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif section_type == "taux de reconnaissance":
                extract_recognition_rate_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            elif section_type == "attestations":
                extract_etymology_examples(wxr, level_node, base_data)
            elif section_type in ["homophones", "homo"]:
                extract_homophone_section(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    title_categories.get("categories", []),
                )
            else:
                wxr.wtp.debug(
                    f"Unknown section: {section_type}",
                    sortid="extractor/fr/page/parse_section/127",
                )

    find_bottom_category_links(wxr, page_data, level_node)
    # Recurse into nested sub-headings; their etymology results are applied
    # by the recursive call itself, so only this level's value is returned.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)
    return etymology_data

142 

143 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
    """Extract one part-of-speech section into a ``WordEntry``.

    Creates a new entry (copied from ``base_data``) when needed, records the
    POS type/title/tags, then walks the section's children to find the form
    line ("ligne de forme"), gloss lists, examples and inflection tables.

    Args:
        wxr: extraction context.
        page_data: accumulated entries; the last one is filled in here.
        base_data: language-level defaults used when a new entry is needed.
        pos_title_node: the heading node of this POS section.
        pos_argument: normalized ``{{S|...}}`` first parameter (POS key).
        pos_title: cleaned heading text used as ``pos_title``.
    """
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    # Reuse the last entry only if it has not been given a POS yet;
    # otherwise start a fresh entry from the language-level base data.
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_type
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":
            # {{S|...|...|flexion}} marks an inflected-form entry.
            if level_node_template.template_parameters.get(3) == "flexion":
                page_data[-1].tags.append("form-of")
            # Expand the heading template to recover the HTML anchor id.
            expanded_s = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
            )
            for span_tag in expanded_s.find_html("span"):
                page_data[-1].pos_id = span_tag.attrs.get("id", "")
                break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # Ligne de forme
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    is_first_bold = True
    for index, child in enumerate(child_nodes):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.TEMPLATE:
                template_name = child.template_name
                if (
                    template_name.endswith("-exemple")
                    and len(page_data[-1].senses) > 0
                ):
                    # zh-exemple and ja-exemple expand to list thus are not the
                    # child of gloss list item.
                    process_exemple_template(
                        wxr, child, page_data[-1].senses[-1]
                    )
                elif template_name.startswith(("zh-mot", "ja-mot")):
                    # skip form line templates
                    form_line_start = index
                elif template_name.startswith((f"{lang_code}-", "flex-ku-")):
                    extract_inflection(wxr, page_data, child)
            elif child.kind == NodeKind.BOLD and is_first_bold:
                # The first bold node marks the start of the form line.
                # Bug fix: the flag was never cleared, so every later bold
                # node also moved `form_line_start`; clear it so only the
                # first bold wins, as the guard intends.
                form_line_start = index
                is_first_bold = False
            elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
                if index < gloss_start:
                    gloss_start = index
                extract_gloss(wxr, page_data, child)
                has_gloss_list = True
            elif child.kind in LEVEL_KIND_FLAGS:
                # A nested heading ends this POS block's own content.
                level_node_index = index
                break

    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        # No "#" gloss list: treat the remaining plain text as a single gloss.
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start + 1 : level_node_index]
        )
        if gloss_text != "":
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))

213 

214 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a whole French Wiktionary page into word-entry dicts.

    Each level-2 heading carries a ``{{langue|code}}`` template; its
    sub-sections are dispatched through ``parse_section``.

    Args:
        wxr: extraction context.
        page_title: the word being extracted.
        page_text: raw wikitext of the page.

    Returns:
        One dict per extracted entry (Pydantic dump, defaults excluded).
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        for lang_template in lang_level.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if lang_template.template_name != "langue":
                continue
            lang_categories = {}
            lang_code = lang_template.template_parameters.get(1)
            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue  # language filtered out by configuration
            lang_name = clean_node(wxr, lang_categories, lang_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=page_title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=lang_categories.get("categories", []),
            )
            etymology_data: EtymologyData | None = None
            # Keep the most recent etymology result from the subsections.
            for section_level in lang_level.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, section_level
                )
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

262 

263 

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect category links at the bottom of a section.

    Gathers categories from plain category links and from ``* entrée``
    boilerplate templates, then appends them to every entry that shares the
    language of the most recent entry.
    """
    if not page_data:
        return
    collected = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_entry_template = isinstance(
            child, TemplateNode
        ) and child.template_name.endswith(" entrée")
        is_link = (
            isinstance(child, WikiNode) and child.kind == NodeKind.LINK
        )
        if is_entry_template or is_link:
            clean_node(wxr, collected, child)

    new_categories = collected.get("categories", [])
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(new_categories)