Coverage for src/wiktextract/extractor/fr/page.py: 84%

138 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Any, Optional 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 NodeKind, 

6 TemplateNode, 

7 WikiNode, 

8) 

9 

10from ...page import clean_node 

11from ...wxr_context import WiktextractContext 

12from ...wxr_logging import logger 

13from .etymology import EtymologyData, extract_etymology, insert_etymology_data 

14from .form_line import extract_form_line 

15from .gloss import extract_gloss, process_exemple_template 

16from .inflection import extract_inflection 

17from .linkage import extract_linkage 

18from .models import Sense, WordEntry 

19from .note import extract_note, extract_recognition_rate_section 

20from .pronunciation import extract_pronunciation 

21from .section_types import ( 

22 ETYMOLOGY_SECTIONS, 

23 IGNORED_SECTIONS, 

24 INFLECTION_SECTIONS, 

25 LINKAGE_SECTIONS, 

26 NOTES_SECTIONS, 

27 POS_SECTIONS, 

28 PRONUNCIATION_SECTIONS, 

29 TRANSLATION_SECTIONS, 

30) 

31from .translation import extract_translation 

32 

33 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> Optional[EtymologyData]:
    """Dispatch one section heading to the extractor matching its type.

    The section type is the stripped, lower-cased first parameter of each
    `S` template found in the heading of `level_node`.  Once the heading has
    been handled, category links located directly under the section are
    collected and every child section is parsed recursively.

    Extractors that attach data to existing entries receive `page_data`, or
    `[base_data]` when no entry has been created yet.

    Returns the result of the last `extract_etymology` call made for this
    heading, or None if none was made.  Values returned by the recursive
    calls on child sections are not propagated.
    """
    etymology_data = None
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name != "S":
            continue
        # French Wiktionary uses a `S` template for all subtitles, so the
        # subtitle type can be found by checking only the template parameter.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        raw_type = heading_template.template_parameters.get(1, "")
        if not isinstance(raw_type, str):
            continue
        section_type = raw_type.strip().lower()
        title_categories = {}
        subtitle = clean_node(wxr, title_categories, level_node.largs)
        wxr.wtp.start_subsection(subtitle)
        if section_type in IGNORED_SECTIONS:
            continue

        # Entries that non-POS extractors should write to.
        target_entries = page_data if page_data else [base_data]

        # POS parameters:
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
        if section_type in POS_SECTIONS:
            process_pos_block(
                wxr, page_data, base_data, level_node, section_type, subtitle
            )
            if page_data:
                heading_categories = title_categories.get("categories", [])
                page_data[-1].categories.extend(heading_categories)
        elif (
            wxr.config.capture_etymologies
            and section_type in ETYMOLOGY_SECTIONS
        ):
            etymology_data = extract_etymology(wxr, level_node, base_data)
        elif (
            wxr.config.capture_pronunciation
            and section_type in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
        elif wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS:
            extract_linkage(wxr, target_entries, level_node, section_type)
        elif (
            wxr.config.capture_translations
            and section_type in TRANSLATION_SECTIONS
        ):
            extract_translation(wxr, target_entries, base_data, level_node)
        elif (
            wxr.config.capture_inflections
            and section_type in INFLECTION_SECTIONS
        ):
            # Nothing is extracted here; the branch only keeps these
            # sections from reaching the checks below.
            pass
        elif section_type in NOTES_SECTIONS:
            extract_note(wxr, target_entries, level_node)
        elif section_type == "taux de reconnaissance":
            extract_recognition_rate_section(
                wxr, target_entries[-1], level_node
            )

    find_bottom_category_links(wxr, page_data, level_node)
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)
    return etymology_data

122 

123 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: WikiNode,
    pos_argument: str,
    pos_title: str,
):
    """Fill the current word entry from a part-of-speech section.

    A deep copy of `base_data` is appended to `page_data` when the list is
    empty or when the last entry already has `pos` set explicitly.  The
    last entry then receives the POS, its title and tags looked up in
    `POS_SECTIONS[pos_argument]`, and the children of `pos_title_node` are
    walked to extract inflections, the form line and the glosses.

    If the section contains no list node, the cleaned text between the form
    line start and the first sub-section becomes a single gloss, provided it
    is not empty.
    """
    pos_data = POS_SECTIONS[pos_argument]
    if not page_data or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    entry = page_data[-1]
    entry.pos = pos_data["pos"]
    entry.pos_title = pos_title
    entry.tags.extend(pos_data.get("tags", []))

    for heading_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name != "S":
            continue
        if heading_template.template_parameters.get(3) == "flexion":
            entry.tags.append("form-of")
        heading_wikitext = wxr.wtp.node_to_wikitext(heading_template)
        expanded_heading = wxr.wtp.parse(heading_wikitext, expand_all=True)
        # Only the first <span> of the expanded heading supplies the id.
        first_span = next(iter(expanded_heading.find_html("span")), None)
        if first_span is not None:
            entry.pos_id = first_span.attrs.get("id", "")

    child_nodes = list(pos_title_node.filter_empty_str_child())
    node_count = len(child_nodes)
    form_line_start = 0  # index where the form line begins
    section_end = node_count  # index of the first sub-section, if any
    gloss_start = node_count  # index of the first list node, if any
    inflection_prefix = f"{page_data[-1].lang_code}-"
    has_gloss_list = False
    for position, node in enumerate(child_nodes):
        if not isinstance(node, WikiNode):
            continue
        kind = node.kind
        if kind == NodeKind.TEMPLATE:
            name = node.template_name
            if name.endswith("-exemple") and len(page_data[-1].senses) > 0:
                # zh-exemple and ja-exemple expand to a list, so they are
                # not children of a gloss list item.
                process_exemple_template(wxr, node, page_data[-1].senses[-1])
            elif name.startswith(("zh-mot", "ja-mot")):
                # skip form line templates
                form_line_start = position
            elif name.startswith(inflection_prefix):
                extract_inflection(wxr, page_data, node)
        elif kind == NodeKind.BOLD and form_line_start == 0:
            form_line_start = position + 1
        elif kind == NodeKind.LIST:
            gloss_start = min(gloss_start, position)
            extract_gloss(wxr, page_data, node)
            has_gloss_list = True
        elif kind in LEVEL_KIND_FLAGS:
            section_end = position
            break

    extract_form_line(
        wxr, page_data, child_nodes[form_line_start:gloss_start]
    )
    if has_gloss_list:
        return
    # No gloss list: fall back to the plain text of the section body.
    fallback_nodes = child_nodes[form_line_start:section_end]
    gloss_text = clean_node(wxr, None, fallback_nodes)
    if gloss_text != "":
        page_data[-1].senses.append(Sense(glosses=[gloss_text]))

192 

193 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one French Wiktionary page into a list of entry dicts.

    Each level-2 section whose heading holds a `langue` template starts a
    language block; its level-3 sections are handed to `parse_section`.
    Languages not listed in `wxr.config.capture_language_codes` are skipped
    when that setting is not None.  Entries left without any sense get a
    single sense tagged "no-gloss".

    Returns the entries dumped with `model_dump(exclude_defaults=True)`.
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for language_section in root.find_child(NodeKind.LEVEL2):
        for heading_template in language_section.find_content(
            NodeKind.TEMPLATE
        ):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if heading_template.template_name != "langue":
                continue
            categories = {}
            lang_code = heading_template.template_parameters.get(1)
            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue
            lang_name = clean_node(wxr, categories, heading_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=wxr.wtp.title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=categories.get("categories", []),
            )
            # Keep the last non-None etymology returned for this language.
            etymology_data: Optional[EtymologyData] = None
            for section in language_section.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, section
                )
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

241 

242 

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect categories from direct children of a section.

    Templates whose name ends with " entrée" and link nodes that are direct
    children of `level_node` are passed through `clean_node`.  The collected
    categories are appended to every entry in `page_data` that has the same
    `lang_code` as the last entry.  Does nothing when `page_data` is empty.
    """
    if not page_data:
        return

    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_entry_template = isinstance(
            node, TemplateNode
        ) and node.template_name.endswith(" entrée")
        is_link = isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        if is_entry_template or is_link:
            clean_node(wxr, categories, node)

    current_lang_code = page_data[-1].lang_code
    found_categories = categories.get("categories", [])
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(found_categories)

258 if data.lang_code == page_data[-1].lang_code: 

259 data.categories.extend(categories.get("categories", []))