Coverage for src/wiktextract/extractor/fr/page.py: 88%

144 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from typing import Any 

2 

3from wikitextprocessor.parser import ( 

4 LEVEL_KIND_FLAGS, 

5 LevelNode, 

6 NodeKind, 

7 TemplateNode, 

8 WikiNode, 

9) 

10 

11from ...page import clean_node 

12from ...wxr_context import WiktextractContext 

13from ...wxr_logging import logger 

14from .etymology import ( 

15 EtymologyData, 

16 extract_etymology, 

17 extract_etymology_examples, 

18 insert_etymology_data, 

19) 

20from .form_line import extract_form_line 

21from .gloss import extract_gloss, process_exemple_template 

22from .inflection import extract_inflection 

23from .linkage import extract_linkage 

24from .models import Sense, WordEntry 

25from .note import extract_note, extract_recognition_rate_section 

26from .pronunciation import extract_homophone_section, extract_pronunciation 

27from .section_types import ( 

28 ETYMOLOGY_SECTIONS, 

29 IGNORED_SECTIONS, 

30 INFLECTION_SECTIONS, 

31 LINKAGE_SECTIONS, 

32 NOTES_SECTIONS, 

33 POS_SECTIONS, 

34 PRONUNCIATION_SECTIONS, 

35 TRANSLATION_SECTIONS, 

36) 

37from .translation import extract_translation_section 

38 

39 

def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    """Dispatch one heading node of a French Wiktionary page to the matching
    extractor.

    The heading title is an ``{{S|...}}`` template; its first positional
    parameter names the section type (POS, etymology, pronunciation, ...).
    The function recurses into child headings at the end.

    Args:
        wxr: extraction context (config + wikitext processor).
        page_data: accumulated word entries for the page; mutated in place.
        base_data: language-level defaults copied into new entries.
        level_node: the heading (level) node to process.

    Returns:
        The ``EtymologyData`` produced by an etymology section, or ``None``
        if this section (and its direct dispatch) yielded none.
    """
    etymology_data = None
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":
            # French Wiktionary uses a `S` template for all subtitles, we could
            # find the subtitle type by only checking the template parameter.
            # https://fr.wiktionary.org/wiki/Modèle:S
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
            first_param = level_node_template.template_parameters.get(1, "")
            if not isinstance(first_param, str):
                # Parameter is wiki markup, not plain text; can't classify.
                continue
            section_type = first_param.strip().lower()
            title_categories = {}
            subtitle = clean_node(wxr, title_categories, level_node.largs)
            wxr.wtp.start_subsection(subtitle)
            if section_type in IGNORED_SECTIONS:
                pass
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            elif section_type in POS_SECTIONS:
                process_pos_block(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    section_type,
                    subtitle,
                )
                if len(page_data) > 0:
                    # Categories emitted by the heading itself belong to the
                    # entry the POS block just created/updated.
                    page_data[-1].categories.extend(
                        title_categories.get("categories", [])
                    )
            elif (
                wxr.config.capture_etymologies
                and section_type in ETYMOLOGY_SECTIONS
            ):
                etymology_data = extract_etymology(wxr, level_node, base_data)
            elif (
                wxr.config.capture_pronunciation
                and section_type in PRONUNCIATION_SECTIONS
            ):
                extract_pronunciation(wxr, page_data, level_node, base_data)
            elif (
                wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
            ):
                # Fall back to base_data when no POS entry exists yet.
                extract_linkage(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                    section_type,
                )
            elif (
                wxr.config.capture_translations
                and section_type in TRANSLATION_SECTIONS
            ):
                extract_translation_section(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif (
                wxr.config.capture_inflections
                and section_type in INFLECTION_SECTIONS
            ):
                # Inflection tables are handled from within the POS block;
                # standalone inflection sections are deliberately skipped.
                pass
            elif section_type in NOTES_SECTIONS:
                extract_note(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif section_type == "taux de reconnaissance":
                extract_recognition_rate_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            elif section_type == "attestations":
                extract_etymology_examples(wxr, level_node, base_data)
            elif section_type in ["homophones", "homo"]:
                extract_homophone_section(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    title_categories.get("categories", []),
                )
            else:
                wxr.wtp.debug(
                    f"Unknown section: {section_type}",
                    sortid="extractor/fr/page/parse_section/127",
                )

    find_bottom_category_links(wxr, page_data, level_node)
    # Recurse into nested sub-headings; their etymology results are applied
    # by the recursive call itself, so only this level's value is returned.
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)
    return etymology_data

142 

143 

def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
    """Extract one part-of-speech section into a ``WordEntry``.

    Creates a new entry (copied from ``base_data``) when needed, records the
    POS type/title/tags, then walks the section's children to find the form
    line ("ligne de forme"), gloss lists, examples and inflection tables.

    Args:
        wxr: extraction context.
        page_data: accumulated entries; the last one is filled in here.
        base_data: language-level defaults used when a new entry is needed.
        pos_title_node: the heading node of this POS section.
        pos_argument: normalized ``{{S|...}}`` first parameter (POS key).
        pos_title: cleaned heading text used as ``pos_title``.
    """
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    # Reuse the last entry only if it has not been given a POS yet;
    # otherwise start a fresh entry from the language-level base data.
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_type
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":
            # {{S|...|...|flexion}} marks an inflected-form entry.
            if level_node_template.template_parameters.get(3) == "flexion":
                page_data[-1].tags.append("form-of")
            # Expand the heading template to recover the HTML anchor id.
            expanded_s = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
            )
            for span_tag in expanded_s.find_html("span"):
                page_data[-1].pos_id = span_tag.attrs.get("id", "")
                break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # Ligne de forme
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    is_first_bold = True
    for index, child in enumerate(child_nodes):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.TEMPLATE:
                template_name = child.template_name
                if (
                    template_name.endswith("-exemple")
                    and len(page_data[-1].senses) > 0
                ):
                    # zh-exemple and ja-exemple expand to list thus are not the
                    # child of gloss list item.
                    process_exemple_template(
                        wxr, child, page_data[-1].senses[-1]
                    )
                elif template_name.startswith(("zh-mot", "ja-mot")):
                    # skip form line templates
                    form_line_start = index
                elif template_name.startswith((f"{lang_code}-", "flex-ku-")):
                    extract_inflection(wxr, page_data, child)
            elif child.kind == NodeKind.BOLD and is_first_bold:
                # The first bold node marks the start of the form line.
                # Bug fix: the flag was never cleared, so every later bold
                # node also moved `form_line_start`; clear it so only the
                # first bold wins, as the guard intends.
                form_line_start = index
                is_first_bold = False
            elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
                if index < gloss_start:
                    gloss_start = index
                extract_gloss(wxr, page_data, child)
                has_gloss_list = True
            elif child.kind in LEVEL_KIND_FLAGS:
                # A nested heading ends this POS block's own content.
                level_node_index = index
                break

    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        # No "#" gloss list: treat the remaining plain text as a single gloss.
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start + 1 : level_node_index]
        )
        if gloss_text != "":
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))

213 

214 

def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse a whole French Wiktionary page into word-entry dicts.

    Each level-2 heading carries a ``{{langue|code}}`` template; its
    sub-sections are dispatched through ``parse_section``.

    Args:
        wxr: extraction context.
        page_title: the word being extracted.
        page_text: raw wikitext of the page.

    Returns:
        One dict per extracted entry (Pydantic dump, defaults excluded).
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for lang_level in root.find_child(NodeKind.LEVEL2):
        for lang_template in lang_level.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if lang_template.template_name != "langue":
                continue
            lang_categories = {}
            lang_code = lang_template.template_parameters.get(1)
            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue  # language filtered out by configuration
            lang_name = clean_node(wxr, lang_categories, lang_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=page_title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=lang_categories.get("categories", []),
            )
            etymology_data: EtymologyData | None = None
            # Keep the most recent etymology result from the subsections.
            for section_level in lang_level.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, section_level
                )
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]

262 

263 

def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect category links at the bottom of a section.

    Gathers categories from plain category links and from ``* entrée``
    boilerplate templates, then appends them to every entry that shares the
    language of the most recent entry.
    """
    if not page_data:
        return
    collected = {}
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_entry_template = isinstance(
            child, TemplateNode
        ) and child.template_name.endswith(" entrée")
        is_link = (
            isinstance(child, WikiNode) and child.kind == NodeKind.LINK
        )
        if is_entry_template or is_link:
            clean_node(wxr, collected, child)

    new_categories = collected.get("categories", [])
    current_lang = page_data[-1].lang_code
    for entry in page_data:
        if entry.lang_code == current_lang:
            entry.categories.extend(new_categories)