Coverage for src/wiktextract/extractor/fr/page.py: 88%
148 statements
coverage.py v7.11.3, created at 2025-11-14 12:01 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ...wxr_logging import logger
14from .descendant import extract_desc_section
15from .etymology import (
16 EtymologyData,
17 extract_etymology,
18 extract_etymology_examples,
19 insert_etymology_data,
20)
21from .form_line import extract_form_line
22from .gloss import extract_gloss, process_exemple_template
23from .inflection import extract_inflection
24from .linkage import extract_linkage
25from .models import Sense, WordEntry
26from .note import extract_note, extract_recognition_rate_section
27from .pronunciation import extract_homophone_section, extract_pronunciation
28from .section_types import (
29 ETYMOLOGY_SECTIONS,
30 IGNORED_SECTIONS,
31 INFLECTION_SECTIONS,
32 LINKAGE_SECTIONS,
33 NOTES_SECTIONS,
34 POS_SECTIONS,
35 PRONUNCIATION_SECTIONS,
36 TRANSLATION_SECTIONS,
37)
38from .translation import extract_translation_section
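# parse_section() dispatches one heading to the matching extractor: the section
# type comes from the first parameter of the heading's "S" template, child
# level nodes are parsed recursively, and any etymology data found is returned
# so parse_page() can attach it to the entries of the language section.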
41def parse_section(
42 wxr: WiktextractContext,
43 page_data: list[WordEntry],
44 base_data: WordEntry,
45 level_node: LevelNode,
46) -> EtymologyData | None:
47 etymology_data = None
48 for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
49 if level_node_template.template_name == "S":  # 49 ↛ 48: didn't jump to line 48 because the condition on line 49 was always true
50 # French Wiktionary uses the `S` template for all subtitles, so the
51 # subtitle type can be found by checking just the template parameter.
52 # https://fr.wiktionary.org/wiki/Modèle:S
53 # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
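# e.g. a part-of-speech heading such as "=== {{S|nom|fr}} ===" has "nom" as
# its first parameter (illustrative example).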
54 first_param = level_node_template.template_parameters.get(1, "")
55 if not isinstance(first_param, str):  # 55 ↛ 56: didn't jump to line 56 because the condition on line 55 was never true
56 continue
57 section_type = first_param.strip().lower()
58 title_categories = {}
59 subtitle = clean_node(wxr, title_categories, level_node.largs)
60 wxr.wtp.start_subsection(subtitle)
61 if section_type in IGNORED_SECTIONS:
62 pass
63 # POS parameters:
64 # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
65 elif section_type in POS_SECTIONS:
66 process_pos_block(
67 wxr,
68 page_data,
69 base_data,
70 level_node,
71 section_type,
72 subtitle,
73 )
74 if len(page_data) > 0:  # 74 ↛ 48: didn't jump to line 48 because the condition on line 74 was always true
75 page_data[-1].categories.extend(
76 title_categories.get("categories", [])
77 )
78 elif (
79 wxr.config.capture_etymologies
80 and section_type in ETYMOLOGY_SECTIONS
81 ):
82 etymology_data = extract_etymology(wxr, level_node, base_data)
83 elif (
84 wxr.config.capture_pronunciation
85 and section_type in PRONUNCIATION_SECTIONS
86 ):
87 extract_pronunciation(wxr, page_data, level_node, base_data)
88 elif (
89 wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
90 ):
91 extract_linkage(
92 wxr,
93 page_data if len(page_data) > 0 else [base_data],
94 level_node,
95 section_type,
96 )
97 elif (
98 wxr.config.capture_translations
99 and section_type in TRANSLATION_SECTIONS
100 ):
101 extract_translation_section(
102 wxr,
103 page_data if len(page_data) > 0 else [base_data],
104 level_node,
105 )
106 elif (  # 106 ↛ 110: didn't jump to line 110 because the condition on line 106 was never true
107 wxr.config.capture_inflections
108 and section_type in INFLECTION_SECTIONS
109 ):
110 pass
111 elif section_type in NOTES_SECTIONS:
112 extract_note(
113 wxr,
114 page_data if len(page_data) > 0 else [base_data],
115 level_node,
116 )
117 elif section_type == "taux de reconnaissance": 117 ↛ 118line 117 didn't jump to line 118 because the condition on line 117 was never true
118 extract_recognition_rate_section(
119 wxr,
120 page_data[-1] if len(page_data) > 0 else base_data,
121 level_node,
122 )
123 elif section_type == "attestations":
124 extract_etymology_examples(wxr, level_node, base_data)
125 elif section_type in ["homophones", "homo"]:
126 extract_homophone_section(
127 wxr,
128 page_data,
129 base_data,
130 level_node,
131 title_categories.get("categories", []),
132 )
133 elif section_type == "dérivés autres langues": 133 ↛ 140line 133 didn't jump to line 140 because the condition on line 133 was always true
134 extract_desc_section(
135 wxr,
136 page_data[-1] if len(page_data) > 0 else base_data,
137 level_node,
138 )
139 else:
140 wxr.wtp.debug(
141 f"Unknown section: {section_type}",
142 sortid="extractor/fr/page/parse_section/127",
143 )
145 find_bottom_category_links(wxr, page_data, level_node)
146 for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
147 parse_section(wxr, page_data, base_data, next_level_node)
148 return etymology_data
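# process_pos_block() appends a deep copy of base_data when needed, fills in
# the POS fields from POS_SECTIONS, then walks the section's children to
# extract the form line, gloss lists, example templates and inflection tables.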
151def process_pos_block(
152 wxr: WiktextractContext,
153 page_data: list[WordEntry],
154 base_data: WordEntry,
155 pos_title_node: LevelNode,
156 pos_argument: str,
157 pos_title: str,
158):
159 pos_data = POS_SECTIONS[pos_argument]
160 pos_type = pos_data["pos"]
161 if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:  # 161 ↛ 163: didn't jump to line 163 because the condition on line 161 was always true
162 page_data.append(base_data.model_copy(deep=True))
163 page_data[-1].pos = pos_type
164 page_data[-1].pos_title = pos_title
165 page_data[-1].tags.extend(pos_data.get("tags", []))
166 for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
167 if level_node_template.template_name == "S":  # 167 ↛ 166: didn't jump to line 166 because the condition on line 167 was always true
168 if level_node_template.template_parameters.get(3) == "flexion":
169 page_data[-1].tags.append("form-of")
170 expanded_s = wxr.wtp.parse(
171 wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
172 )
173 for span_tag in expanded_s.find_html("span"):  # 173 ↛ 174: didn't jump to line 174 because the loop on line 173 never started
174 page_data[-1].pos_id = span_tag.attrs.get("id", "")
175 break
176 child_nodes = list(pos_title_node.filter_empty_str_child())
177 form_line_start = 0  # form line ("ligne de forme")
178 level_node_index = len(child_nodes)
179 gloss_start = len(child_nodes)
180 lang_code = page_data[-1].lang_code
181 has_gloss_list = False
182 is_first_bold = True
183 for index, child in enumerate(child_nodes):
184 if isinstance(child, WikiNode):
185 if child.kind == NodeKind.TEMPLATE:
186 template_name = child.template_name
187 if (
188 template_name.endswith("-exemple")
189 and len(page_data[-1].senses) > 0
190 ):
191 # zh-exemple and ja-exemple expand to a list and thus are not
192 # children of the gloss list item.
193 process_exemple_template(
194 wxr, child, page_data[-1].senses[-1]
195 )
196 elif template_name.startswith(("zh-mot", "ja-mot")):  # 196 ↛ 198: didn't jump to line 198 because the condition on line 196 was never true
197 # skip form line templates
198 form_line_start = index
199 elif template_name.startswith(
200 (f"{lang_code}-", "flex-ku-", "zh-formes")
201 ):
202 extract_inflection(wxr, page_data, child)
203 elif child.kind == NodeKind.BOLD and is_first_bold:
204 if index < form_line_start:  # 204 ↛ 205: didn't jump to line 205 because the condition on line 204 was never true
205 form_line_start = index
206 elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
207 if index < gloss_start:  # 207 ↛ 209: didn't jump to line 209 because the condition on line 207 was always true
208 gloss_start = index
209 extract_gloss(wxr, page_data, child)
210 has_gloss_list = True
211 elif child.kind in LEVEL_KIND_FLAGS:
212 level_node_index = index
213 break
215 form_line_nodes = child_nodes[form_line_start:gloss_start]
216 extract_form_line(wxr, page_data, form_line_nodes)
217 if not has_gloss_list:
218 gloss_text = clean_node(
219 wxr, None, child_nodes[form_line_start + 1 : level_node_index]
220 )
221 if gloss_text != "":  # 221 ↛ exit: didn't return from function 'process_pos_block' because the condition on line 221 was always true
222 page_data[-1].senses.append(Sense(glosses=[gloss_text]))
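# parse_page() is the entry point for one page: every level-2 node is a
# language section headed by the "langue" template, a base_data entry is built
# for that language, and each level-3 section is passed to parse_section().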
225def parse_page(
226 wxr: WiktextractContext, page_title: str, page_text: str
227) -> list[dict[str, Any]]:
228 # Page structure
229 # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
230 if wxr.config.verbose:  # 230 ↛ 231: didn't jump to line 231 because the condition on line 230 was never true
231 logger.info(f"Parsing page: {page_title}")
232 wxr.config.word = page_title
233 wxr.wtp.start_page(page_title)
234 tree = wxr.wtp.parse(page_text)
235 page_data: list[WordEntry] = []
236 for level2_node in tree.find_child(NodeKind.LEVEL2):
237 for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
238 # https://fr.wiktionary.org/wiki/Modèle:langue
239 # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
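# e.g. a language section starts with a heading such as "== {{langue|fr}} =="
# (illustrative example).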
240 if subtitle_template.template_name == "langue":  # 240 ↛ 237: didn't jump to line 237 because the condition on line 240 was always true
241 categories = {}
242 lang_code = subtitle_template.template_parameters.get(1)
243 if (  # 243 ↛ 247: didn't jump to line 247 because the condition on line 243 was never true
244 wxr.config.capture_language_codes is not None
245 and lang_code not in wxr.config.capture_language_codes
246 ):
247 continue
248 lang_name = clean_node(wxr, categories, subtitle_template)
249 wxr.wtp.start_section(lang_name)
250 base_data = WordEntry(
251 word=page_title,
252 lang_code=lang_code,
253 lang=lang_name,
254 pos="unknown",
255 categories=categories.get("categories", []),
256 )
257 etymology_data: EtymologyData | None = None
258 for level3_node in level2_node.find_child(NodeKind.LEVEL3):
259 new_etymology_data = parse_section(
260 wxr, page_data, base_data, level3_node
261 )
262 if new_etymology_data is not None:
263 etymology_data = new_etymology_data
265 if etymology_data is not None:
266 insert_etymology_data(lang_code, page_data, etymology_data)
268 for data in page_data:
269 if len(data.senses) == 0:  # 269 ↛ 270: didn't jump to line 270 because the condition on line 269 was never true
270 data.senses.append(Sense(tags=["no-gloss"]))
271 return [m.model_dump(exclude_defaults=True) for m in page_data]
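# find_bottom_category_links() collects categories from the category links and
# "... entrée" templates at the bottom of a language section and adds them to
# every entry that belongs to the same language.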
274def find_bottom_category_links(
275 wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
276) -> None:
277 if len(page_data) == 0:
278 return
279 categories = {}
280 for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
281 if isinstance(node, TemplateNode) and node.template_name.endswith(  # 281 ↛ 284: didn't jump to line 284 because the condition on line 281 was never true
282 " entrée"
283 ):
284 clean_node(wxr, categories, node)
285 elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
286 clean_node(wxr, categories, node)
288 for data in page_data:
289 if data.lang_code == page_data[-1].lang_code:
290 data.categories.extend(categories.get("categories", []))