Coverage for src/wiktextract/extractor/fr/page.py: 88%
148 statements
coverage.py v7.11.0, created at 2025-10-24 07:36 +0000
from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .descendant import extract_desc_section
from .etymology import (
    EtymologyData,
    extract_etymology,
    extract_etymology_examples,
    insert_etymology_data,
)
from .form_line import extract_form_line
from .gloss import extract_gloss, process_exemple_template
from .inflection import extract_inflection
from .linkage import extract_linkage
from .models import Sense, WordEntry
from .note import extract_note, extract_recognition_rate_section
from .pronunciation import extract_homophone_section, extract_pronunciation
from .section_types import (
    ETYMOLOGY_SECTIONS,
    IGNORED_SECTIONS,
    INFLECTION_SECTIONS,
    LINKAGE_SECTIONS,
    NOTES_SECTIONS,
    POS_SECTIONS,
    PRONUNCIATION_SECTIONS,
    TRANSLATION_SECTIONS,
)
from .translation import extract_translation_section
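
# Illustrative sketch only (the headings below are assumptions, not copied
# from a real entry): the kind of page layout this module walks, following
# the conventions linked in parse_page() further down.
#
#   == {{langue|fr}} ==                 level 2, one section per language
#   === {{S|étymologie}} ===            level 3, dispatched by parse_section()
#   : Du latin ...
#   === {{S|nom|fr}} ===                POS section, see process_pos_block()
#   '''chat''' {{pron|ʃa|fr}} {{m}}
#   # Petit félin domestique.
#   === {{S|traductions}} ===
#   ...
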
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    etymology_data = None
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # coverage: always true
            # French Wiktionary uses the `S` template for all subtitles, so
            # the subtitle type can be found by checking only the template
            # parameter.
            # https://fr.wiktionary.org/wiki/Modèle:S
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
            first_param = level_node_template.template_parameters.get(1, "")
            if not isinstance(first_param, str):  # coverage: never true
                continue
            section_type = first_param.strip().lower()
            title_categories = {}
            subtitle = clean_node(wxr, title_categories, level_node.largs)
            wxr.wtp.start_subsection(subtitle)
            if section_type in IGNORED_SECTIONS:
                pass
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            elif section_type in POS_SECTIONS:
                process_pos_block(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    section_type,
                    subtitle,
                )
                if len(page_data) > 0:  # coverage: always true
                    page_data[-1].categories.extend(
                        title_categories.get("categories", [])
                    )
            elif (
                wxr.config.capture_etymologies
                and section_type in ETYMOLOGY_SECTIONS
            ):
                etymology_data = extract_etymology(wxr, level_node, base_data)
            elif (
                wxr.config.capture_pronunciation
                and section_type in PRONUNCIATION_SECTIONS
            ):
                extract_pronunciation(wxr, page_data, level_node, base_data)
            elif (
                wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
            ):
                extract_linkage(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                    section_type,
                )
            elif (
                wxr.config.capture_translations
                and section_type in TRANSLATION_SECTIONS
            ):
                extract_translation_section(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif (  # coverage: never true
                wxr.config.capture_inflections
                and section_type in INFLECTION_SECTIONS
            ):
                pass
            elif section_type in NOTES_SECTIONS:
                extract_note(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif section_type == "taux de reconnaissance":  # coverage: never true
                extract_recognition_rate_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            elif section_type == "attestations":
                extract_etymology_examples(wxr, level_node, base_data)
            elif section_type in ["homophones", "homo"]:
                extract_homophone_section(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    title_categories.get("categories", []),
                )
            elif section_type == "dérivés autres langues":  # coverage: always true
                extract_desc_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            else:
                wxr.wtp.debug(
                    f"Unknown section: {section_type}",
                    sortid="extractor/fr/page/parse_section/127",
                )

    find_bottom_category_links(wxr, page_data, level_node)
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)
    return etymology_data
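
# Rough sketch of how parse_section() is exercised (assumption: `wxr`,
# `base_data` and `page_data` are already prepared, as in parse_page() below):
#
#   root = wxr.wtp.parse("=== {{S|synonymes}} ===\n* [[matou]]")
#   for level3_node in root.find_child(NodeKind.LEVEL3):
#       parse_section(wxr, page_data, base_data, level3_node)
#
# Here "synonymes" would be looked up in LINKAGE_SECTIONS (assuming it is
# listed there) and the list handed to extract_linkage().
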
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:  # coverage: always true
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_type
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # coverage: always true
            if level_node_template.template_parameters.get(3) == "flexion":
                page_data[-1].tags.append("form-of")
            expanded_s = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
            )
            for span_tag in expanded_s.find_html("span"):  # coverage: loop never ran
                page_data[-1].pos_id = span_tag.attrs.get("id", "")
                break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # "ligne de forme" (form line)
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    is_first_bold = True
    for index, child in enumerate(child_nodes):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.TEMPLATE:
                template_name = child.template_name
                if (
                    template_name.endswith("-exemple")
                    and len(page_data[-1].senses) > 0
                ):
                    # zh-exemple and ja-exemple expand to a list, so they are
                    # not children of a gloss list item.
                    process_exemple_template(
                        wxr, child, page_data[-1].senses[-1]
                    )
                elif template_name.startswith(("zh-mot", "ja-mot")):  # coverage: never true
                    # skip form line templates
                    form_line_start = index
                elif template_name.startswith((f"{lang_code}-", "flex-ku-")):
                    extract_inflection(wxr, page_data, child)
            elif child.kind == NodeKind.BOLD and is_first_bold:
                if index < form_line_start:  # coverage: never true
                    form_line_start = index
            elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
                if index < gloss_start:  # coverage: always true
                    gloss_start = index
                extract_gloss(wxr, page_data, child)
                has_gloss_list = True
            elif child.kind in LEVEL_KIND_FLAGS:
                level_node_index = index
                break

    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start + 1 : level_node_index]
        )
        if gloss_text != "":  # coverage: always true
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))
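
# Node layout process_pos_block() expects inside a POS section (illustrative,
# not taken from a real page):
#
#   === {{S|nom|fr}} ===               pos_title_node; a third parameter equal
#                                      to "flexion" adds the "form-of" tag
#   '''chat''' {{pron|ʃa|fr}} {{m}}    "ligne de forme": child_nodes
#                                      [form_line_start:gloss_start]
#   # Petit félin domestique.          "#" list handled by extract_gloss()
#   #* {{exemple|...}}
#
# When no "#" list is present, the nodes after the form line up to the next
# subsection are collapsed by clean_node() into a single gloss.
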
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    # Page structure:
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:  # coverage: never true
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if subtitle_template.template_name == "langue":  # coverage: always true
                categories = {}
                lang_code = subtitle_template.template_parameters.get(1)
                if (  # coverage: never true
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                lang_name = clean_node(wxr, categories, subtitle_template)
                wxr.wtp.start_section(lang_name)
                base_data = WordEntry(
                    word=page_title,
                    lang_code=lang_code,
                    lang=lang_name,
                    pos="unknown",
                    categories=categories.get("categories", []),
                )
                etymology_data: EtymologyData | None = None
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    new_etymology_data = parse_section(
                        wxr, page_data, base_data, level3_node
                    )
                    if new_etymology_data is not None:
                        etymology_data = new_etymology_data

                if etymology_data is not None:
                    insert_etymology_data(lang_code, page_data, etymology_data)

    for data in page_data:
        if len(data.senses) == 0:  # coverage: never true
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]
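
# Minimal usage sketch, assuming a configured WiktextractContext `wxr`
# (its construction is handled by the dump-processing pipeline and omitted
# here):
#
#   entries = parse_page(wxr, "chat", page_wikitext)
#   # -> plain dicts, one per language/POS pair, e.g. something like
#   #    {"word": "chat", "lang_code": "fr", "pos": "noun", ...}
#   #    (the exact "pos" value comes from POS_SECTIONS)
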
def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    if len(page_data) == 0:
        return
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if isinstance(node, TemplateNode) and node.template_name.endswith(  # coverage: never true
            " entrée"
        ):
            clean_node(wxr, categories, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            clean_node(wxr, categories, node)

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))
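
# find_bottom_category_links() targets trailing nodes of a section such as
# (illustrative; the template name is an assumption):
#
#   {{clé de tri|chat}}                    template not ending in " entrée",
#                                          so it is skipped
#   [[Catégorie:Animaux en français]]      plain category link, collected
#
# The collected categories are copied onto every entry sharing the language
# code of the last entry, i.e. the language section currently being parsed.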