Coverage for src/wiktextract/extractor/fr/page.py: 88% (143 statements)
coverage.py v7.9.2, created at 2025-07-11 10:26 +0000
from typing import Any

from wikitextprocessor.parser import (
    LEVEL_KIND_FLAGS,
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ...wxr_logging import logger
from .etymology import (
    EtymologyData,
    extract_etymology,
    extract_etymology_examples,
    insert_etymology_data,
)
from .form_line import extract_form_line
from .gloss import extract_gloss, process_exemple_template
from .inflection import extract_inflection
from .linkage import extract_linkage
from .models import Sense, WordEntry
from .note import extract_note, extract_recognition_rate_section
from .pronunciation import extract_homophone_section, extract_pronunciation
from .section_types import (
    ETYMOLOGY_SECTIONS,
    IGNORED_SECTIONS,
    INFLECTION_SECTIONS,
    LINKAGE_SECTIONS,
    NOTES_SECTIONS,
    POS_SECTIONS,
    PRONUNCIATION_SECTIONS,
    TRANSLATION_SECTIONS,
)
from .translation import extract_translation_section


def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
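    """Parse one ``S`` subtitle section of a French Wiktionary entry.

    The section type is read from the ``S`` template's first parameter and
    the node is dispatched to the matching extractor (POS, etymology,
    pronunciation, linkage, translation, notes, ...). Child level nodes are
    parsed recursively. Returns the etymology data extracted at this level,
    if any, so the caller can attach it to the finished entries.
    """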
    etymology_data = None
    for level_node_template in level_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # coverage: condition always true
            # French Wiktionary uses the `S` template for all subtitles, so
            # the section type can be determined from the template's first
            # parameter alone.
            # https://fr.wiktionary.org/wiki/Modèle:S
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
            first_param = level_node_template.template_parameters.get(1, "")
            if not isinstance(first_param, str):  # coverage: condition never true
                continue
            section_type = first_param.strip().lower()
            title_categories = {}
            subtitle = clean_node(wxr, title_categories, level_node.largs)
            wxr.wtp.start_subsection(subtitle)
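            # Dispatch on the section type to the matching extractor.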
            if section_type in IGNORED_SECTIONS:
                pass
            # POS parameters:
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
            elif section_type in POS_SECTIONS:
                process_pos_block(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    section_type,
                    subtitle,
                )
                if len(page_data) > 0:  # coverage: condition always true
                    page_data[-1].categories.extend(
                        title_categories.get("categories", [])
                    )
            elif (
                wxr.config.capture_etymologies
                and section_type in ETYMOLOGY_SECTIONS
            ):
                etymology_data = extract_etymology(wxr, level_node, base_data)
            elif (
                wxr.config.capture_pronunciation
                and section_type in PRONUNCIATION_SECTIONS
            ):
                extract_pronunciation(wxr, page_data, level_node, base_data)
            elif (
                wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS
            ):
                extract_linkage(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                    section_type,
                )
            elif (
                wxr.config.capture_translations
                and section_type in TRANSLATION_SECTIONS
            ):
                extract_translation_section(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif (  # coverage: condition never true
                wxr.config.capture_inflections
                and section_type in INFLECTION_SECTIONS
            ):
                pass
            elif section_type in NOTES_SECTIONS:
                extract_note(
                    wxr,
                    page_data if len(page_data) > 0 else [base_data],
                    level_node,
                )
            elif section_type == "taux de reconnaissance":  # coverage: condition never true
                extract_recognition_rate_section(
                    wxr,
                    page_data[-1] if len(page_data) > 0 else base_data,
                    level_node,
                )
            elif section_type == "attestations":
                extract_etymology_examples(wxr, level_node, base_data)
            elif section_type in ["homophones", "homo"]:  # coverage: condition always true
                extract_homophone_section(
                    wxr,
                    page_data,
                    base_data,
                    level_node,
                    title_categories.get("categories", []),
                )
            else:
                wxr.wtp.debug(
                    f"Unknown section: {section_type}",
                    sortid="extractor/fr/page/parse_section/127",
                )

    find_bottom_category_links(wxr, page_data, level_node)
    for next_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, next_level_node)
    return etymology_data


def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
):
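    """Create a ``WordEntry`` for a part-of-speech section.

    ``pos_argument`` is the first parameter of the ``S`` template; it is
    looked up in ``POS_SECTIONS`` to get the POS type and tags. The section
    body is then scanned for the form line, inflection-table templates and
    gloss lists.
    """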
    pos_data = POS_SECTIONS[pos_argument]
    pos_type = pos_data["pos"]
    if len(page_data) == 0 or "pos" in page_data[-1].model_fields_set:  # coverage: condition always true
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_type
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))
    for level_node_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if level_node_template.template_name == "S":  # coverage: condition always true
            if level_node_template.template_parameters.get(3) == "flexion":
                page_data[-1].tags.append("form-of")
            expanded_s = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(level_node_template), expand_all=True
            )
            for span_tag in expanded_s.find_html("span"):  # coverage: loop never entered
                page_data[-1].pos_id = span_tag.attrs.get("id", "")
                break
    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # "ligne de forme" (form line)
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
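    # Walk the section's children to locate the form line, gloss lists and
    # inflection templates, stopping at the first nested level node.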
    for index, child in enumerate(child_nodes):
        if isinstance(child, WikiNode):
            if child.kind == NodeKind.TEMPLATE:
                template_name = child.template_name
                if (
                    template_name.endswith("-exemple")
                    and len(page_data[-1].senses) > 0
                ):
                    # zh-exemple and ja-exemple expand to a list, so they are
                    # not children of a gloss list item.
                    process_exemple_template(
                        wxr, child, page_data[-1].senses[-1]
                    )
                elif template_name.startswith(("zh-mot", "ja-mot")):  # coverage: condition never true
                    # skip form line templates
                    form_line_start = index
                elif template_name.startswith((f"{lang_code}-", "flex-ku-")):  # coverage: condition never true
                    extract_inflection(wxr, page_data, child)
            elif child.kind == NodeKind.BOLD and form_line_start == 0:
                form_line_start = index + 1
            elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
                if index < gloss_start:  # coverage: condition always true
                    gloss_start = index
                extract_gloss(wxr, page_data, child)
                has_gloss_list = True
            elif child.kind in LEVEL_KIND_FLAGS:
                level_node_index = index
                break
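
    # Nodes between the form line start and the first gloss list make up the
    # "ligne de forme"; if no gloss list was found, the remaining text is
    # stored as a single sense.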
    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start:level_node_index]
        )
        if gloss_text != "":  # coverage: condition always true
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))


def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
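    """Parse a French Wiktionary page into a list of word entries.

    Each level-2 node is a language section marked by the ``langue``
    template; its level-3 children are handed to ``parse_section()`` and the
    resulting entries are returned as plain dicts.
    """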
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:  # coverage: condition never true
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for subtitle_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if subtitle_template.template_name == "langue":  # coverage: condition always true
                categories = {}
                lang_code = subtitle_template.template_parameters.get(1)
                if (  # coverage: condition never true
                    wxr.config.capture_language_codes is not None
                    and lang_code not in wxr.config.capture_language_codes
                ):
                    continue
                lang_name = clean_node(wxr, categories, subtitle_template)
                wxr.wtp.start_section(lang_name)
                base_data = WordEntry(
                    word=wxr.wtp.title,
                    lang_code=lang_code,
                    lang=lang_name,
                    pos="unknown",
                    categories=categories.get("categories", []),
                )
                etymology_data: EtymologyData | None = None
                for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                    new_etymology_data = parse_section(
                        wxr, page_data, base_data, level3_node
                    )
                    if new_etymology_data is not None:
                        etymology_data = new_etymology_data

                if etymology_data is not None:
                    insert_etymology_data(lang_code, page_data, etymology_data)

    for data in page_data:
        if len(data.senses) == 0:  # coverage: condition never true
            data.senses.append(Sense(tags=["no-gloss"]))
    return [m.model_dump(exclude_defaults=True) for m in page_data]


def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
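    """Collect category links at the bottom of a language section and add
    them to every entry sharing that section's language code."""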
    if len(page_data) == 0:
        return
    categories = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        if isinstance(node, TemplateNode) and node.template_name.endswith(  # coverage: condition never true
            " entrée"
        ):
            clean_node(wxr, categories, node)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            clean_node(wxr, categories, node)

    for data in page_data:
        if data.lang_code == page_data[-1].lang_code:
            data.categories.extend(categories.get("categories", []))