Coverage for src/wiktextract/extractor/fr/page.py: 88%
144 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 LevelNode,
6 NodeKind,
7 TemplateNode,
8 WikiNode,
9)
11from ...page import clean_node
12from ...wxr_context import WiktextractContext
13from ...wxr_logging import logger
14from .etymology import (
15 EtymologyData,
16 extract_etymology,
17 extract_etymology_examples,
18 insert_etymology_data,
19)
20from .form_line import extract_form_line
21from .gloss import extract_gloss, process_exemple_template
22from .inflection import extract_inflection
23from .linkage import extract_linkage
24from .models import Sense, WordEntry
25from .note import extract_note, extract_recognition_rate_section
26from .pronunciation import extract_homophone_section, extract_pronunciation
27from .section_types import (
28 ETYMOLOGY_SECTIONS,
29 IGNORED_SECTIONS,
30 INFLECTION_SECTIONS,
31 LINKAGE_SECTIONS,
32 NOTES_SECTIONS,
33 POS_SECTIONS,
34 PRONUNCIATION_SECTIONS,
35 TRANSLATION_SECTIONS,
36)
37from .translation import extract_translation_section
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: LevelNode,
) -> EtymologyData | None:
    """Dispatch a section heading to the extractor matching its type.

    The section type is read from the first parameter of the `S` template
    found in the heading of `level_node`. After the heading has been
    handled, category links below the section are collected and every
    nested section is parsed recursively.

    Returns the value produced by `extract_etymology` for this node's own
    heading, or `None` when no etymology was extracted here. Values returned
    by the recursive calls for nested sections are not propagated.
    """
    etymology_data = None
    for title_template in level_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        # French Wiktionary uses a `S` template for all subtitles, so the
        # subtitle type can be found by only checking the template parameter.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        section_arg = title_template.template_parameters.get(1, "")
        if not isinstance(section_arg, str):
            continue
        section_type = section_arg.strip().lower()
        title_categories = {}
        subtitle = clean_node(wxr, title_categories, level_node.largs)
        wxr.wtp.start_subsection(subtitle)
        # Sections that appear before any POS section has created an entry
        # are attached to the shared base data instead.
        target_entries = page_data if page_data else [base_data]

        if section_type in IGNORED_SECTIONS:
            pass
        # POS parameters:
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
        elif section_type in POS_SECTIONS:
            process_pos_block(
                wxr, page_data, base_data, level_node, section_type, subtitle
            )
            if page_data:
                page_data[-1].categories.extend(
                    title_categories.get("categories", [])
                )
        elif (
            wxr.config.capture_etymologies
            and section_type in ETYMOLOGY_SECTIONS
        ):
            etymology_data = extract_etymology(wxr, level_node, base_data)
        elif (
            wxr.config.capture_pronunciation
            and section_type in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
        elif wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS:
            extract_linkage(wxr, target_entries, level_node, section_type)
        elif (
            wxr.config.capture_translations
            and section_type in TRANSLATION_SECTIONS
        ):
            extract_translation_section(wxr, target_entries, level_node)
        elif (
            wxr.config.capture_inflections
            and section_type in INFLECTION_SECTIONS
        ):
            # Recognised, but nothing is extracted from the section itself.
            pass
        elif section_type in NOTES_SECTIONS:
            extract_note(wxr, target_entries, level_node)
        elif section_type == "taux de reconnaissance":
            extract_recognition_rate_section(
                wxr, target_entries[-1], level_node
            )
        elif section_type == "attestations":
            extract_etymology_examples(wxr, level_node, base_data)
        elif section_type in ["homophones", "homo"]:
            extract_homophone_section(
                wxr,
                page_data,
                base_data,
                level_node,
                title_categories.get("categories", []),
            )
        else:
            wxr.wtp.debug(
                f"Unknown section: {section_type}",
                sortid="extractor/fr/page/parse_section/127",
            )

    find_bottom_category_links(wxr, page_data, level_node)
    for child_section in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_section)
    return etymology_data
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: LevelNode,
    pos_argument: str,
    pos_title: str,
) -> None:
    """Extract a part-of-speech section into the last entry of `page_data`.

    A deep copy of `base_data` is appended to `page_data` when the list is
    empty or when its last entry already has `pos` set; that last entry then
    receives the POS, POS title and tags looked up in `POS_SECTIONS` under
    `pos_argument`. The children of `pos_title_node` are scanned for example
    templates, inflection templates, the form line and `#` gloss lists.
    When the section has no gloss list, the text following the form line is
    used as a single gloss.
    """
    pos_data = POS_SECTIONS[pos_argument]
    if not page_data or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    page_data[-1].pos = pos_data["pos"]
    page_data[-1].pos_title = pos_title
    page_data[-1].tags.extend(pos_data.get("tags", []))

    for title_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        if title_template.template_parameters.get(3) == "flexion":
            page_data[-1].tags.append("form-of")
        # Expand the heading template to read the `id` attribute of the
        # first <span> it produces.
        expanded_s = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(title_template), expand_all=True
        )
        for span_tag in expanded_s.find_html("span"):
            page_data[-1].pos_id = span_tag.attrs.get("id", "")
            break

    child_nodes = list(pos_title_node.filter_empty_str_child())
    form_line_start = 0  # index of the node that starts the form line
    level_node_index = len(child_nodes)
    gloss_start = len(child_nodes)
    lang_code = page_data[-1].lang_code
    has_gloss_list = False
    is_first_bold = True
    for index, child in enumerate(child_nodes):
        if not isinstance(child, WikiNode):
            continue
        if child.kind == NodeKind.TEMPLATE:
            template_name = child.template_name
            if template_name.endswith("-exemple") and page_data[-1].senses:
                # zh-exemple and ja-exemple expand to a list, thus they are
                # not children of a gloss list item.
                process_exemple_template(wxr, child, page_data[-1].senses[-1])
            elif template_name.startswith(("zh-mot", "ja-mot")):
                # skip form line templates
                form_line_start = index
            elif template_name.startswith((f"{lang_code}-", "flex-ku-")):
                extract_inflection(wxr, page_data, child)
        elif child.kind == NodeKind.BOLD and is_first_bold:
            form_line_start = index
            # Only the first bold node marks the form line. Without clearing
            # the flag, a later bold node (e.g. in text after the gloss
            # list) would move the start past `gloss_start` and leave the
            # form-line slice below empty.
            is_first_bold = False
        elif child.kind == NodeKind.LIST and child.sarg.startswith("#"):
            if index < gloss_start:
                gloss_start = index
            extract_gloss(wxr, page_data, child)
            has_gloss_list = True
        elif child.kind in LEVEL_KIND_FLAGS:
            # The first nested section ends the POS block's own content.
            level_node_index = index
            break

    form_line_nodes = child_nodes[form_line_start:gloss_start]
    extract_form_line(wxr, page_data, form_line_nodes)
    if not has_gloss_list:
        # No `#` list: treat the text between the form line and the first
        # nested section as the only gloss.
        gloss_text = clean_node(
            wxr, None, child_nodes[form_line_start + 1 : level_node_index]
        )
        if gloss_text != "":
            page_data[-1].senses.append(Sense(glosses=[gloss_text]))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one French Wiktionary page into a list of dumped word entries.

    Each level-2 heading carrying a `langue` template starts a language
    section; its level-3 children are handed to `parse_section`. The last
    non-`None` etymology value returned for a language section is passed to
    `insert_etymology_data`. Entries left without senses receive a
    placeholder sense tagged `no-gloss`.
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for lang_node in root.find_child(NodeKind.LEVEL2):
        for lang_template in lang_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if lang_template.template_name != "langue":
                continue
            lang_code = lang_template.template_parameters.get(1)
            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue

            categories = {}
            lang_name = clean_node(wxr, categories, lang_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=page_title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=categories.get("categories", []),
            )

            # Keep the most recent etymology value found in this language.
            etymology_data: EtymologyData | None = None
            for level3_node in lang_node.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, level3_node
                )
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
264def find_bottom_category_links(
265 wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
266) -> None:
267 if len(page_data) == 0:
268 return
269 categories = {}
270 for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
271 if isinstance(node, TemplateNode) and node.template_name.endswith( 271 ↛ 274line 271 didn't jump to line 274 because the condition on line 271 was never true
272 " entrée"
273 ):
274 clean_node(wxr, categories, node)
275 elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
276 clean_node(wxr, categories, node)
278 for data in page_data:
279 if data.lang_code == page_data[-1].lang_code:
280 data.categories.extend(categories.get("categories", []))