Coverage for src/wiktextract/extractor/fr/page.py: 86%
140 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from typing import Any
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ...wxr_logging import logger
13from .etymology import (
14 EtymologyData,
15 extract_etymology,
16 extract_etymology_examples,
17 insert_etymology_data,
18)
19from .form_line import extract_form_line
20from .gloss import extract_gloss, process_exemple_template
21from .inflection import extract_inflection
22from .linkage import extract_linkage
23from .models import Sense, WordEntry
24from .note import extract_note, extract_recognition_rate_section
25from .pronunciation import extract_pronunciation
26from .section_types import (
27 ETYMOLOGY_SECTIONS,
28 IGNORED_SECTIONS,
29 INFLECTION_SECTIONS,
30 LINKAGE_SECTIONS,
31 NOTES_SECTIONS,
32 POS_SECTIONS,
33 PRONUNCIATION_SECTIONS,
34 TRANSLATION_SECTIONS,
35)
36from .translation import extract_translation
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> EtymologyData | None:
    """Route one section heading to its extractor, then walk its subsections.

    Every `S` template in the heading of `level_node` is inspected; its first
    positional parameter (stripped, lower-cased) selects the extractor to run.
    Afterwards the category links of the section are collected and each child
    level node is processed recursively.

    Returns the value produced by `extract_etymology` for this section's own
    heading, or None when no etymology branch ran. Values returned by the
    recursive calls on child sections are not propagated.
    """
    etymology_data = None
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name != "S":
            continue
        # French Wiktionary uses a `S` template for all subtitles, so the
        # subtitle type can be found from the template parameter alone.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        raw_kind = heading_template.template_parameters.get(1, "")
        if not isinstance(raw_kind, str):
            continue
        section_kind = raw_kind.strip().lower()
        heading_categories = {}
        heading_text = clean_node(wxr, heading_categories, level_node.largs)
        wxr.wtp.start_subsection(heading_text)

        if section_kind in IGNORED_SECTIONS:
            continue
        # POS parameters:
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
        if section_kind in POS_SECTIONS:
            process_pos_block(
                wxr, page_data, base_data, level_node, section_kind, heading_text
            )
            if page_data:
                page_data[-1].categories.extend(
                    heading_categories.get("categories", [])
                )
        elif (
            wxr.config.capture_etymologies
            and section_kind in ETYMOLOGY_SECTIONS
        ):
            etymology_data = extract_etymology(wxr, level_node, base_data)
        elif (
            wxr.config.capture_pronunciation
            and section_kind in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
        elif wxr.config.capture_linkages and section_kind in LINKAGE_SECTIONS:
            # Fall back to the language-level entry when no POS entry exists.
            extract_linkage(
                wxr, page_data or [base_data], level_node, section_kind
            )
        elif (
            wxr.config.capture_translations
            and section_kind in TRANSLATION_SECTIONS
        ):
            extract_translation(
                wxr, page_data or [base_data], base_data, level_node
            )
        elif (
            wxr.config.capture_inflections
            and section_kind in INFLECTION_SECTIONS
        ):
            # Nothing is extracted here; the branch only keeps these sections
            # away from the checks below.
            pass
        elif section_kind in NOTES_SECTIONS:
            extract_note(wxr, page_data or [base_data], level_node)
        elif section_kind == "taux de reconnaissance":
            extract_recognition_rate_section(
                wxr, page_data[-1] if page_data else base_data, level_node
            )
        elif section_kind == "attestations":
            extract_etymology_examples(wxr, level_node, base_data)

    find_bottom_category_links(wxr, page_data, level_node)
    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        # The return value of the nested call is intentionally left unused,
        # matching the existing behaviour.
        parse_section(wxr, page_data, base_data, child_level_node)
    return etymology_data
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: WikiNode,
    pos_argument: str,
    pos_title: str,
):
    """Fill the current word entry from a part-of-speech section.

    A deep copy of `base_data` is appended to `page_data` when the list is
    empty or when the last entry already has its `pos` field set. The last
    entry then receives the POS, the POS title and tags looked up in
    `POS_SECTIONS[pos_argument]`. The direct children of `pos_title_node` are
    scanned for example templates, inflection templates, the form line and
    gloss lists. When no gloss list is found, the cleaned text of the nodes
    from the form-line start up to the first sub-level node becomes the gloss
    of a single sense.
    """
    pos_info = POS_SECTIONS[pos_argument]
    pos_name = pos_info["pos"]
    if not page_data or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    # Safe to alias here: nothing below receives `page_data` until the child
    # scan starts.
    entry = page_data[-1]
    entry.pos = pos_name
    entry.pos_title = pos_title
    entry.tags.extend(pos_info.get("tags", []))

    for heading_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name != "S":
            continue
        if heading_template.template_parameters.get(3) == "flexion":
            entry.tags.append("form-of")
        expanded_heading = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(heading_template), expand_all=True
        )
        # Only the first <span> of the expanded heading supplies the id.
        first_span = next(iter(expanded_heading.find_html("span")), None)
        if first_span is not None:
            entry.pos_id = first_span.attrs.get("id", "")

    children = list(pos_title_node.filter_empty_str_child())
    child_count = len(children)
    form_line_start = 0  # start of the form line ("ligne de forme")
    gloss_start = child_count
    sublevel_index = child_count
    lang_code = entry.lang_code
    found_gloss_list = False
    for position, node in enumerate(children):
        if not isinstance(node, WikiNode):
            continue
        kind = node.kind
        if kind == NodeKind.TEMPLATE:
            name = node.template_name
            if name.endswith("-exemple") and page_data[-1].senses:
                # zh-exemple and ja-exemple expand to a list, so they are not
                # children of a gloss list item.
                process_exemple_template(wxr, node, page_data[-1].senses[-1])
            elif name.startswith(("zh-mot", "ja-mot")):
                # These templates mark where the form line begins.
                form_line_start = position
            elif name.startswith(f"{lang_code}-"):
                extract_inflection(wxr, page_data, node)
        elif kind == NodeKind.BOLD and form_line_start == 0:
            form_line_start = position + 1
        elif kind == NodeKind.LIST:
            gloss_start = min(gloss_start, position)
            extract_gloss(wxr, page_data, node)
            found_gloss_list = True
        elif kind in LEVEL_KIND_FLAGS:
            sublevel_index = position
            break

    extract_form_line(wxr, page_data, children[form_line_start:gloss_start])
    if found_gloss_list:
        return
    fallback_gloss = clean_node(
        wxr, None, children[form_line_start:sublevel_index]
    )
    if fallback_gloss:
        page_data[-1].senses.append(Sense(glosses=[fallback_gloss]))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one French Wiktionary page into a list of dumped word entries.

    Each level-2 node whose heading contains a `langue` template starts a
    language section. Its level-3 children are handed to `parse_section`, and
    the last non-None etymology result is passed to `insert_etymology_data`.
    Entries that end up without senses get a single sense tagged `no-gloss`.
    The entries are returned as dicts via `model_dump(exclude_defaults=True)`.
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    root = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []
    for language_node in root.find_child(NodeKind.LEVEL2):
        for heading_template in language_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if heading_template.template_name != "langue":
                continue
            lang_code = heading_template.template_parameters.get(1)
            allowed_codes = wxr.config.capture_language_codes
            if allowed_codes is not None and lang_code not in allowed_codes:
                # Language filtered out by the configuration.
                continue
            lang_categories = {}
            lang_name = clean_node(wxr, lang_categories, heading_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=wxr.wtp.title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=lang_categories.get("categories", []),
            )
            etymology_data: EtymologyData | None = None
            for section_node in language_node.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, section_node
                )
                if section_etymology is not None:
                    # A later section's result replaces an earlier one.
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
def find_bottom_category_links(
    wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
) -> None:
    """Collect categories from a section's direct template and link children.

    Templates whose name ends with " entrée" and all link nodes are passed
    through `clean_node`, which gathers categories into a local dict. The
    collected categories are appended to every entry in `page_data` that has
    the same `lang_code` as the last entry. Does nothing when `page_data` is
    empty.
    """
    if not page_data:
        return

    collected = {}
    for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
        is_entry_template = isinstance(
            node, TemplateNode
        ) and node.template_name.endswith(" entrée")
        if is_entry_template or (
            isinstance(node, WikiNode) and node.kind == NodeKind.LINK
        ):
            clean_node(wxr, collected, node)

    current_lang_code = page_data[-1].lang_code
    found_categories = collected.get("categories", [])
    for entry in page_data:
        if entry.lang_code == current_lang_code:
            entry.categories.extend(found_categories)