Coverage for src/wiktextract/extractor/fr/page.py: 84%
138 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from typing import Any, Optional
3from wikitextprocessor.parser import (
4 LEVEL_KIND_FLAGS,
5 NodeKind,
6 TemplateNode,
7 WikiNode,
8)
10from ...page import clean_node
11from ...wxr_context import WiktextractContext
12from ...wxr_logging import logger
13from .etymology import EtymologyData, extract_etymology, insert_etymology_data
14from .form_line import extract_form_line
15from .gloss import extract_gloss, process_exemple_template
16from .inflection import extract_inflection
17from .linkage import extract_linkage
18from .models import Sense, WordEntry
19from .note import extract_note, extract_recognition_rate_section
20from .pronunciation import extract_pronunciation
21from .section_types import (
22 ETYMOLOGY_SECTIONS,
23 IGNORED_SECTIONS,
24 INFLECTION_SECTIONS,
25 LINKAGE_SECTIONS,
26 NOTES_SECTIONS,
27 POS_SECTIONS,
28 PRONUNCIATION_SECTIONS,
29 TRANSLATION_SECTIONS,
30)
31from .translation import extract_translation
def parse_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    level_node: WikiNode,
) -> Optional[EtymologyData]:
    """Dispatch one section heading to the matching extractor.

    The section type is read from the first argument of each `S` template
    found in the heading of `level_node`. After the heading templates are
    handled, bottom category links are collected and every child level node
    is parsed recursively.

    Returns the value produced by `extract_etymology` for this node's own
    heading (when etymology capture is enabled and the heading is an
    etymology section), otherwise `None`. Values returned by the recursive
    calls on child level nodes are not propagated.
    """
    etymology_data = None
    for heading_template in level_node.find_content(NodeKind.TEMPLATE):
        if heading_template.template_name != "S":
            continue
        # French Wiktionary uses a `S` template for all subtitles, we could
        # find the subtitle type by only checking the template parameter.
        # https://fr.wiktionary.org/wiki/Modèle:S
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections
        raw_type = heading_template.template_parameters.get(1, "")
        if not isinstance(raw_type, str):
            continue
        section_type = raw_type.strip().lower()
        title_categories = {}
        subtitle = clean_node(wxr, title_categories, level_node.largs)
        wxr.wtp.start_subsection(subtitle)

        if section_type in IGNORED_SECTIONS:
            continue

        # POS parameters:
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_sections_de_types_de_mots
        if section_type in POS_SECTIONS:
            process_pos_block(
                wxr, page_data, base_data, level_node, section_type, subtitle
            )
            if page_data:
                heading_categories = title_categories.get("categories", [])
                page_data[-1].categories.extend(heading_categories)
            continue

        if wxr.config.capture_etymologies and section_type in ETYMOLOGY_SECTIONS:
            etymology_data = extract_etymology(wxr, level_node, base_data)
            continue

        if (
            wxr.config.capture_pronunciation
            and section_type in PRONUNCIATION_SECTIONS
        ):
            extract_pronunciation(wxr, page_data, level_node, base_data)
            continue

        if wxr.config.capture_linkages and section_type in LINKAGE_SECTIONS:
            # Fall back to the base entry when no POS entry exists yet.
            extract_linkage(
                wxr, page_data or [base_data], level_node, section_type
            )
            continue

        if (
            wxr.config.capture_translations
            and section_type in TRANSLATION_SECTIONS
        ):
            extract_translation(
                wxr, page_data or [base_data], base_data, level_node
            )
            continue

        if wxr.config.capture_inflections and section_type in INFLECTION_SECTIONS:
            # Inflection sections are recognised but nothing is extracted.
            continue

        if section_type in NOTES_SECTIONS:
            extract_note(wxr, page_data or [base_data], level_node)
            continue

        if section_type == "taux de reconnaissance":
            target_entry = page_data[-1] if page_data else base_data
            extract_recognition_rate_section(wxr, target_entry, level_node)

    find_bottom_category_links(wxr, page_data, level_node)
    for child_level_node in level_node.find_child(LEVEL_KIND_FLAGS):
        parse_section(wxr, page_data, base_data, child_level_node)
    return etymology_data
def process_pos_block(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    base_data: WordEntry,
    pos_title_node: WikiNode,
    pos_argument: str,
    pos_title: str,
):
    """Fill the current word entry from a part-of-speech section.

    A deep copy of `base_data` is appended to `page_data` when the list is
    empty or when the last entry already has `pos` set. The last entry then
    receives the POS value, title and tags looked up in `POS_SECTIONS`
    under `pos_argument`. The children of `pos_title_node` are scanned for
    example templates, inflection templates, the form line and gloss lists;
    scanning stops at the first child level node. If no gloss list is
    found, the cleaned text from the form line start up to that level node
    is stored as a single gloss when it is non-empty.
    """
    pos_info = POS_SECTIONS[pos_argument]
    pos_value = pos_info["pos"]
    if not page_data or "pos" in page_data[-1].model_fields_set:
        page_data.append(base_data.model_copy(deep=True))
    # Nothing below touches `page_data` until the child scan starts, so the
    # last entry can be referenced directly for the heading-level fields.
    entry = page_data[-1]
    entry.pos = pos_value
    entry.pos_title = pos_title
    entry.tags.extend(pos_info.get("tags", []))

    for title_template in pos_title_node.find_content(NodeKind.TEMPLATE):
        if title_template.template_name != "S":
            continue
        if title_template.template_parameters.get(3) == "flexion":
            entry.tags.append("form-of")
        s_wikitext = wxr.wtp.node_to_wikitext(title_template)
        expanded_s = wxr.wtp.parse(s_wikitext, expand_all=True)
        # Only the first <span> of the expanded template supplies the id.
        first_span = next(iter(expanded_s.find_html("span")), None)
        if first_span is not None:
            entry.pos_id = first_span.attrs.get("id", "")

    child_nodes = list(pos_title_node.filter_empty_str_child())
    total_children = len(child_nodes)
    form_line_start = 0  # form line ("ligne de forme")
    level_node_index = total_children
    gloss_start = total_children
    inflection_prefix = f"{entry.lang_code}-"
    has_gloss_list = False

    for position, node in enumerate(child_nodes):
        if not isinstance(node, WikiNode):
            continue
        node_kind = node.kind
        if node_kind == NodeKind.TEMPLATE:
            name = node.template_name
            if name.endswith("-exemple") and len(page_data[-1].senses) > 0:
                # zh-exemple and ja-exemple expand to list thus are not the
                # child of gloss list item.
                process_exemple_template(wxr, node, page_data[-1].senses[-1])
            elif name.startswith(("zh-mot", "ja-mot")):
                # skip form line templates
                form_line_start = position
            elif name.startswith(inflection_prefix):
                extract_inflection(wxr, page_data, node)
        elif node_kind == NodeKind.BOLD and form_line_start == 0:
            form_line_start = position + 1
        elif node_kind == NodeKind.LIST:
            gloss_start = min(gloss_start, position)
            extract_gloss(wxr, page_data, node)
            has_gloss_list = True
        elif node_kind in LEVEL_KIND_FLAGS:
            level_node_index = position
            break

    extract_form_line(wxr, page_data, child_nodes[form_line_start:gloss_start])
    if has_gloss_list:
        return
    fallback_nodes = child_nodes[form_line_start:level_node_index]
    gloss_text = clean_node(wxr, None, fallback_nodes)
    if gloss_text != "":
        page_data[-1].senses.append(Sense(glosses=[gloss_text]))
def parse_page(
    wxr: WiktextractContext, page_title: str, page_text: str
) -> list[dict[str, Any]]:
    """Parse one page and return its word entries as dicts.

    Every level-2 node whose heading contains a `langue` template starts a
    language section; its level-3 children are handed to `parse_section`.
    The last non-`None` etymology result of a language section is passed to
    `insert_etymology_data`. Entries that end up without senses get a
    single sense tagged `no-gloss`. Each entry is dumped with
    `model_dump(exclude_defaults=True)`.
    """
    # Page structure
    # https://fr.wiktionary.org/wiki/Convention:Structure_des_pages
    if wxr.config.verbose:
        logger.info(f"Parsing page: {page_title}")
    wxr.config.word = page_title
    wxr.wtp.start_page(page_title)
    tree = wxr.wtp.parse(page_text)
    page_data: list[WordEntry] = []

    for level2_node in tree.find_child(NodeKind.LEVEL2):
        for heading_template in level2_node.find_content(NodeKind.TEMPLATE):
            # https://fr.wiktionary.org/wiki/Modèle:langue
            # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_des_langues
            if heading_template.template_name != "langue":
                continue
            categories = {}
            lang_code = heading_template.template_parameters.get(1)
            wanted_codes = wxr.config.capture_language_codes
            if wanted_codes is not None and lang_code not in wanted_codes:
                continue
            lang_name = clean_node(wxr, categories, heading_template)
            wxr.wtp.start_section(lang_name)
            base_data = WordEntry(
                word=wxr.wtp.title,
                lang_code=lang_code,
                lang=lang_name,
                pos="unknown",
                categories=categories.get("categories", []),
            )
            etymology_data: Optional[EtymologyData] = None
            for level3_node in level2_node.find_child(NodeKind.LEVEL3):
                section_etymology = parse_section(
                    wxr, page_data, base_data, level3_node
                )
                # Keep the most recent etymology result that is not None.
                if section_etymology is not None:
                    etymology_data = section_etymology

            if etymology_data is not None:
                insert_etymology_data(lang_code, page_data, etymology_data)

    for entry in page_data:
        if not entry.senses:
            entry.senses.append(Sense(tags=["no-gloss"]))
    return [entry.model_dump(exclude_defaults=True) for entry in page_data]
243def find_bottom_category_links(
244 wxr: WiktextractContext, page_data: list[WordEntry], level_node: WikiNode
245) -> None:
246 if len(page_data) == 0:
247 return
248 categories = {}
249 for node in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LINK):
250 if isinstance(node, TemplateNode) and node.template_name.endswith( 250 ↛ 253line 250 didn't jump to line 253 because the condition on line 250 was never true
251 " entrée"
252 ):
253 clean_node(wxr, categories, node)
254 elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
255 clean_node(wxr, categories, node)
257 for data in page_data:
258 if data.lang_code == page_data[-1].lang_code:
259 data.categories.extend(categories.get("categories", []))