Coverage for src/wiktextract/extractor/fr/form_line.py: 68%
118 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-04 10:58 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .conjugation import extract_conjugation
6from .models import Form, Sound, WordEntry
7from .pronunciation import (
8 ASPIRATED_H_TEMPLATES,
9 PRON_TEMPLATES,
10 process_pron_template,
11)
12from .tags import translate_raw_tags
def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """
    Extract data from a form line ("ligne de forme").
    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    A line of wikitext between pos subtitle and the first gloss, contains IPA,
    gender and inflection forms.

    Everything found is stored on the last entry of `page_data`
    (`sounds`, `forms`, `notes` and `raw_tags`); nothing is returned.
    Once all nodes are processed, `translate_raw_tags()` is called on that
    entry.

    :param wxr: extraction context, used to expand and clean wikitext.
    :param page_data: entries extracted so far; the last one is updated.
    :param nodes: parsed nodes of the form line, in document order.
    """
    IGNORE_TEMPLATES = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?", "réf"]
    )

    # Name of the last template that was fully handled; templates skipped
    # with `continue` below do not update it.
    pre_template_name = ""
    for index, node in enumerate(nodes):
        if isinstance(node, TemplateNode):
            if node.template_name in IGNORE_TEMPLATES:
                continue
            elif node.template_name in PRON_TEMPLATES:
                page_data[-1].sounds.extend(
                    process_pron_template(
                        wxr,
                        node,
                        [],
                        page_data[-1].lang_code,
                        # the node right before the template
                        # (an empty list when index is 0)
                        nodes[index - 1 : index],
                    )
                )
            elif node.template_name == "équiv-pour":
                process_equiv_pour_template(wxr, node, page_data)
            elif node.template_name.startswith("zh-mot"):
                process_zh_mot_template(wxr, node, page_data)
            elif node.template_name == "ja-mot":
                process_ja_mot_template(wxr, node, page_data)
            elif node.template_name in (
                "conj",
                "conjugaison",
            ) or node.template_name.startswith(("ja-adj-", "ja-verbe")):
                process_conj_template(wxr, node, page_data)
            elif node.template_name in ASPIRATED_H_TEMPLATES:
                continue
            elif node.template_name == "lien pronominal":
                process_lien_pronominal(wxr, node, page_data)
            elif node.template_name == "note":
                # everything after the "note" template is the note text,
                # so the rest of the line is consumed here
                note = clean_node(wxr, page_data[-1], nodes[index + 1 :])
                if note != "":
                    page_data[-1].notes.append(note)
                break
            else:
                raw_tag = clean_node(wxr, page_data[-1], node)
                expanded_template = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(node), expand_all=True
                )
                region_spans = list(
                    expanded_template.find_html(
                        "span", attr_name="id", attr_value="région"
                    )
                )
                stripped_tag = raw_tag.strip("()")
                if (
                    len(region_spans) == 1
                    and pre_template_name in PRON_TEMPLATES
                    and len(page_data[-1].sounds) > 0
                ):
                    # it's the location of the previous IPA template
                    # https://fr.wiktionary.org/wiki/Modèle:région
                    # (an empty tag is dropped rather than stored)
                    if stripped_tag != "":
                        page_data[-1].sounds[-1].raw_tags.append(stripped_tag)
                elif stripped_tag != "":
                    # only remove the parentheses when they wrap the
                    # whole tag text
                    if raw_tag.startswith("(") and raw_tag.endswith(")"):
                        raw_tag = stripped_tag
                    page_data[-1].raw_tags.append(raw_tag)

            pre_template_name = node.template_name
        elif isinstance(node, WikiNode) and node.kind == NodeKind.ITALIC:
            raw_tag = clean_node(wxr, None, node)
            # "ou" ("or") only joins two tags; an empty string carries no
            # information, so neither is kept as a raw tag
            if raw_tag not in ("", "ou"):
                page_data[-1].raw_tags.append(raw_tag)
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            process_conj_link_node(wxr, node, page_data)

    translate_raw_tags(page_data[-1])
def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """
    Extract equivalent forms from the "équiv-pour" template.
    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The expanded template is scanned for italic and HTML children. The
    cleaned text of an italic child becomes the current gender label, and
    every following `<bdi>` child becomes a `Form` carrying that label:
    as a tag when the label is a known gender, as a raw tag otherwise.

    :return: all non-empty forms found. They are also appended to the last
        entry of `page_data` when `page_data` is not empty.
    """
    known_genders = {
        "un homme": "masculine",
        "une femme": "feminine",
        "des femmes": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)

    current_label = ""
    found_forms = []
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            # drop the surrounding parentheses/spaces, the leading "pour "
            # and whatever follows the last comma
            label = clean_node(wxr, None, child).strip("() ")
            current_label = label.removeprefix("pour ").rsplit(",", 1)[0]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        new_form = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if current_label != "":
            gender = known_genders.get(current_label)
            if gender is None:
                new_form.raw_tags.append(current_label)
            else:
                new_form.tags.append(gender)
        if new_form.form == "":
            continue
        if page_data:
            page_data[-1].forms.append(new_form)
        found_forms.append(new_form)
    return found_forms
def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Extract sounds from the Chinese form line templates:
    zh-mot, zh-mot-s, zh-mot-t
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    The template is parsed with `pre_expand=True` and only its own name in
    `additional_expand`, then the template nodes found among the children
    of the result are inspected:

    - "lang" (case-insensitive): added as a `Sound` with `zh_pron` set and
      the "Pinyin" tag
    - "pron" or "prononciation": added as a `Sound` with `ipa` set

    Sounds are appended to the last entry of `page_data`.
    """
    partially_expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in partially_expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner_template),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner_template))
        else:
            continue
        page_data[-1].sounds.append(sound)
def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Extract the romanization from the Japanese form line template.
    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The first span tag of the expanded template is the word, the second is
    the Hepburn romanization. Only the span reported with index 1 is used;
    it is added to the last entry of `page_data` as a "romanization" form,
    unless that entry already has a form with the same text.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_forms = {known.form for known in page_data[-1].forms}

    romanization_span = next(
        (
            span
            for position, span in expanded.find_html("span", with_index=True)
            if position == 1
        ),
        None,
    )
    if romanization_span is None:
        return

    romanization = clean_node(wxr, None, romanization_span)
    if romanization in known_forms:
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        return
    page_data[-1].forms.append(Form(form=romanization, tags=["romanization"]))
def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Handle a conjugation template found on the form line.
    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Each link that is a direct child of the expanded template is passed to
    `process_conj_link_node()`. The cleaned text of the expansion, minus
    the trailing link label, is then stored as a raw tag on the last entry
    of `page_data` when it is not empty.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for conj_link in expanded.find_child(NodeKind.LINK):
        process_conj_link_node(wxr, conj_link, page_data)

    name = template_node.template_name
    raw_tag = clean_node(wxr, page_data[-1], expanded)
    if name in ("conj", "conjugaison"):
        raw_tag = raw_tag.removesuffix("(voir la conjugaison)").strip()
    elif name.startswith("ja-"):
        # at most one of these labels ends the text; they are tried in
        # this order
        for link_label in ("(conjugaison)", "(flexions)"):
            raw_tag = raw_tag.removesuffix(link_label)
        raw_tag = raw_tag.strip()
    if raw_tag:
        page_data[-1].raw_tags.append(raw_tag)
def is_conj_link(wxr: WiktextractContext, link: WikiNode) -> bool:
    """
    Tell whether `link` points to a conjugation page.

    Returns True when the cleaned text of the first item of the link's
    first argument starts with "Conjugaison:". Returns False otherwise,
    including when the link has no argument or an empty first argument.
    """
    if not link.largs or not link.largs[0]:
        return False
    target_title = clean_node(wxr, None, link.largs[0][0])
    return target_title.startswith("Conjugaison:")
def process_conj_link_node(
    wxr: WiktextractContext,
    link: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """
    Extract conjugation forms from a link to a "Conjugaison:" page.

    Links that are not conjugation links (see `is_conj_link()`) and links
    to the generic verb group pages are ignored. Otherwise the forms of the
    last entry of `page_data` are either copied from the previous entry or
    extracted with `extract_conjugation()`.

    :param wxr: extraction context.
    :param link: link node found on the form line or in an expanded
        conjugation template.
    :param page_data: entries extracted so far; the last one is updated.
    """
    if not is_conj_link(wxr, link):
        return
    # is_conj_link() checks the *cleaned* text of this value, so it may be
    # a node rather than a plain string; convert it before using str
    # methods. Plain strings are kept untouched.
    first_arg = link.largs[0][0]
    if isinstance(first_arg, str):
        conj_title = first_arg
    else:
        conj_title = clean_node(wxr, None, first_arg)
    conj_word = conj_title.split("/", 1)[-1]
    if conj_word in (
        "Premier groupe",
        "Deuxième groupe",
        "Troisième groupe",
    ):
        # the part after "/" names a verb group, not a word
        return
    if (
        len(page_data) > 1
        and page_data[-2].lang_code == page_data[-1].lang_code
        and page_data[-2].pos == page_data[-1].pos
        and len(page_data[-2].forms) > 0
        and page_data[-2].forms[-1].source == conj_title
    ):
        # The previous entry has the same language and pos, and its last
        # form came from the same conjugation page: reuse its forms.
        # A copy of the list is assigned so that forms appended later to
        # one entry are not added to the other one as well.
        page_data[-1].forms = list(page_data[-2].forms)
    else:
        extract_conjugation(wxr, page_data[-1], conj_title)
def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Extract pronominal forms from the "lien pronominal" template.
    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    Every `<bdi>` tag found recursively in the expanded template is added
    to the last entry of `page_data` as a form tagged "pronominal", unless
    its text is empty.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi_node in expanded.find_html_recursively("bdi"):
        pronominal_form = Form(
            form=clean_node(wxr, None, bdi_node), tags=["pronominal"]
        )
        if pronominal_form.form == "":
            continue
        page_data[-1].forms.append(pronominal_form)
    # The returned text is discarded on purpose.
    # NOTE(review): presumably called for its side effects on the entry
    # (e.g. collecting categories) - confirm against clean_node().
    clean_node(wxr, page_data[-1], expanded)