Coverage for src/wiktextract/extractor/fr/form_line.py: 71%
129 statements
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
« prev ^ index » next coverage.py v7.10.3, created at 2025-08-15 05:18 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .conjugation import extract_conjugation
6from .models import Form, Sound, WordEntry
7from .pronunciation import (
8 ASPIRATED_H_TEMPLATES,
9 PRON_TEMPLATES,
10 process_pron_template,
11)
12from .tags import translate_raw_tags
def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """Extract data from the form line of a French Wiktionary entry.

    The form line ("ligne de forme") is the line of wikitext between the
    part-of-speech subtitle and the first gloss; it contains IPA, gender and
    inflection forms.
    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    Everything extracted is stored on the last entry of ``page_data``. After
    the nodes have been walked, ``translate_raw_tags`` is applied to that
    entry.
    """
    ignored_templates = frozenset(
        {"voir-conj", "genre ?", "nombre ?", "pluriel ?", "réf"}
    )

    previous_template = ""
    seen_bold = False
    for position, item in enumerate(nodes):
        if isinstance(item, TemplateNode):
            name = item.template_name
            if name in ignored_templates:
                continue
            elif name in PRON_TEMPLATES:
                entry = page_data[-1]
                entry.sounds.extend(
                    process_pron_template(
                        wxr,
                        item,
                        [],
                        entry.lang_code,
                        # at most one node: the one right before the template
                        nodes[position - 1 : position],
                    )
                )
            elif name == "équiv-pour":
                process_equiv_pour_template(wxr, item, page_data)
            elif name.startswith("zh-mot"):
                process_zh_mot_template(wxr, item, page_data)
            elif name == "ja-mot":
                process_ja_mot_template(wxr, item, page_data)
            elif name in ("conj", "conjugaison") or name.startswith(
                ("ja-adj-", "ja-verbe")
            ):
                process_conj_template(wxr, item, page_data)
            elif name in ASPIRATED_H_TEMPLATES:
                continue
            elif name == "lien pronominal":
                process_lien_pronominal(wxr, item, page_data)
            elif name == "note":
                # everything after the "note" template is the note text, so
                # no further node of the form line is processed
                note_text = clean_node(
                    wxr, page_data[-1], nodes[position + 1 :]
                )
                if note_text != "":
                    page_data[-1].notes.append(note_text)
                break
            else:
                entry = page_data[-1]
                raw_tag = clean_node(wxr, entry, item)
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(item), expand_all=True
                )
                region_spans = list(
                    expanded.find_html(
                        "span", attr_name="id", attr_value="région"
                    )
                )
                bare_tag = raw_tag.strip("()")
                follows_pron_template = previous_template in PRON_TEMPLATES
                if (
                    len(region_spans) == 1
                    and follows_pron_template
                    and entry.sounds
                ):
                    # it's the location of the previous IPA template
                    # https://fr.wiktionary.org/wiki/Modèle:région
                    entry.sounds[-1].raw_tags.append(bare_tag)
                elif bare_tag:
                    # parentheses are only dropped when they wrap the whole
                    # tag text
                    wrapped = raw_tag.startswith("(") and raw_tag.endswith(
                        ")"
                    )
                    entry.raw_tags.append(bare_tag if wrapped else raw_tag)

            # not reached by the branches above that `continue` or `break`
            previous_template = name
        elif isinstance(item, WikiNode):
            if item.kind == NodeKind.ITALIC:
                italic_text = clean_node(wxr, None, item)
                # "ou" is only a connector between items, not a tag
                if italic_text != "ou":
                    page_data[-1].raw_tags.append(italic_text)
            elif item.kind == NodeKind.LINK:
                process_conj_link_node(wxr, item, page_data)
            elif item.kind == NodeKind.BOLD and not seen_bold:
                # only the first bold node of the line is examined
                process_form_line_bold_node(wxr, item, page_data[-1])
                seen_bold = True

    translate_raw_tags(page_data[-1])
def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """Extract equivalent forms from an expanded "équiv-pour" template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    Direct italic children of the expansion provide the gender wording that
    applies to the ``<bdi>`` children following them. Each non-empty ``<bdi>``
    text becomes a ``Form``; it is appended to the last entry of ``page_data``
    when ``page_data`` is not empty and is always included in the returned
    list.
    """
    known_genders = {
        "un homme": "masculine",
        "une femme": "feminine",
        "des femmes": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    pending_gender = ""
    collected = []
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            # keep the wording without the surrounding parentheses, the
            # leading "pour " and whatever follows the last comma
            italic_text = clean_node(wxr, None, child).strip("() ")
            pending_gender = italic_text.removeprefix("pour ").rsplit(",", 1)[
                0
            ]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        equivalent = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if pending_gender:
            gender = known_genders.get(pending_gender)
            if gender is not None:
                equivalent.tags.append(gender)
            else:
                # unknown wording is kept verbatim as a raw tag
                equivalent.raw_tags.append(pending_gender)
        if not equivalent.form:
            continue
        if page_data:
            page_data[-1].forms.append(equivalent)
        collected.append(equivalent)
    return collected
def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Collect sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    The template is re-parsed with only itself additionally expanded, so the
    templates it emits remain as template nodes. Among the direct children,
    "lang" templates (name compared case-insensitively) are stored as Pinyin
    and "pron"/"prononciation" templates as IPA on the last entry of
    ``page_data``.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sounds = page_data[-1].sounds
            sounds.append(
                Sound(
                    zh_pron=clean_node(wxr, None, inner_template),
                    tags=["Pinyin"],
                )
            )
        elif inner_name in ("pron", "prononciation"):
            sounds = page_data[-1].sounds
            sounds.append(Sound(ipa=clean_node(wxr, None, inner_template)))
def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the romanization shown by the Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The text of the ``<span>`` reported with index 1 is appended to the forms
    of the last entry of ``page_data`` with the "romanization" tag, unless a
    form with the same text is already present.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    entry = page_data[-1]
    known_forms = {known.form for known in entry.forms}
    for span_index, span in expanded.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if span_index != 1:
            continue
        romanization = clean_node(wxr, None, span)
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        if romanization not in known_forms:
            entry.forms.append(Form(form=romanization, tags=["romanization"]))
        break
def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a conjugation template found on the form line.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Every direct link child of the expanded template is passed to
    ``process_conj_link_node``. The remaining text of the expansion, minus the
    trailing link label for the known template families, is stored as a raw
    tag on the last entry of ``page_data`` when it is not empty.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for conj_link in expanded.find_child(NodeKind.LINK):
        process_conj_link_node(wxr, conj_link, page_data)

    tag = clean_node(wxr, page_data[-1], expanded)
    name = template_node.template_name
    if name in ("conj", "conjugaison"):
        link_labels = ("(voir la conjugaison)",)
    elif name.startswith("ja-"):
        link_labels = ("(conjugaison)", "(flexions)")
    else:
        link_labels = None
    if link_labels is not None:
        # labels are removed in order, then the result is trimmed once
        for label in link_labels:
            tag = tag.removesuffix(label)
        tag = tag.strip()
    if tag:
        page_data[-1].raw_tags.append(tag)
def is_conj_link(wxr: WiktextractContext, link: WikiNode) -> bool:
    """Return True if the link target starts with "Conjugaison:".

    Only the first element of the first link argument is examined; a link
    with no arguments or an empty first argument is not a conjugation link.
    """
    if not link.largs or not link.largs[0]:
        return False
    return clean_node(wxr, None, link.largs[0][0]).startswith("Conjugaison:")
def process_conj_link_node(
    wxr: WiktextractContext,
    link: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Load conjugation forms for a link that points to a conjugation page.

    Links that ``is_conj_link`` rejects are ignored, as are links whose page
    name after the first "/" is one of the verb group pages. When the previous
    entry has the same language and part of speech and its last form was
    extracted from the same conjugation page, its forms are reused instead of
    extracting the page again; otherwise ``extract_conjugation`` is called for
    the last entry of ``page_data``.
    """
    if not is_conj_link(wxr, link):
        return
    link_target = link.largs[0][0]
    # is_conj_link() cleans the target to text before testing it, so a
    # node-valued target can get this far; clean it the same way rather than
    # calling str methods on a node. String targets are used unchanged.
    if isinstance(link_target, str):
        conj_title = link_target
    else:
        conj_title = clean_node(wxr, None, link_target)
    conj_word = conj_title.split("/", 1)[-1]
    if conj_word in (
        "Premier groupe",
        "Deuxième groupe",
        "Troisième groupe",
    ):
        return
    if (
        len(page_data) > 1
        and page_data[-2].lang_code == page_data[-1].lang_code
        and page_data[-2].pos == page_data[-1].pos
        and len(page_data[-2].forms) > 0
        and page_data[-2].forms[-1].source == conj_title
    ):
        # Assign a shallow copy so the two entries never share one mutable
        # list: a form appended to this entry later must not show up in the
        # previous entry as well.
        page_data[-1].forms = list(page_data[-2].forms)
    else:
        extract_conjugation(wxr, page_data[-1], conj_title)
def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the pronominal forms listed by the "lien pronominal" template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    Each non-empty ``<bdi>`` text found anywhere in the expanded template is
    appended to the forms of the last entry of ``page_data`` with the
    "pronominal" tag.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi in expanded.find_html_recursively("bdi"):
        pronominal = Form(form=clean_node(wxr, None, bdi), tags=["pronominal"])
        if pronominal.form == "":
            continue
        page_data[-1].forms.append(pronominal)
    # The returned text is discarded; the call is made with the entry,
    # presumably so clean_node can record data from the expansion on it
    # (e.g. categories) -- TODO confirm against clean_node.
    clean_node(wxr, page_data[-1], expanded)
def process_form_line_bold_node(
    wxr: WiktextractContext, bold_node: WikiNode, word_entry: WordEntry
):
    """Use the bold text of the form line as the word or a canonical form.

    On pages under "Titres non pris en charge/" (unsupported titles) the bold
    text replaces the entry's word and the page title is kept as
    ``original_title``:
    https://fr.wiktionary.org/wiki/Annexe:Titres_non_pris_en_charge
    https://fr.wiktionary.org/wiki/Spécial:Index/Titres_non_pris_en_charge

    On any other page, bold text that is non-empty and differs from the page
    title is added as a form tagged "canonical".
    """
    bold_text = clean_node(wxr, None, bold_node)
    page_title = wxr.wtp.title
    if page_title.startswith("Titres non pris en charge/"):
        word_entry.word = bold_text
        word_entry.original_title = page_title
        return
    if bold_text != page_title and bold_text != "":
        word_entry.forms.append(Form(form=bold_text, tags=["canonical"]))