Coverage for src/wiktextract/extractor/fr/form_line.py: 65%
111 statements
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
« prev ^ index » next coverage.py v7.6.10, created at 2024-12-27 08:07 +0000
1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
3from ...page import clean_node
4from ...wxr_context import WiktextractContext
5from .conjugation import extract_conjugation
6from .models import Form, Sound, WordEntry
7from .pronunciation import (
8 ASPIRATED_H_TEMPLATES,
9 PRON_TEMPLATES,
10 process_pron_template,
11)
12from .tags import translate_raw_tags
def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """
    Form line ("ligne de forme")
    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    A line of wikitext between pos subtitle and the first gloss, contains IPA,
    gender and inflection forms.

    `nodes` is walked in order and everything found is stored on the last
    entry of `page_data`. Templates are dispatched by name to the matching
    handler; a template without a dedicated handler is turned into a raw
    tag. Italic text outside templates is kept as a raw tag unless it is
    the word "ou". A "note" template ends the walk: the text of all nodes
    after it is saved as a note. Finally `translate_raw_tags` is run on the
    last entry.
    """
    skipped_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?"]
    )
    conj_template_names = ("conj", "conjugaison")
    ja_conj_prefixes = ("ja-adj-", "ja-verbe")

    last_template_name = ""
    for position, item in enumerate(nodes):
        if not isinstance(item, TemplateNode):
            if isinstance(item, WikiNode) and item.kind == NodeKind.ITALIC:
                italic_text = clean_node(wxr, None, item)
                if italic_text != "ou":
                    page_data[-1].raw_tags.append(italic_text)
            continue

        name = item.template_name
        if name in skipped_templates:
            # skipped templates do not count as the "previous template"
            continue
        elif name in PRON_TEMPLATES:
            # the handler also receives the node right before the template
            # (an empty list when the template is the first node)
            preceding = nodes[position - 1 : position]
            page_data[-1].sounds.extend(
                process_pron_template(wxr, item, [], preceding)
            )
        elif name == "équiv-pour":
            process_equiv_pour_template(wxr, item, page_data)
        elif name.startswith("zh-mot"):
            process_zh_mot_template(wxr, item, page_data)
        elif name == "ja-mot":
            process_ja_mot_template(wxr, item, page_data)
        elif name in conj_template_names or name.startswith(
            ja_conj_prefixes
        ):
            process_conj_template(wxr, item, page_data)
        elif name in ASPIRATED_H_TEMPLATES:
            continue
        elif name == "lien pronominal":
            process_lien_pronominal(wxr, item, page_data)
        elif name == "note":
            # everything after the template is the note text
            note_text = clean_node(wxr, page_data[-1], nodes[position + 1 :])
            if note_text != "":
                page_data[-1].notes.append(note_text)
            break
        else:
            tag_text = clean_node(wxr, page_data[-1], item)
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(item), expand_all=True
            )
            region_span_count = sum(
                1
                for _ in expanded.find_html(
                    "span", attr_name="id", attr_value="région"
                )
            )
            bare_tag = tag_text.strip("()")
            if (
                region_span_count == 1
                and last_template_name in PRON_TEMPLATES
                and len(page_data[-1].sounds) > 0
            ):
                # it's the location of the previous IPA template
                # https://fr.wiktionary.org/wiki/Modèle:région
                page_data[-1].sounds[-1].raw_tags.append(bare_tag)
            elif len(bare_tag) > 0:
                # parentheses are only removed when they wrap the whole text
                if tag_text.startswith("(") and tag_text.endswith(")"):
                    tag_text = bare_tag
                page_data[-1].raw_tags.append(tag_text)

        last_template_name = name

    translate_raw_tags(page_data[-1])
def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """
    Extract equivalent forms from the "équiv-pour" template
    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is expanded and its direct italic and HTML children are
    read in order. Italic text sets the current gender label (surrounding
    parentheses and spaces, a leading "pour " and anything after the last
    comma are removed). Each "bdi" element becomes a `Form` that carries
    the most recent label: as a tag when the label is a known one,
    otherwise as a raw tag.

    Returns the list of forms with non-empty text. When `page_data` is not
    empty the same forms are also appended to its last entry.
    """
    known_genders = {
        "un homme": "masculine",
        "une femme": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    gender_label = ""
    collected = []
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("() ")
            gender_label = italic_text.removeprefix("pour ").rsplit(",", 1)[0]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        equiv_form = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if len(gender_label) > 0:
            mapped_tag = known_genders.get(gender_label)
            if mapped_tag is None:
                equiv_form.raw_tags.append(gender_label)
            else:
                equiv_form.tags.append(mapped_tag)
        if len(equiv_form.form) == 0:
            continue
        if len(page_data) > 0:
            page_data[-1].forms.append(equiv_form)
        collected.append(equiv_form)
    return collected
def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Chinese form line template: zh-mot, zh-mot-s, zh-mot-t
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    Only the given template itself is expanded, so the templates it emits
    stay as template nodes. Among the direct children of the result, a
    "lang" template (name compared case-insensitively) is stored as a
    `Sound` with `zh_pron` and the "Pinyin" tag, and a "pron" or
    "prononciation" template is stored as a `Sound` with `ipa`. Sounds are
    appended to the last entry of `page_data`.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner_template),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner_template))
        else:
            continue
        page_data[-1].sounds.append(sound)
def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Japanese form line template
    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    Expands the template and reads the "span" element found at index 1 of
    the expanded node. Its text is appended to the forms of the last entry
    of `page_data` with the "romanization" tag, unless a form with the same
    text is already there.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_form_texts = {form.form for form in page_data[-1].forms}
    # the first span tag is the word, the second is Hepburn romanization
    romanization_span = next(
        (
            span
            for position, span in expanded.find_html("span", with_index=True)
            if position == 1
        ),
        None,
    )
    if romanization_span is None:
        return
    romanization = clean_node(wxr, None, romanization_span)
    # avoid adding duplicated form data extracted from
    # inflection table before the form line
    if romanization not in known_form_texts:
        page_data[-1].forms.append(
            Form(form=romanization, tags=["romanization"])
        )
def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Handle a conjugation template found on the form line
    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    The template is expanded and each direct link child whose target starts
    with "Conjugaison:" is treated as a conjugation page, except the three
    verb group pages. Forms are extracted from that page with
    `extract_conjugation`, or copied from the previous entry when that entry
    has the same language and part of speech and its last form came from the
    same conjugation page.

    The cleaned text of the expanded template, without its trailing link
    label, is appended to the raw tags of the last entry when not empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for link in expanded_node.find_child(NodeKind.LINK):
        # skip links without a target; indexing an empty argument would
        # raise IndexError
        if len(link.largs) == 0 or len(link.largs[0]) == 0:
            continue
        conj_title = link.largs[0][0]
        # the first element of a link target may be a node instead of text;
        # only a plain string can name a conjugation page
        if not isinstance(conj_title, str):
            continue
        if not conj_title.startswith("Conjugaison:"):
            continue
        conj_word = conj_title.split("/", 1)[-1]
        if conj_word in (
            "Premier groupe",
            "Deuxième groupe",
            "Troisième groupe",
        ):
            continue
        if (
            len(page_data) > 1
            and page_data[-2].lang_code == page_data[-1].lang_code
            and page_data[-2].pos == page_data[-1].pos
            and len(page_data[-2].forms) > 0
            and page_data[-2].forms[-1].source == conj_title
        ):
            # reuse the forms already extracted for the previous entry, but
            # in a separate list: forms appended to this entry later must
            # not end up in the previous entry as well
            page_data[-1].forms = list(page_data[-2].forms)
        else:
            extract_conjugation(wxr, page_data[-1], conj_title)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    # drop the label of the link to the conjugation page
    if template_node.template_name in ("conj", "conjugaison"):
        tag = tag.removesuffix("(voir la conjugaison)").strip()
    elif template_node.template_name.startswith("ja-"):
        tag = (
            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
        )
    if len(tag) > 0:
        page_data[-1].raw_tags.append(tag)
def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """
    Extract forms from the "lien pronominal" template
    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    Every "bdi" element found anywhere in the expanded template becomes a
    form tagged "pronominal" on the last entry of `page_data`; empty texts
    are dropped. The expanded template is then passed to `clean_node`
    together with that entry and the returned text is discarded.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi_element in expanded.find_html_recursively("bdi"):
        pronominal_form = Form(
            form=clean_node(wxr, None, bdi_element), tags=["pronominal"]
        )
        if pronominal_form.form == "":
            continue
        page_data[-1].forms.append(pronominal_form)
    # result is not used; the call is made with the entry as second argument
    clean_node(wxr, page_data[-1], expanded)