Coverage for src/wiktextract/extractor/fr/form_line.py: 63%
103 statements
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
« prev ^ index » next coverage.py v7.6.4, created at 2024-10-25 10:11 +0000
1from typing import Union
3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode
5from ...page import clean_node
6from ...wxr_context import WiktextractContext
7from .conjugation import extract_conjugation
8from .models import Form, Sound, WordEntry
9from .pronunciation import (
10 ASPIRATED_H_TEMPLATES,
11 PRON_TEMPLATES,
12 process_pron_template,
13)
14from .tags import translate_raw_tags
def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[Union[WikiNode, str]],
) -> None:
    """Extract sounds, forms and tags from a form line ("ligne de forme").

    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    The form line is the wikitext between the part-of-speech subtitle and
    the first gloss; it contains IPA, gender and inflection forms.

    Only template and italic nodes in ``nodes`` are processed; plain strings
    and other node kinds are skipped.  All results are stored on the last
    entry of ``page_data``, and ``translate_raw_tags`` is run on that entry
    once every node has been visited.
    """
    skipped_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?"]
    )

    previous_template = ""
    for position, node in enumerate(nodes):
        if not isinstance(node, WikiNode):
            continue

        if node.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, node)
            # the bare italic word "ou" is never recorded as a tag
            if italic_text != "ou":
                page_data[-1].raw_tags.append(italic_text)
            continue

        if node.kind != NodeKind.TEMPLATE:
            continue

        name = node.template_name
        if name in skipped_templates:
            # skipped templates do not count as "previous template"
            continue

        if name in PRON_TEMPLATES:
            # the slice passes the single node right before the template
            # (it is empty when the template is the first node)
            page_data[-1].sounds.extend(
                process_pron_template(
                    wxr, node, [], nodes[position - 1 : position]
                )
            )
        elif name == "équiv-pour":
            process_equiv_pour_template(wxr, node, page_data)
        elif name.startswith("zh-mot"):
            process_zh_mot_template(wxr, node, page_data)
        elif name == "ja-mot":
            process_ja_mot_template(wxr, node, page_data)
        elif name in ("conj", "conjugaison") or name.startswith(
            ("ja-adj-", "ja-verbe")
        ):
            process_conj_template(wxr, node, page_data)
        elif name in ASPIRATED_H_TEMPLATES:
            # like the skipped templates, leaves previous_template untouched
            continue
        elif name == "lien pronominal":
            process_lien_pronominal(wxr, node, page_data)
        else:
            _record_form_line_template_tag(
                wxr, page_data, node, previous_template
            )

        previous_template = name

    translate_raw_tags(page_data[-1])


def _record_form_line_template_tag(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    node: TemplateNode,
    previous_template: str,
) -> None:
    """Store the cleaned text of an otherwise unhandled template as a tag.

    When the expanded template has exactly one direct
    ``<span id="région">`` child, the previous template was a pronunciation
    template and the entry already has sounds, the text is attached to the
    last sound.  Otherwise a non-empty text becomes a raw tag of the entry.
    """
    tag_text = clean_node(wxr, page_data[-1], node)
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    region_spans = list(
        expanded_template.find_html(
            "span", attr_name="id", attr_value="région"
        )
    )
    bare_text = tag_text.strip("()")

    if (
        len(region_spans) == 1
        and previous_template in PRON_TEMPLATES
        and len(page_data[-1].sounds) > 0
    ):
        # it's the location of the previous IPA template
        # https://fr.wiktionary.org/wiki/Modèle:région
        page_data[-1].sounds[-1].raw_tags.append(bare_text)
    elif len(bare_text) > 0:
        # parentheses are removed only when they wrap the whole text
        if tag_text.startswith("(") and tag_text.endswith(")"):
            page_data[-1].raw_tags.append(bare_text)
        else:
            page_data[-1].raw_tags.append(tag_text)
def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> None:
    """Collect equivalent forms from an ``équiv-pour`` template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is fully expanded and its direct italic and HTML children
    are walked in order.  Italic text sets the gender label that applies to
    the ``<bdi>`` elements following it; every ``<bdi>`` with non-empty text
    is appended as a ``Form`` to the last entry of ``page_data``.
    """
    known_genders = {
        "un homme": "masculine",
        "une femme": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expansion = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )

    current_label = ""
    for child in expansion.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("() ")
            italic_text = italic_text.removeprefix("pour ")
            # drop whatever follows the last comma
            current_label = italic_text.rsplit(",", 1)[0]
            continue

        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        equivalent = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if current_label in known_genders:
            equivalent.tags.append(known_genders[current_label])
        elif len(current_label) > 0:
            # unknown label: keep it verbatim as a raw tag
            equivalent.raw_tags.append(current_label)

        if len(equivalent.form) > 0:
            page_data[-1].forms.append(equivalent)
def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    The template is re-parsed with ``pre_expand=True`` and
    ``additional_expand`` restricted to its own name (presumably so that the
    templates it contains stay available as template nodes; confirm against
    the parser documentation).  Each direct template child named ``lang``
    (case-insensitive) yields a Pinyin sound, each ``pron`` or
    ``prononciation`` child yields an IPA sound; both are appended to the
    last entry of ``page_data``.
    """
    expanded_root = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in expanded_root.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner_template),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner_template))
        else:
            continue
        page_data[-1].sounds.append(sound)
def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract the romanization from a Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The template is fully expanded; the text of the ``<span>`` reported at
    index 1 is appended to the last entry of ``page_data`` as a form tagged
    ``romanization``, unless a form with the same text is already present.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_forms = {form.form for form in page_data[-1].forms}

    for position, span in expanded_node.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        # NOTE(review): `position` is whatever index find_html() yields with
        # with_index=True; confirm it counts spans rather than all children.
        if position != 1:
            continue
        romanization = clean_node(wxr, None, span)
        if romanization not in known_forms:
            # avoid adding duplicated form data extracted from
            # inflection table before the form line
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break
def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract conjugation forms and the remaining tag text of a template.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Every direct link child of the expanded template whose target starts
    with ``Conjugaison:`` (and is not one of the verb-group pages) triggers
    extraction of that conjugation page into the last entry of
    ``page_data``.  The cleaned text of the expanded template, minus the
    trailing link label, is then stored as a raw tag when non-empty.
    """
    verb_group_pages = (
        "Premier groupe",
        "Deuxième groupe",
        "Troisième groupe",
    )
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )

    for link in expanded_node.find_child(NodeKind.LINK):
        if not link.largs:
            continue
        target = link.largs[0][0]
        if not target.startswith("Conjugaison:"):
            continue
        # the part after the first "/" is the conjugated word or a group page
        if target.split("/", 1)[-1] in verb_group_pages:
            continue

        # The previous entry can be reused when it has the same language and
        # part of speech and its last form came from the same page.
        same_page_already_extracted = (
            len(page_data) > 1
            and page_data[-2].lang_code == page_data[-1].lang_code
            and page_data[-2].pos == page_data[-1].pos
            and len(page_data[-2].forms) > 0
            and page_data[-2].forms[-1].source == target
        )
        if same_page_already_extracted:
            # NOTE(review): depending on the model's assignment behaviour
            # both entries may end up sharing one list object; confirm.
            page_data[-1].forms = page_data[-2].forms
        else:
            extract_conjugation(wxr, page_data[-1], target)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    name = template_node.template_name
    if name in ("conj", "conjugaison"):
        trailing_labels = ("(voir la conjugaison)",)
    elif name.startswith("ja-"):
        trailing_labels = ("(conjugaison)", "(flexions)")
    else:
        # any other template name: the text is kept exactly as cleaned
        trailing_labels = None

    if trailing_labels is not None:
        for label in trailing_labels:
            tag = tag.removesuffix(label)
        tag = tag.strip()

    if len(tag) > 0:
        page_data[-1].raw_tags.append(tag)
def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract pronominal forms from a ``lien pronominal`` template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    Every ``<bdi>`` element found anywhere in the expanded template becomes
    a form tagged ``pronominal`` on the last entry of ``page_data``, unless
    its text is empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi_node in expanded_node.find_html_recursively("bdi"):
        pronominal_form = Form(
            form=clean_node(wxr, None, bdi_node), tags=["pronominal"]
        )
        if pronominal_form.form == "":
            continue
        page_data[-1].forms.append(pronominal_form)

    # Result is discarded: the call is made with the entry as second
    # argument, presumably so clean_node can record data such as categories
    # on it (confirm against clean_node).
    clean_node(wxr, page_data[-1], expanded_node)