Coverage for src/wiktextract/extractor/fr/form_line.py

1from typing import Union

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from .conjugation import extract_conjugation

8from .models import Form, Sound, WordEntry

9from .pronunciation import (

10 ASPIRATED_H_TEMPLATES,

11 PRON_TEMPLATES,

12 process_pron_template,

13)

14from .tags import translate_raw_tags

def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[Union[WikiNode, str]],
) -> None:
    """Extract sounds, forms and tags from a form line ("ligne de forme").

    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    The form line is the line of wikitext between the POS subtitle and the
    first gloss; it contains IPA, gender and inflection forms.

    Only template nodes and italic nodes are looked at; plain strings and
    other node kinds are skipped.  Everything extracted is stored on the
    last entry of `page_data`, and `translate_raw_tags` is called on that
    entry after all nodes have been handled.
    """
    skipped_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?"]
    )

    # Name of the latest template that reached the end of the dispatch
    # below; skipped and aspirated-h templates do not update it.
    previous_template = ""
    for position, item in enumerate(nodes):
        if not isinstance(item, WikiNode):
            continue

        if item.kind == NodeKind.TEMPLATE:
            name = item.template_name
            if name in skipped_templates:
                continue

            if name in PRON_TEMPLATES:
                # the node right before the template (if any) is passed
                # along to the pronunciation helper
                preceding = nodes[position - 1 : position]
                page_data[-1].sounds.extend(
                    process_pron_template(wxr, item, [], preceding)
                )
            elif name == "équiv-pour":
                process_equiv_pour_template(wxr, item, page_data)
            elif name.startswith("zh-mot"):
                process_zh_mot_template(wxr, item, page_data)
            elif name == "ja-mot":
                process_ja_mot_template(wxr, item, page_data)
            elif name in ("conj", "conjugaison") or name.startswith(
                ("ja-adj-", "ja-verbe")
            ):
                process_conj_template(wxr, item, page_data)
            elif name in ASPIRATED_H_TEMPLATES:
                continue
            elif name == "lien pronominal":
                process_lien_pronominal(wxr, item, page_data)
            else:
                # Any other template: its cleaned text becomes a raw tag.
                tag_text = clean_node(wxr, page_data[-1], item)
                expanded = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(item), expand_all=True
                )
                region_spans = list(
                    expanded.find_html(
                        "span", attr_name="id", attr_value="région"
                    )
                )
                bare_text = tag_text.strip("()")
                if (
                    len(region_spans) == 1
                    and previous_template in PRON_TEMPLATES
                    and len(page_data[-1].sounds) > 0
                ):
                    # it's the location of the previous IPA template
                    # https://fr.wiktionary.org/wiki/Modèle:région
                    page_data[-1].sounds[-1].raw_tags.append(bare_text)
                elif bare_text != "":
                    # parentheses are only removed when they wrap the
                    # whole text
                    wrapped = tag_text.startswith("(") and tag_text.endswith(
                        ")"
                    )
                    page_data[-1].raw_tags.append(
                        bare_text if wrapped else tag_text
                    )

            previous_template = name
        elif item.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, item)
            # "ou" ("or") is not stored as a tag
            if italic_text != "ou":
                page_data[-1].raw_tags.append(italic_text)

    translate_raw_tags(page_data[-1])

def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> None:
    """Add the forms shown by an "équiv-pour" template to the last entry.

    equivalent form: https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is expanded and its direct italic and HTML children are
    walked in order.  An italic child sets the gender phrase applied to the
    `<bdi>` children that follow it; each `<bdi>` child with non-empty text
    is appended to `page_data[-1].forms`.
    """
    gender_by_phrase = {
        "un homme": "masculine",
        "une femme": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)

    gender_phrase = ""
    for element in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if element.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, element).strip("() ")
            # drop the leading "pour " and anything after the last comma
            without_prefix = italic_text.removeprefix("pour ")
            gender_phrase = without_prefix.rsplit(",", 1)[0]
            continue
        if not (isinstance(element, HTMLNode) and element.tag == "bdi"):
            continue

        equivalent = Form(
            form=clean_node(wxr, None, element),
            source="form line template 'équiv-pour'",
        )
        if gender_phrase != "":
            known_tag = gender_by_phrase.get(gender_phrase)
            if known_tag is None:
                # unknown phrase: keep it untranslated
                equivalent.raw_tags.append(gender_phrase)
            else:
                equivalent.tags.append(known_tag)
        if equivalent.form != "":
            page_data[-1].forms.append(equivalent)

131

132

def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Collect sounds from a Chinese form line template.

    Chinese form line template: zh-mot, zh-mot-s, zh-mot-t
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    The template is parsed with `pre_expand=True` and only its own name in
    `additional_expand`; the direct child templates of the result are then
    inspected.  A "lang" template (case-insensitive) is stored as a `Sound`
    with `zh_pron` and the "Pinyin" tag; a "pron" or "prononciation"
    template is stored as a `Sound` with `ipa`.  Sounds are appended to
    `page_data[-1].sounds`.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        is_pinyin = inner_name.lower() == "lang"
        if not is_pinyin and inner_name not in ("pron", "prononciation"):
            continue

        sounds = page_data[-1].sounds
        text = clean_node(wxr, None, inner_template)
        if is_pinyin:
            sounds.append(Sound(zh_pron=text, tags=["Pinyin"]))
        else:
            sounds.append(Sound(ipa=text))

157

158

def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the romanization shown by a "ja-mot" template to the last entry.

    Japanese form line template: https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The template is fully expanded and its direct `<span>` children are
    scanned.  The text of the span reported at index 1 is appended to
    `page_data[-1].forms` with the "romanization" tag, unless a form with
    the same text is already present.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_forms = set()
    for known in page_data[-1].forms:
        known_forms.add(known.form)

    for position, span in expanded.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if position != 1:
            continue
        romanization = clean_node(wxr, None, span)
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        if romanization not in known_forms:
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break

182

183

def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a conjugation template found on the form line.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    The template is fully expanded.  For each direct link child whose target
    starts with "Conjugaison:" (and whose sub-page name is not one of the
    three verb group titles), conjugation forms are attached to the last
    entry of `page_data`: they are either taken over from the previous entry
    when that entry has the same language and POS and its last form came
    from the same conjugation page, or extracted with `extract_conjugation`.

    Afterwards the cleaned text of the expanded template, minus the trailing
    link label, is appended to `page_data[-1].raw_tags` when non-empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for link in expanded_node.find_child(NodeKind.LINK):
        # skip links without a usable plain-text target instead of failing
        # on an empty argument or on a node used as target
        if len(link.largs) == 0 or len(link.largs[0]) == 0:
            continue
        conj_title = link.largs[0][0]
        if not isinstance(conj_title, str):
            continue
        if not conj_title.startswith("Conjugaison:"):
            continue
        conj_word = conj_title.split("/", 1)[-1]
        if conj_word in (
            "Premier groupe",
            "Deuxième groupe",
            "Troisième groupe",
        ):
            continue
        if (
            len(page_data) > 1
            and page_data[-2].lang_code == page_data[-1].lang_code
            and page_data[-2].pos == page_data[-1].pos
            and len(page_data[-2].forms) > 0
            and page_data[-2].forms[-1].source == conj_title
        ):
            # Reuse the forms already extracted for the previous entry, but
            # through a new list: sharing the same list object would make
            # forms appended to this entry later (e.g. by another form line
            # template) appear in the previous entry as well.  The `Form`
            # objects themselves are still shared.
            page_data[-1].forms = list(page_data[-2].forms)
        else:
            extract_conjugation(wxr, page_data[-1], conj_title)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    # remove the label of the conjugation link from the end of the text
    if template_node.template_name in ("conj", "conjugaison"):
        tag = tag.removesuffix("(voir la conjugaison)").strip()
    elif template_node.template_name.startswith("ja-"):
        tag = (
            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
        )
    if len(tag) > 0:
        page_data[-1].raw_tags.append(tag)

226

227

def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the forms shown by a "lien pronominal" template to the last entry.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    The template is fully expanded; the text of every `<bdi>` element found
    recursively in the expansion is appended to `page_data[-1].forms` with
    the "pronominal" tag, empty texts excepted.  The expansion is finally
    passed to `clean_node` together with the entry, and the returned text is
    discarded.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi_node in expanded.find_html_recursively("bdi"):
        pronominal_form = Form(
            form=clean_node(wxr, None, bdi_node), tags=["pronominal"]
        )
        if pronominal_form.form == "":
            continue
        page_data[-1].forms.append(pronominal_form)
    clean_node(wxr, page_data[-1], expanded)

Coverage for src/wiktextract/extractor/fr/form_line.py: 63%

103 statements