Coverage for src/wiktextract/extractor/fr/form_line.py: 68%

118 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-04 10:58 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .conjugation import extract_conjugation 

6from .models import Form, Sound, WordEntry 

7from .pronunciation import ( 

8 ASPIRATED_H_TEMPLATES, 

9 PRON_TEMPLATES, 

10 process_pron_template, 

11) 

12from .tags import translate_raw_tags 

13 

14 

def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """
    Form line ("ligne de forme")
    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    The wikitext line found between the POS subtitle and the first gloss;
    it carries IPA, gender and inflection forms. Results are written to the
    last entry of `page_data`, whose raw tags are translated at the end.
    """
    skipped_templates = frozenset(
        ("voir-conj", "genre ?", "nombre ?", "pluriel ?", "réf")
    )
    conj_template_names = ("conj", "conjugaison")
    ja_conj_prefixes = ("ja-adj-", "ja-verbe")

    previous_name = ""
    for position, item in enumerate(nodes):
        if isinstance(item, TemplateNode):
            name = item.template_name
            if name in skipped_templates:
                continue
            if name in PRON_TEMPLATES:
                # the node right before the template is handed over too
                new_sounds = process_pron_template(
                    wxr,
                    item,
                    [],
                    page_data[-1].lang_code,
                    nodes[position - 1 : position],
                )
                page_data[-1].sounds.extend(new_sounds)
            elif name == "équiv-pour":
                process_equiv_pour_template(wxr, item, page_data)
            elif name.startswith("zh-mot"):
                process_zh_mot_template(wxr, item, page_data)
            elif name == "ja-mot":
                process_ja_mot_template(wxr, item, page_data)
            elif name in conj_template_names or name.startswith(
                ja_conj_prefixes
            ):
                process_conj_template(wxr, item, page_data)
            elif name in ASPIRATED_H_TEMPLATES:
                continue
            elif name == "lien pronominal":
                process_lien_pronominal(wxr, item, page_data)
            elif name == "note":
                # everything after the "note" template is the note text,
                # so the rest of the line is consumed here
                trailing_note = clean_node(
                    wxr, page_data[-1], nodes[position + 1 :]
                )
                if trailing_note != "":
                    page_data[-1].notes.append(trailing_note)
                break
            else:
                _add_template_raw_tag(wxr, item, page_data, previous_name)

            # not reached for the skipped/aspirated-h templates (`continue`)
            previous_name = name
        elif isinstance(item, WikiNode) and item.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, item)
            if italic_text != "ou":
                page_data[-1].raw_tags.append(italic_text)
        elif isinstance(item, WikiNode) and item.kind == NodeKind.LINK:
            process_conj_link_node(wxr, item, page_data)

    translate_raw_tags(page_data[-1])


def _add_template_raw_tag(
    wxr: WiktextractContext,
    template: TemplateNode,
    page_data: list[WordEntry],
    previous_name: str,
) -> None:
    # Store the cleaned text of a form line template that has no dedicated
    # handler as a raw tag, either on the last sound or on the entry itself.
    label = clean_node(wxr, page_data[-1], template)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template), expand_all=True
    )
    region_spans = list(
        expanded.find_html("span", attr_name="id", attr_value="région")
    )
    bare_label = label.strip("()")
    describes_previous_ipa = (
        len(region_spans) == 1
        and previous_name in PRON_TEMPLATES
        and len(page_data[-1].sounds) > 0
    )
    if describes_previous_ipa:
        # it's the location of the previous IPA template
        # https://fr.wiktionary.org/wiki/Modèle:région
        page_data[-1].sounds[-1].raw_tags.append(bare_label)
        return
    if len(bare_label) == 0:
        return
    # parentheses are only dropped when they wrap the whole text
    if label.startswith("(") and label.endswith(")"):
        label = bare_label
    page_data[-1].raw_tags.append(label)

102 

103 

def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """Extract equivalent forms from an expanded "équiv-pour" template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    Each non-empty form is appended to the last entry of `page_data` (when
    the list is not empty) and is also part of the returned list.
    """
    tag_by_gender_phrase = {
        "un homme": "masculine",
        "une femme": "feminine",
        "des femmes": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    gender_phrase = ""
    collected = []
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            # italic text names who the following form(s) are for; it stays
            # in effect until the next italic node
            italic_text = clean_node(wxr, None, child).strip("() ")
            gender_phrase = italic_text.removeprefix("pour ").rsplit(",", 1)[
                0
            ]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        equiv_form = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if len(gender_phrase) > 0:
            known_tag = tag_by_gender_phrase.get(gender_phrase)
            if known_tag is not None:
                equiv_form.tags.append(known_tag)
            else:
                # unknown phrase: keep it verbatim as a raw tag
                equiv_form.raw_tags.append(gender_phrase)
        if len(equiv_form.form) == 0:
            continue
        if len(page_data) > 0:
            page_data[-1].forms.append(equiv_form)
        collected.append(equiv_form)
    return collected

144 

145 

def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    Only the template itself is expanded, so the templates it emits can be
    inspected by name; each match adds a `Sound` to the last entry.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    root = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner in root.find_child(NodeKind.TEMPLATE):
        inner_name = inner.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner))
        else:
            continue
        page_data[-1].sounds.append(sound)

170 

171 

def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract the romanization from a Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The text of the span at index 1 of the expanded template is added to
    the last entry as a form tagged "romanization", unless a form with the
    same text is already present.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_forms = {known.form for known in page_data[-1].forms}
    for span_index, span in expanded.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if span_index != 1:
            continue
        romanization = clean_node(wxr, None, span)
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        if romanization not in known_forms:
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break

195 

196 

def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a conjugation template found on the form line.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Links in the expanded template are passed to `process_conj_link_node`;
    the remaining text, minus the trailing link label, is stored as a raw
    tag on the last entry when it is not empty.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for conj_link in expanded.find_child(NodeKind.LINK):
        process_conj_link_node(wxr, conj_link, page_data)

    name = template_node.template_name
    if name in ("conj", "conjugaison"):
        trailing_labels = ("(voir la conjugaison)",)
    elif name.startswith("ja-"):
        trailing_labels = ("(conjugaison)", "(flexions)")
    else:
        trailing_labels = ()

    text = clean_node(wxr, page_data[-1], expanded)
    if len(trailing_labels) > 0:
        # labels are removed in order, whitespace is trimmed once afterwards
        for label in trailing_labels:
            text = text.removesuffix(label)
        text = text.strip()
    if len(text) > 0:
        page_data[-1].raw_tags.append(text)

218 

219 

def is_conj_link(wxr: WiktextractContext, link: WikiNode) -> bool:
    """Return whether the link's first argument starts with "Conjugaison:".

    A link without arguments, or with an empty first argument, is not a
    conjugation link.
    """
    has_target = len(link.largs) > 0 and len(link.largs[0]) > 0
    if not has_target:
        return False
    target_text = clean_node(wxr, None, link.largs[0][0])
    return target_text.startswith("Conjugaison:")

225 

226 

def process_conj_link_node(
    wxr: WiktextractContext,
    link: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Attach conjugation forms to the last entry for a conjugation link.

    Links that are not conjugation links, and links to the verb group
    pages, are ignored. When the previous entry has the same language and
    POS and its last form came from the same conjugation page, its forms
    are reused; otherwise `extract_conjugation` is called.
    """
    if not is_conj_link(wxr, link):
        return
    conj_title = link.largs[0][0]
    group_pages = (
        "Premier groupe",
        "Deuxième groupe",
        "Troisième groupe",
    )
    # the part after the first "/" (or the whole title when there is none)
    if conj_title.split("/", 1)[-1] in group_pages:
        return

    current = page_data[-1]
    previous = page_data[-2] if len(page_data) > 1 else None
    same_conjugation_page = (
        previous is not None
        and previous.lang_code == current.lang_code
        and previous.pos == current.pos
        and len(previous.forms) > 0
        and previous.forms[-1].source == conj_title
    )
    if same_conjugation_page:
        current.forms = previous.forms
        return
    extract_conjugation(wxr, current, conj_title)

252 

253 

def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract pronominal forms from a "lien pronominal" template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    The text of every `bdi` tag found in the expanded template is added to
    the last entry as a form tagged "pronominal".
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi in expanded.find_html_recursively("bdi"):
        pronominal = Form(form=clean_node(wxr, None, bdi), tags=["pronominal"])
        if pronominal.form == "":
            continue
        page_data[-1].forms.append(pronominal)
    # the returned text is unused; presumably run so that clean_node records
    # data from the expansion on the entry - TODO confirm
    clean_node(wxr, page_data[-1], expanded)