Coverage for src/wiktextract/extractor/fr/form_line.py: 71%

129 statements  

« prev     ^ index     » next       coverage.py v7.10.3, created at 2025-08-15 05:18 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .conjugation import extract_conjugation 

6from .models import Form, Sound, WordEntry 

7from .pronunciation import ( 

8 ASPIRATED_H_TEMPLATES, 

9 PRON_TEMPLATES, 

10 process_pron_template, 

11) 

12from .tags import translate_raw_tags 

13 

14 

def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """Extract data from a form line ("ligne de forme").

    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    The form line is the wikitext located between the POS subtitle and the
    first gloss; it contains IPA, gender and inflection forms. Everything
    that is extracted is stored on the last entry of ``page_data``, and
    ``translate_raw_tags`` is called on that entry once all nodes have been
    visited.
    """
    ignored_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?", "réf"]
    )
    conj_names = ("conj", "conjugaison")
    ja_conj_prefixes = ("ja-adj-", "ja-verbe")

    previous_template = ""
    bold_pending = True  # only the first bold node is processed
    for position, item in enumerate(nodes):
        if not isinstance(item, TemplateNode):
            # Plain strings and any other object are skipped.
            if not isinstance(item, WikiNode):
                continue
            kind = item.kind
            if kind == NodeKind.ITALIC:
                italic_text = clean_node(wxr, None, item)
                # an italic "ou" is not kept as a raw tag
                if italic_text != "ou":
                    page_data[-1].raw_tags.append(italic_text)
            elif kind == NodeKind.LINK:
                process_conj_link_node(wxr, item, page_data)
            elif kind == NodeKind.BOLD and bold_pending:
                process_form_line_bold_node(wxr, item, page_data[-1])
                bold_pending = False
            continue

        # The order of the checks below matters: the first match wins.
        name = item.template_name
        if name in ignored_templates:
            continue
        elif name in PRON_TEMPLATES:
            entry = page_data[-1]
            new_sounds = process_pron_template(
                wxr,
                item,
                [],
                entry.lang_code,
                # the node right before the template (empty at position 0)
                nodes[position - 1 : position],
            )
            entry.sounds.extend(new_sounds)
        elif name == "équiv-pour":
            process_equiv_pour_template(wxr, item, page_data)
        elif name.startswith("zh-mot"):
            process_zh_mot_template(wxr, item, page_data)
        elif name == "ja-mot":
            process_ja_mot_template(wxr, item, page_data)
        elif name in conj_names or name.startswith(ja_conj_prefixes):
            process_conj_template(wxr, item, page_data)
        elif name in ASPIRATED_H_TEMPLATES:
            # skipped without updating ``previous_template``
            continue
        elif name == "note":
            # everything after the template is the note; stop processing
            note_text = clean_node(wxr, page_data[-1], nodes[position + 1 :])
            if note_text != "":
                page_data[-1].notes.append(note_text)
            break
        elif name == "lien pronominal":
            process_lien_pronominal(wxr, item, page_data)
        else:
            tag_text = clean_node(wxr, page_data[-1], item)
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(item), expand_all=True
            )
            region_spans = list(
                expanded.find_html(
                    "span", attr_name="id", attr_value="région"
                )
            )
            bare_tag = tag_text.strip("()")
            if (
                len(region_spans) == 1
                and previous_template in PRON_TEMPLATES
                and len(page_data[-1].sounds) > 0
            ):
                # it's the location of the previous IPA template
                # https://fr.wiktionary.org/wiki/Modèle:région
                page_data[-1].sounds[-1].raw_tags.append(bare_tag)
            elif len(bare_tag) > 0:
                # parentheses are removed only when they wrap the whole tag
                if tag_text.startswith("(") and tag_text.endswith(")"):
                    page_data[-1].raw_tags.append(bare_tag)
                else:
                    page_data[-1].raw_tags.append(tag_text)

        previous_template = name

    translate_raw_tags(page_data[-1])

110 

111 

def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """Extract equivalent forms from an "équiv-pour" template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is expanded; the text of each italic child is remembered
    as the gender label and applied to the ``<bdi>`` forms that follow it.
    Every non-empty form is appended to the last entry of ``page_data``
    (when there is one) and all of them are returned.
    """
    label_to_tag = {
        "un homme": "masculine",
        "une femme": "feminine",
        "des femmes": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(wxr.wtp.node_to_wikitext(node), expand_all=True)
    gender_label = ""
    collected = []
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("() ")
            # drop the leading "pour " and whatever follows the last comma
            without_prefix = italic_text.removeprefix("pour ")
            gender_label = without_prefix.rsplit(",", 1)[0]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        form_data = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if gender_label:
            known_tag = label_to_tag.get(gender_label)
            if known_tag is None:
                # unknown label: keep it untranslated
                form_data.raw_tags.append(gender_label)
            else:
                form_data.tags.append(known_tag)
        if len(form_data.form) == 0:
            continue
        if len(page_data) > 0:
            page_data[-1].forms.append(form_data)
        collected.append(form_data)
    return collected

152 

153 

def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    Only the template itself is expanded, then the templates found among
    the direct children of the result are turned into ``Sound`` objects
    appended to the last entry of ``page_data``.
    """
    outer_name = node.template_name
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={outer_name},
    )
    for inner in expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner))
        else:
            # any other inner template is not used
            continue
        page_data[-1].sounds.append(sound)

178 

179 

def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the romanization from a Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The template is fully expanded and the ``<span>`` reported with index 1
    by ``find_html(..., with_index=True)`` is stored as a "romanization"
    form on the last entry of ``page_data``, unless that entry already has
    a form with the same text.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    known_forms = set()
    for known in page_data[-1].forms:
        known_forms.add(known.form)

    for index, span in expanded.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if index != 1:
            continue
        romanization = clean_node(wxr, None, span)
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        if romanization not in known_forms:
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break

203 

204 

def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a conjugation template found on the form line.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Each link among the direct children of the expanded template is passed
    to ``process_conj_link_node``. The cleaned text of the expansion, with
    the trailing link label removed, is then stored as a raw tag on the
    last entry of ``page_data`` when it is not empty.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for link_node in expanded.find_child(NodeKind.LINK):
        process_conj_link_node(wxr, link_node, page_data)

    tag = clean_node(wxr, page_data[-1], expanded)
    name = template_node.template_name
    if name in ("conj", "conjugaison"):
        link_labels = ("(voir la conjugaison)",)
    elif name.startswith("ja-"):
        link_labels = ("(conjugaison)", "(flexions)")
    else:
        link_labels = None

    if link_labels is not None:
        # labels are removed in order, then the remaining text is trimmed
        for label in link_labels:
            tag = tag.removesuffix(label)
        tag = tag.strip()
    if tag:
        page_data[-1].raw_tags.append(tag)

226 

227 

def is_conj_link(wxr: WiktextractContext, link: WikiNode) -> bool:
    """Return True when the link target starts with "Conjugaison:".

    The target is the cleaned first element of the first link argument;
    a link without such an element is never a conjugation link.
    """
    link_args = link.largs
    if link_args and link_args[0]:
        target = clean_node(wxr, None, link_args[0][0])
        return target.startswith("Conjugaison:")
    return False

233 

234 

def process_conj_link_node(
    wxr: WiktextractContext,
    link: WikiNode,
    page_data: list[WordEntry],
) -> None:
    """Extract conjugation forms for a link to a "Conjugaison:" page.

    Links that ``is_conj_link`` rejects, and links whose title ends with one
    of the verb group names, are ignored. When the previous entry has the
    same language code and POS and its last form was taken from the same
    conjugation page, its forms are reused instead of extracting the page
    again; otherwise ``extract_conjugation`` is called for the last entry.

    :param wxr: extraction context.
    :param link: link node found on the form line or in an expanded
        conjugation template.
    :param page_data: entries extracted so far; the last one is updated.
    """
    if not is_conj_link(wxr, link):
        return
    conj_title = link.largs[0][0]
    if not isinstance(conj_title, str):
        # is_conj_link() tests the *cleaned* first element, so a non-string
        # node can get here; it has no ``split`` method and can't be used as
        # a page title, skip it instead of raising AttributeError.
        return
    conj_word = conj_title.split("/", 1)[-1]
    if conj_word in (
        "Premier groupe",
        "Deuxième groupe",
        "Troisième groupe",
    ):
        return

    current_entry = page_data[-1]
    previous_entry = page_data[-2] if len(page_data) > 1 else None
    if (
        previous_entry is not None
        and previous_entry.lang_code == current_entry.lang_code
        and previous_entry.pos == current_entry.pos
        and len(previous_entry.forms) > 0
        and previous_entry.forms[-1].source == conj_title
    ):
        # Copy the list: sharing the same list object would make forms
        # appended to this entry later also show up in the previous entry.
        current_entry.forms = list(previous_entry.forms)
    else:
        extract_conjugation(wxr, current_entry, conj_title)

260 

261 

def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract pronominal forms from a "lien pronominal" template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    Every non-empty ``<bdi>`` text found anywhere in the expanded template
    is appended to the last entry of ``page_data`` as a form tagged
    "pronominal".
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)
    for bdi in expanded.find_html_recursively("bdi"):
        pronominal = Form(form=clean_node(wxr, None, bdi), tags=["pronominal"])
        if pronominal.form == "":
            continue
        page_data[-1].forms.append(pronominal)
    # The returned text is discarded; presumably this call is made so that
    # clean_node can record data on the entry - confirm in clean_node.
    clean_node(wxr, page_data[-1], expanded)

276 

277 

def process_form_line_bold_node(
    wxr: WiktextractContext, bold_node: WikiNode, word_entry: WordEntry
):
    """Use the bold text of the form line as the word or canonical form.

    On "Titres non pris en charge/" (unsupported titles) pages the bold text
    replaces the entry word and the page title is kept in
    ``original_title``:
    https://fr.wiktionary.org/wiki/Annexe:Titres_non_pris_en_charge
    https://fr.wiktionary.org/wiki/Spécial:Index/Titres_non_pris_en_charge

    On any other page a non-empty bold text that differs from the page
    title is added as a "canonical" form.
    """
    text = clean_node(wxr, None, bold_node)
    page_title = wxr.wtp.title
    if page_title.startswith("Titres non pris en charge/"):
        word_entry.word = text
        word_entry.original_title = page_title
        return
    if text != page_title and text != "":
        word_entry.forms.append(Form(form=text, tags=["canonical"]))