Coverage for src/wiktextract/extractor/fr/form_line.py: 65%

111 statements  

« prev     ^ index     » next       coverage.py v7.6.10, created at 2024-12-27 08:07 +0000

1from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

2 

3from ...page import clean_node 

4from ...wxr_context import WiktextractContext 

5from .conjugation import extract_conjugation 

6from .models import Form, Sound, WordEntry 

7from .pronunciation import ( 

8 ASPIRATED_H_TEMPLATES, 

9 PRON_TEMPLATES, 

10 process_pron_template, 

11) 

12from .tags import translate_raw_tags 

13 

14 

def _add_template_raw_tag(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template: TemplateNode,
    previous_template_name: str,
) -> None:
    """Store the cleaned text of an otherwise unhandled form line template.

    The text is attached to the last sound of ``page_data[-1]`` when the
    expanded template contains exactly one ``<span id="région">`` and the
    previously processed template was a pronunciation template; otherwise
    it is appended to the raw tags of ``page_data[-1]``.
    """
    tag_text = clean_node(wxr, page_data[-1], template)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template), expand_all=True
    )
    region_span_count = sum(
        1
        for _ in expanded.find_html(
            "span", attr_name="id", attr_value="région"
        )
    )
    bare_text = tag_text.strip("()")
    entry = page_data[-1]
    if (
        region_span_count == 1
        and previous_template_name in PRON_TEMPLATES
        and len(entry.sounds) > 0
    ):
        # it's the location of the previous IPA template
        # https://fr.wiktionary.org/wiki/Modèle:région
        entry.sounds[-1].raw_tags.append(bare_text)
    elif bare_text != "":
        # drop the parentheses only when they wrap the whole text
        wrapped = tag_text.startswith("(") and tag_text.endswith(")")
        entry.raw_tags.append(bare_text if wrapped else tag_text)


def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[WikiNode | str],
) -> None:
    """
    Form line
    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    A line of wikitext between the POS subtitle and the first gloss; it
    contains IPA, gender and inflection forms.

    Template nodes are dispatched on their template name and italic nodes
    are stored as raw tags (except the text "ou"). Everything extracted is
    written to ``page_data[-1]``, and ``translate_raw_tags`` is called on
    that entry once all nodes have been visited.
    """
    skipped_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?"]
    )
    conj_template_names = ("conj", "conjugaison")

    previous_template_name = ""
    for position, item in enumerate(nodes):
        if not isinstance(item, TemplateNode):
            if isinstance(item, WikiNode) and item.kind == NodeKind.ITALIC:
                italic_text = clean_node(wxr, None, item)
                if italic_text != "ou":
                    page_data[-1].raw_tags.append(italic_text)
            continue

        name = item.template_name
        if name in skipped_templates:
            continue
        elif name in PRON_TEMPLATES:
            # hand over the node right before the template (an empty list
            # when the template is the first node)
            preceding = nodes[position - 1 : position]
            page_data[-1].sounds.extend(
                process_pron_template(wxr, item, [], preceding)
            )
        elif name == "équiv-pour":
            process_equiv_pour_template(wxr, item, page_data)
        elif name.startswith("zh-mot"):
            process_zh_mot_template(wxr, item, page_data)
        elif name == "ja-mot":
            process_ja_mot_template(wxr, item, page_data)
        elif name in conj_template_names or name.startswith(
            ("ja-adj-", "ja-verbe")
        ):
            process_conj_template(wxr, item, page_data)
        elif name in ASPIRATED_H_TEMPLATES:
            continue
        elif name == "note":
            # everything after the "note" template is the note text, so
            # the remaining nodes are not visited one by one
            trailing_text = clean_node(
                wxr, page_data[-1], nodes[position + 1 :]
            )
            if trailing_text != "":
                page_data[-1].notes.append(trailing_text)
            break
        else:
            _add_template_raw_tag(
                wxr, page_data, item, previous_template_name
            )

        # not reached for the templates skipped with `continue` above
        previous_template_name = name

    translate_raw_tags(page_data[-1])

96 

97 

def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> list[Form]:
    """Extract equivalent forms from an "équiv-pour" template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is expanded and its direct italic and HTML children are
    scanned in order: an italic child sets the gender text applied to the
    ``<bdi>`` children that follow it, and each non-empty ``<bdi>`` child
    becomes a ``Form``.

    Returns the list of created forms; each of them is also appended to
    ``page_data[-1].forms`` when ``page_data`` is not empty.
    """
    known_genders = {
        "un homme": "masculine",
        "une femme": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node), expand_all=True
    )
    collected: list[Form] = []
    gender_text = ""
    for child in expanded.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("() ")
            # remove the leading "pour " and anything after the last comma
            gender_text = italic_text.removeprefix("pour ").rsplit(",", 1)[0]
            continue
        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        new_form = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if gender_text != "":
            gender_tag = known_genders.get(gender_text)
            if gender_tag is not None:
                new_form.tags.append(gender_tag)
            else:
                # unknown wording is kept as a raw tag
                new_form.raw_tags.append(gender_text)
        if new_form.form == "":
            continue
        if len(page_data) > 0:
            page_data[-1].forms.append(new_form)
        collected.append(new_form)
    return collected

137 

138 

def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    Only the form line template itself is expanded, so the templates it
    emits are still template nodes. A "lang" template (name compared
    case-insensitively) is stored as a ``Sound`` with ``zh_pron`` and the
    "Pinyin" tag; a "pron" or "prononciation" template is stored as a
    ``Sound`` with ``ipa``. Sounds are appended to ``page_data[-1].sounds``.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(node),
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in expanded.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner_template),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner_template))
        else:
            continue
        page_data[-1].sounds.append(sound)

163 

164 

def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract the romanization from a Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The text of the ``<span>`` found at index 1 of the expanded template is
    appended to ``page_data[-1].forms`` with the "romanization" tag, unless
    a form with the same text is already present on that entry.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    known_forms = {known.form for known in page_data[-1].forms}
    for span_index, span in expanded.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if span_index != 1:
            continue
        romanization = clean_node(wxr, None, span)
        if romanization not in known_forms:
            # avoid adding duplicated form data extracted from
            # inflection table before the form line
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break

188 

189 

def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract conjugation forms and the group tag from a conjugation template.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    Every link of the expanded template whose target starts with
    "Conjugaison:" (other than the three verb group pages) names a
    conjugation page. Its forms are either reused from the previous entry,
    when that entry has the same language and POS and its last form came
    from the same page, or extracted with ``extract_conjugation``.

    The remaining text of the template, minus the trailing link label, is
    appended to ``page_data[-1].raw_tags`` when it is not empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for link in expanded_node.find_child(NodeKind.LINK):
        # skip links without a usable plain-text target instead of raising
        if len(link.largs) == 0 or len(link.largs[0]) == 0:
            continue
        conj_title = link.largs[0][0]
        if not isinstance(conj_title, str):
            continue
        if not conj_title.startswith("Conjugaison:"):
            continue
        conj_word = conj_title.split("/", 1)[-1]
        if conj_word in (
            "Premier groupe",
            "Deuxième groupe",
            "Troisième groupe",
        ):
            continue
        if (
            len(page_data) > 1
            and page_data[-2].lang_code == page_data[-1].lang_code
            and page_data[-2].pos == page_data[-1].pos
            and len(page_data[-2].forms) > 0
            and page_data[-2].forms[-1].source == conj_title
        ):
            # Copy the list: assigning the same list object to both entries
            # would make forms appended to this entry later on (for example
            # by another form line template) show up in the previous entry.
            page_data[-1].forms = list(page_data[-2].forms)
        else:
            extract_conjugation(wxr, page_data[-1], conj_title)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    if template_node.template_name in ("conj", "conjugaison"):
        tag = tag.removesuffix("(voir la conjugaison)").strip()
    elif template_node.template_name.startswith("ja-"):
        tag = (
            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
        )
    if len(tag) > 0:
        page_data[-1].raw_tags.append(tag)

232 

233 

def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Extract pronominal forms from a "lien pronominal" template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    The non-empty text of each ``<bdi>`` tag found anywhere in the expanded
    template is appended to ``page_data[-1].forms`` with the "pronominal"
    tag.
    """
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for bdi in expanded.find_html_recursively("bdi"):
        pronominal_form = Form(
            form=clean_node(wxr, None, bdi), tags=["pronominal"]
        )
        if pronominal_form.form == "":
            continue
        page_data[-1].forms.append(pronominal_form)
    # The returned text is discarded; the call passes the entry itself,
    # presumably so clean_node can record data such as categories on it
    # -- TODO confirm against clean_node.
    clean_node(wxr, page_data[-1], expanded)