Coverage for src/wiktextract/extractor/fr/form_line.py: 63%

103 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

1from typing import Union 

2 

3from wikitextprocessor.parser import HTMLNode, NodeKind, TemplateNode, WikiNode 

4 

5from ...page import clean_node 

6from ...wxr_context import WiktextractContext 

7from .conjugation import extract_conjugation 

8from .models import Form, Sound, WordEntry 

9from .pronunciation import ( 

10 ASPIRATED_H_TEMPLATES, 

11 PRON_TEMPLATES, 

12 process_pron_template, 

13) 

14from .tags import translate_raw_tags 

15 

16 

def extract_form_line(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    nodes: list[Union[WikiNode, str]],
) -> None:
    """Extract sounds, tags and forms from a form line ("ligne de forme").

    https://fr.wiktionary.org/wiki/Wiktionnaire:Structure_des_pages#Syntaxe

    The form line is the wikitext between the POS subtitle and the first
    gloss; it contains IPA, gender and inflection forms. Everything that is
    extracted is stored on the last entry of ``page_data``, and
    ``translate_raw_tags`` is run on that entry once all nodes are handled.
    """
    skipped_templates = frozenset(
        ["voir-conj", "genre ?", "nombre ?", "pluriel ?"]
    )
    conj_names = ("conj", "conjugaison")
    ja_conj_prefixes = ("ja-adj-", "ja-verbe")

    # name of the last template that went through a handler below
    previous_template = ""
    for position, node in enumerate(nodes):
        if not isinstance(node, WikiNode):
            continue

        if node.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, node)
            # "ou" (French for "or") is skipped rather than stored as a tag
            if italic_text != "ou":
                page_data[-1].raw_tags.append(italic_text)
            continue

        if node.kind != NodeKind.TEMPLATE:
            continue

        name = node.template_name
        if name in skipped_templates:
            continue

        if name in PRON_TEMPLATES:
            # the slice holds the node right before the template, or is
            # empty when the template is the first node
            preceding = nodes[position - 1 : position]
            page_data[-1].sounds.extend(
                process_pron_template(wxr, node, [], preceding)
            )
        elif name == "équiv-pour":
            process_equiv_pour_template(wxr, node, page_data)
        elif name.startswith("zh-mot"):
            process_zh_mot_template(wxr, node, page_data)
        elif name == "ja-mot":
            process_ja_mot_template(wxr, node, page_data)
        elif name in conj_names or name.startswith(ja_conj_prefixes):
            process_conj_template(wxr, node, page_data)
        elif name in ASPIRATED_H_TEMPLATES:
            # skipped without updating `previous_template`
            continue
        elif name == "lien pronominal":
            process_lien_pronominal(wxr, node, page_data)
        else:
            label = clean_node(wxr, page_data[-1], node)
            expanded = wxr.wtp.parse(
                wxr.wtp.node_to_wikitext(node), expand_all=True
            )
            region_spans = list(
                expanded.find_html(
                    "span", attr_name="id", attr_value="région"
                )
            )
            describes_previous_ipa = (
                len(region_spans) == 1
                and previous_template in PRON_TEMPLATES
                and len(page_data[-1].sounds) > 0
            )
            if describes_previous_ipa:
                # it's the location of the previous IPA template
                # https://fr.wiktionary.org/wiki/Modèle:région
                page_data[-1].sounds[-1].raw_tags.append(label.strip("()"))
            elif len(label.strip("()")) > 0:
                # drop the parentheses only when they wrap the whole text
                if label.startswith("(") and label.endswith(")"):
                    label = label.strip("()")
                page_data[-1].raw_tags.append(label)

        previous_template = name

    translate_raw_tags(page_data[-1])

93 

94 

def process_equiv_pour_template(
    wxr: WiktextractContext, node: TemplateNode, page_data: list[WordEntry]
) -> None:
    """Add the equivalent forms listed by an "équiv-pour" template.

    https://fr.wiktionary.org/wiki/Modèle:équiv-pour

    The template is fully expanded and its direct italic and HTML children
    are walked in order. An italic child updates the current gender phrase;
    each following ``<bdi>`` child becomes a ``Form`` appended to the last
    entry of ``page_data``, tagged with that phrase.
    """
    # French phrases found in the italic text mapped to gender tags
    gender_by_phrase = {
        "un homme": "masculine",
        "une femme": "feminine",
        "le mâle": "masculine",
        "la femelle": "feminine",
        "un garçon": "masculine",
        "une fille": "feminine",
        "une personne non-binaire": "neuter",
    }

    wikitext = wxr.wtp.node_to_wikitext(node)
    root = wxr.wtp.parse(wikitext, expand_all=True)

    gender_phrase = ""
    for child in root.find_child(NodeKind.ITALIC | NodeKind.HTML):
        if child.kind == NodeKind.ITALIC:
            italic_text = clean_node(wxr, None, child).strip("() ")
            # remove the leading "pour " and whatever follows the last comma
            without_prefix = italic_text.removeprefix("pour ")
            gender_phrase = without_prefix.rsplit(",", 1)[0]
            continue

        if not (isinstance(child, HTMLNode) and child.tag == "bdi"):
            continue

        form_data = Form(
            form=clean_node(wxr, None, child),
            source="form line template 'équiv-pour'",
        )
        if gender_phrase != "":
            gender_tag = gender_by_phrase.get(gender_phrase)
            if gender_tag is None:
                # unknown phrase: keep it as a raw tag
                form_data.raw_tags.append(gender_phrase)
            else:
                form_data.tags.append(gender_tag)
        if form_data.form:
            page_data[-1].forms.append(form_data)

131 

132 

def process_zh_mot_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Collect sounds from a Chinese form line template.

    Handles zh-mot, zh-mot-s and zh-mot-t:
    https://fr.wiktionary.org/wiki/Modèle:zh-mot

    The template itself is expanded (``pre_expand`` with its own name as
    the only additional expansion), then the templates left among the
    direct children are inspected: "lang" templates yield a ``Sound`` with
    ``zh_pron`` and the "Pinyin" tag, "pron"/"prononciation" templates
    yield a ``Sound`` with ``ipa``. Sounds go to the last entry of
    ``page_data``.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    root = wxr.wtp.parse(
        wikitext,
        pre_expand=True,
        additional_expand={node.template_name},
    )
    for inner_template in root.find_child(NodeKind.TEMPLATE):
        inner_name = inner_template.template_name
        if inner_name.lower() == "lang":
            sound = Sound(
                zh_pron=clean_node(wxr, None, inner_template),
                tags=["Pinyin"],
            )
        elif inner_name in ("pron", "prononciation"):
            sound = Sound(ipa=clean_node(wxr, None, inner_template))
        else:
            continue
        page_data[-1].sounds.append(sound)

157 

158 

def process_ja_mot_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the romanization given by a Japanese form line template.

    https://fr.wiktionary.org/wiki/Modèle:ja-mot

    The template is fully expanded and the ``<span>`` reported at index 1
    is read as the romanized form. It is appended to the last entry of
    ``page_data`` with the "romanization" tag unless a form with the same
    text is already there.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    known_forms = {form.form for form in page_data[-1].forms}

    for index, span in root.find_html("span", with_index=True):
        # the first span tag is the word, the second is Hepburn romanization
        if index != 1:
            continue
        romanization = clean_node(wxr, None, span)
        # avoid adding duplicated form data extracted from
        # inflection table before the form line
        if romanization not in known_forms:
            page_data[-1].forms.append(
                Form(form=romanization, tags=["romanization"])
            )
        break

182 

183 

def process_conj_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Handle a conjugation template found on the form line.

    https://fr.wiktionary.org/wiki/Modèle:conjugaison

    The template is fully expanded. For every direct link child whose
    target starts with ``Conjugaison:`` (verb group pages excepted), the
    conjugation forms are attached to the last entry of ``page_data``:
    they are copied from the previous entry when that entry has the same
    language and POS and its last form came from the same conjugation
    page, otherwise ``extract_conjugation`` is called for that page.

    The cleaned text of the expanded template, minus the trailing link
    label, is stored as a raw tag when it is not empty.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for link in expanded_node.find_child(NodeKind.LINK):
        if len(link.largs) == 0 or len(link.largs[0]) == 0:
            continue
        conj_title = link.largs[0][0]
        # the first part of a link target may be a node instead of text
        if not isinstance(conj_title, str):
            continue
        if not conj_title.startswith("Conjugaison:"):
            continue
        conj_word = conj_title.split("/", 1)[-1]
        if conj_word in (
            "Premier groupe",
            "Deuxième groupe",
            "Troisième groupe",
        ):
            continue
        if (
            len(page_data) > 1
            and page_data[-2].lang_code == page_data[-1].lang_code
            and page_data[-2].pos == page_data[-1].pos
            and len(page_data[-2].forms) > 0
            and page_data[-2].forms[-1].source == conj_title
        ):
            # copy the list so that forms appended to this entry later on
            # do not also show up in the previous entry
            page_data[-1].forms = list(page_data[-2].forms)
        else:
            extract_conjugation(wxr, page_data[-1], conj_title)

    tag = clean_node(wxr, page_data[-1], expanded_node)
    if template_node.template_name in ("conj", "conjugaison"):
        tag = tag.removesuffix("(voir la conjugaison)").strip()
    elif template_node.template_name.startswith("ja-"):
        tag = (
            tag.removesuffix("(conjugaison)").removesuffix("(flexions)").strip()
        )
    if len(tag) > 0:
        page_data[-1].raw_tags.append(tag)

226 

227 

def process_lien_pronominal(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    page_data: list[WordEntry],
) -> None:
    """Add the pronominal forms linked by a "lien pronominal" template.

    https://fr.wiktionary.org/wiki/Modèle:lien_pronominal

    The template is fully expanded and every ``<bdi>`` tag found at any
    depth becomes a ``Form`` tagged "pronominal" on the last entry of
    ``page_data``; empty forms are dropped.
    """
    wikitext = wxr.wtp.node_to_wikitext(template_node)
    root = wxr.wtp.parse(wikitext, expand_all=True)
    for bdi in root.find_html_recursively("bdi"):
        pronominal = Form(
            form=clean_node(wxr, None, bdi), tags=["pronominal"]
        )
        if pronominal.form == "":
            continue
        page_data[-1].forms.append(pronominal)
    # The returned text is discarded; the call is made with the entry
    # attached. NOTE(review): presumably so clean_node can record data such
    # as categories on the entry - confirm against clean_node.
    clean_node(wxr, page_data[-1], root)