Coverage for src/wiktextract/extractor/fr/gloss.py: 96%

1from collections import defaultdict

2from typing import Optional, Union

4from wikitextprocessor import NodeKind, TemplateNode, WikiNode

6from ...page import clean_node

7from ...wxr_context import WiktextractContext

8from .models import AltForm, Example, Sense, WordEntry

9from .tags import translate_raw_tags

def extract_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    list_node: WikiNode,
    parent_sense: Optional[Sense] = None,
) -> None:
    """Create one `Sense` per list item of `list_node`.

    Each sense is appended to `page_data[-1].senses`. When `parent_sense`
    is given, its glosses, tags, raw tags and topics are copied into the new
    sense first. Nested lists whose `sarg` ends with "#" are processed
    recursively with the new sense as parent; nested lists whose `sarg` ends
    with "*" are handed to `extract_examples`.
    """
    for list_item_node in list_node.find_child(NodeKind.LIST_ITEM):
        # Everything in the list item except nested lists.
        gloss_nodes = list(
            list_item_node.invert_find_child(
                NodeKind.LIST, include_empty_str=True
            )
        )
        gloss_data = Sense()
        if parent_sense is not None:
            gloss_data.glosses.extend(parent_sense.glosses)
            gloss_data.tags.extend(parent_sense.tags)
            gloss_data.raw_tags.extend(parent_sense.raw_tags)
            gloss_data.topics.extend(parent_sense.topics)

        # process modifier, theme templates before gloss text
        # https://fr.wiktionary.org/wiki/Wiktionnaire:Liste_de_tous_les_modèles/Précisions_de_sens
        tag_indexes = set()
        for index, gloss_node in enumerate(gloss_nodes):
            if isinstance(gloss_node, TemplateNode):
                categories_data = defaultdict(list)
                expanded_text = clean_node(wxr, categories_data, gloss_node)
                # A template that expands to a single "(...)" group is a tag
                # template, not part of the gloss text.
                if (
                    expanded_text.startswith("(")
                    and expanded_text.endswith(")")
                    and "(" not in expanded_text[1:-1]
                ):
                    # `str.split` always returns at least one element, so
                    # drop empty strings explicitly instead of checking the
                    # list length (an expansion like "()" yields [""]).
                    gloss_data.raw_tags.extend(
                        tag
                        for tag in expanded_text.strip("() \n").split(", ")
                        if tag
                    )
                    if "categories" in categories_data:
                        gloss_data.categories.extend(
                            categories_data["categories"]
                        )
                    tag_indexes.add(index)
            # if an italic node is between parentheses then it's a tag, also
            # don't add the parenthesis strings to `gloss_only_nodes`
            elif (
                isinstance(gloss_node, WikiNode)
                and gloss_node.kind == NodeKind.ITALIC
                # Without this guard `index - 1` is -1 for the first node
                # and would wrap around to the last node of the list.
                and index > 0
                and isinstance(gloss_nodes[index - 1], str)
                and gloss_nodes[index - 1].strip() == "("
                and index + 1 < len(gloss_nodes)
                and isinstance(gloss_nodes[index + 1], str)
                and gloss_nodes[index + 1].strip() == ")"
            ):
                gloss_data.raw_tags.append(clean_node(wxr, None, gloss_node))
                tag_indexes |= {index - 1, index, index + 1}

        gloss_only_nodes = [
            node
            for index, node in enumerate(gloss_nodes)
            if index not in tag_indexes
        ]
        # Nodes before the "note" template form the gloss, nodes after it
        # form the note. The loop does not stop at the first match, so when
        # several "note" templates exist the last one is used.
        note_index = len(gloss_only_nodes)
        for index in range(note_index):
            if (
                isinstance(gloss_only_nodes[index], TemplateNode)
                and gloss_only_nodes[index].template_name == "note"
            ):
                note_index = index
        gloss_text = find_alt_of_form(
            wxr, gloss_only_nodes[:note_index], page_data[-1].pos, gloss_data
        )
        if "form-of" in page_data[-1].tags:
            find_form_of_word(wxr, gloss_only_nodes[:note_index], gloss_data)
        if gloss_text != "":
            gloss_data.glosses.append(gloss_text)
        gloss_data.note = clean_node(
            wxr, gloss_data, gloss_only_nodes[note_index + 1 :]
        ).strip(" ().")
        page_data[-1].senses.append(gloss_data)

        for nest_gloss_list in list_item_node.find_child(NodeKind.LIST):
            if nest_gloss_list.sarg.endswith("#"):
                extract_gloss(wxr, page_data, nest_gloss_list, gloss_data)
            elif nest_gloss_list.sarg.endswith("*"):
                extract_examples(wxr, gloss_data, nest_gloss_list)

        # Raw tags are translated only after the nested lists were handled,
        # so child senses copy the untranslated raw tags of their parent.
        translate_raw_tags(gloss_data)
        if len(gloss_data.glosses) == 0:
            gloss_data.tags.append("no-gloss")

def extract_examples(
    wxr: WiktextractContext,
    gloss_data: Sense,
    example_list_node: WikiNode,
) -> None:
    """Add an example to `gloss_data` for each item of `example_list_node`.

    Items that start with a template whose name ends with "exemple" are
    delegated to `process_exemple_template`. For any other non-empty item an
    `Example` is built by hand: a "source" template gives the reference, a
    nested list gives the translation, and the remaining nodes give the text.
    """
    for item in example_list_node.find_child(NodeKind.LIST_ITEM):
        children = list(item.filter_empty_str_child())
        if not children:
            continue

        head = children[0]
        starts_with_exemple = isinstance(
            head, TemplateNode
        ) and head.template_name.endswith("exemple")
        if starts_with_exemple:
            process_exemple_template(wxr, head, gloss_data)
            continue

        example = Example()
        # Nodes consumed as reference/translation, excluded from the text.
        consumed = []
        for child in item.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
            if child.kind == NodeKind.LIST:
                # Each list item overwrites the previous one, so the last
                # item of the nested list ends up as the translation.
                for translation_item in child.find_child(NodeKind.LIST_ITEM):
                    example.translation = clean_node(
                        wxr, None, translation_item.children
                    )
                consumed.append(child)
            elif (
                child.kind == NodeKind.TEMPLATE
                and child.template_name == "source"
            ):
                cleaned_source = clean_node(wxr, None, child)
                example.ref = cleaned_source.strip("— ()")
                consumed.append(child)

        text_nodes = [child for child in children if child not in consumed]
        example.text = clean_node(wxr, None, text_nodes)
        gloss_data.examples.append(example)

138

139

def process_exemple_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    gloss_data: Optional[Sense],
    time: str = "",
) -> Example:
    """Build an `Example` from an "exemple"-style template node.

    Template parameters read: 1 (text), 2 or "sens" (translation), 3 or "tr"
    (transcription, stored as `roman`) and "source" (stored as `ref`).
    `time` is stored unchanged on the example.

    When `gloss_data` is a `Sense` and the example text is not empty, the
    example is also appended to `gloss_data.examples`. The created example is
    returned in every case.
    """
    # https://fr.wiktionary.org/wiki/Modèle:exemple
    # https://fr.wiktionary.org/wiki/Modèle:ja-exemple
    # https://fr.wiktionary.org/wiki/Modèle:zh-exemple
    params = node.template_parameters
    # Each value is cleaned exactly once: feeding the already cleaned string
    # back into `clean_node` is redundant work and re-processes plain text.
    example_data = Example(
        text=clean_node(wxr, None, params.get(1, "")),
        translation=clean_node(
            wxr, None, params.get(2, params.get("sens", ""))
        ),
        roman=clean_node(wxr, None, params.get(3, params.get("tr", ""))),
        ref=clean_node(wxr, None, params.get("source", "")),
        time=time,
    )
    if len(example_data.text) > 0 and isinstance(gloss_data, Sense):
        gloss_data.examples.append(example_data)
    if gloss_data is not None:
        # Result is discarded; the call is made only for what `clean_node`
        # records on `gloss_data` (presumably categories - TODO confirm).
        clean_node(wxr, gloss_data, node)
    return example_data

175

176

def find_alt_of_form(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    pos_type: str,
    gloss_data: Sense,
) -> str:
    """
    Return the gloss text built from `gloss_nodes`.

    Templates whose name starts with "variante " are expanded: the word they
    point to is stored in `gloss_data.alt_of` (with the "alt-of" tag) and a
    "désuet" template found in the expansion becomes a raw tag instead of
    gloss text. For the "typographic variant" POS the last link found in the
    gloss is used as the alt-of word when no "variante " template set one.
    """
    alt_word = ""
    kept_nodes = []
    for current in gloss_nodes:
        # https://fr.wiktionary.org/wiki/Modèle:variante_de
        # https://fr.wiktionary.org/wiki/Modèle:variante_kyujitai_de
        is_variant_template = isinstance(
            current, TemplateNode
        ) and current.template_name.startswith("variante ")
        if not is_variant_template:
            kept_nodes.append(current)
            continue

        # Prefer the "dif" parameter, fall back to the first positional one.
        alt_word = clean_node(
            wxr, None, current.template_parameters.get("dif", "")
        ) or clean_node(wxr, None, current.template_parameters.get(1, ""))
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))
            gloss_data.tags.append("alt-of")

        expansion = wxr.wtp.parse(
            wxr.wtp.node_to_wikitext(current),
            pre_expand=True,
            additional_expand={current.template_name},
        )
        for child in expansion.children:
            is_obsolete_tag = (
                isinstance(child, TemplateNode)
                and child.template_name == "désuet"
            )
            if is_obsolete_tag:
                gloss_data.raw_tags.append(
                    clean_node(wxr, gloss_data, child).strip(" ()")
                )
            else:
                kept_nodes.append(child)

    if alt_word == "" and pos_type == "typographic variant":
        for wiki_node in (n for n in gloss_nodes if isinstance(n, WikiNode)):
            # use the last link
            if wiki_node.kind == NodeKind.LINK:
                alt_word = clean_node(wxr, None, wiki_node)
            search_root = wiki_node
            if isinstance(wiki_node, TemplateNode):
                # Links may only appear once the template is fully expanded.
                search_root = wxr.wtp.parse(
                    wxr.wtp.node_to_wikitext(wiki_node), expand_all=True
                )
            for link_node in search_root.find_child_recursively(NodeKind.LINK):
                alt_word = clean_node(wxr, None, link_node)
        if alt_word:
            gloss_data.alt_of.append(AltForm(word=alt_word))

    gloss_text = clean_node(wxr, gloss_data, kept_nodes)
    # Unbalanced parentheses are leftovers of removed nodes: trim them
    # (and spaces) from both ends of the text.
    if gloss_text.count("(") != gloss_text.count(")"):
        gloss_text = gloss_text.strip(" ()")
    return gloss_text

248

249

def find_form_of_word(
    wxr: WiktextractContext,
    gloss_nodes: list[Union[str, WikiNode]],
    gloss_data: Sense,
) -> None:
    """Record the word this sense is a form of in `gloss_data.form_of`.

    The word is taken from the last link node, or the first parameter of the
    last "mutation de"/"lien" template, whichever comes last in
    `gloss_nodes`. Nothing is added when no non-empty word is found.
    """
    # https://fr.wiktionary.org/wiki/Catégorie:Modèles_de_variantes
    target_word = ""
    for item in gloss_nodes:
        is_link = isinstance(item, WikiNode) and item.kind == NodeKind.LINK
        if is_link:
            target_word = clean_node(wxr, None, item)
        elif isinstance(item, TemplateNode) and item.template_name in (
            "mutation de",
            "lien",
        ):
            # https://fr.wiktionary.org/wiki/Modèle:mutation_de
            first_param = item.template_parameters.get(1, "")
            target_word = clean_node(wxr, None, first_param)

    if target_word:
        gloss_data.form_of.append(AltForm(word=target_word))