Coverage for src/wiktextract/extractor/fr/linkage.py: 94%

1import re

3from wikitextprocessor import NodeKind, TemplateNode, WikiNode

5from ...page import clean_node

6from ...wxr_context import WiktextractContext

7from ..share import capture_text_in_parentheses

8from .models import Form, Linkage, WordEntry

9from .section_types import LINKAGE_SECTIONS, LINKAGE_TAGS

10from .tags import translate_raw_tags

def extract_linkage(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    section_type: str,
) -> None:
    """Dispatch a linkage section to the extractor matching its type.

    Args:
        wxr: Extraction context passed through to the helpers.
        page_data: Word entries collected so far; results are stored on them.
        level_node: Section node whose children hold the linkage data.
        section_type: Section title key. "dérivés autres langues" and
            "anagrammes" get dedicated handling; any other value is looked
            up in LINKAGE_SECTIONS (and LINKAGE_TAGS for optional tags).
    """
    if section_type == "dérivés autres langues":
        process_derives_autres_list(wxr, page_data, level_node)
        return

    if section_type != "anagrammes":
        process_linkage_list(
            wxr,
            page_data,
            level_node,
            LINKAGE_SECTIONS[section_type],
            LINKAGE_TAGS.get(section_type, []),
        )
        return

    # Anagram sections: only the "voir anagrammes" template is read.
    for template in level_node.find_child(NodeKind.TEMPLATE):
        if template.template_name != "voir anagrammes":
            continue
        anagrams = process_voir_anagrammes_template(wxr, template)
        # The anagrams go to every entry sharing the language code of the
        # most recent entry, not just to the last one.
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                entry.anagrams.extend(anagrams)

def process_derives_autres_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
):
    """Extract the "derived terms in other languages" list.

    For every list item found under `level_node`, an "L" template sets the
    language code and name that apply to the links and link templates that
    follow it in the same item. Each link or `l`/`lien`/`zh-lien`/
    `zh-lien-t` template becomes a Linkage appended to
    `page_data[-1].derived`.
    """
    link_templates = ("l", "lien", "zh-lien", "zh-lien-t")
    wanted_kinds = NodeKind.TEMPLATE | NodeKind.LINK
    for item in level_node.find_child_recursively(NodeKind.LIST_ITEM):
        # Language context is reset for each list item.
        code = ""
        name = ""
        for child in item.find_child(wanted_kinds):
            is_template = isinstance(child, TemplateNode)
            if is_template and child.template_name == "L":
                code = child.template_parameters.get(1)
                name = clean_node(wxr, None, child)
                continue
            if child.kind == NodeKind.LINK:
                linked_word = clean_node(wxr, None, child)
                entry = Linkage(lang_code=code, lang=name, word=linked_word)
                page_data[-1].derived.append(entry)
                continue
            if is_template and child.template_name in link_templates:
                # The template handler fills in the word (and any extras).
                entry = Linkage(lang_code=code, lang=name, word="")
                process_linkage_template(wxr, child, entry)
                page_data[-1].derived.append(entry)

def process_linkage_list(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: WikiNode,
    linkage_type: str,
    section_tags: list[str] = [],
) -> None:
    """Extract linkage entries from the lists and templates of a section.

    Walks every list item and template below `level_node`:

    * a "(" template (list table start) sets the sense text and sense index
      applied to the entries that follow;
    * a ";" or ":" list item also sets the sense text, with an optional
      trailing "(sens N)" / "(N)" giving the sense index;
    * any other node is scanned child by child for words (links and
      `l`/`lien`/`zh-lien`/`zh-lien-t` templates), italic glosses,
      parenthesised tags and a "—" translation.

    Completed entries are stored on `page_data[-1]` through
    `add_linkage_data`.

    Args:
        wxr: Extraction context used to clean nodes.
        page_data: Word entries collected so far; the last one receives data.
        level_node: Section node to walk.
        linkage_type: Name of the WordEntry list field to append to.
        section_tags: Tags given to every entry of this section. The list is
            copied per entry, so neither the caller's list nor the shared
            default is ever handed to a Linkage.
    """
    # Trailing sense number of a ";"/":" heading, e.g. "text (sens 2)".
    index_pattern = r"\s*\((?:sens\s*)?(\d+)\)$"
    link_templates = ["l", "lien", "zh-lien", "zh-lien-t"]

    sense_text = ""
    sense_index = 0
    for template_or_list_node in level_node.find_child_recursively(
        NodeKind.LIST_ITEM | NodeKind.TEMPLATE
    ):
        # list table start template: https://fr.wiktionary.org/wiki/Modèle:(
        if (
            isinstance(template_or_list_node, TemplateNode)
            and template_or_list_node.template_name == "("
        ):
            sense_text = clean_node(
                wxr, None, template_or_list_node.template_parameters.get(1, "")
            )
            sense_index_text = template_or_list_node.template_parameters.get(
                2, "0"
            )
            # The parameter may be a non-string value; only plain digits
            # are accepted as an index.
            if (
                isinstance(sense_index_text, str)
                and sense_index_text.isdecimal()
            ):
                sense_index = int(sense_index_text)
            continue
        # sense could also be in ";" description list
        if (
            template_or_list_node.kind == NodeKind.LIST_ITEM
            and template_or_list_node.sarg in {";", ":"}
        ):
            sense_text = clean_node(wxr, None, template_or_list_node.children)
            m = re.search(index_pattern, sense_text)
            if m is not None:
                sense_text = re.sub(index_pattern, "", sense_text)
                sense_index = int(m.group(1))
            continue

        linkage_data = Linkage(word="", tags=list(section_tags))
        if len(sense_text) > 0:
            linkage_data.sense = sense_text
        if sense_index != 0:
            linkage_data.sense_index = sense_index
        # A parenthesised tag can be split over several child nodes:
        # `pending_tag` accumulates it until the closing ")" shows up, and
        # links met in between are not treated as the entry's word.
        pending_tag = ""
        inside_bracket = False
        # Children with nested lists removed; materialised once because the
        # "—" branch below needs a slice of the same sequence.
        child_nodes = list(
            template_or_list_node.invert_find_child(NodeKind.LIST, True)
        )
        for index, child_node in enumerate(child_nodes):
            if (
                isinstance(child_node, TemplateNode)
                and child_node.template_name in link_templates
            ):
                process_linkage_template(wxr, child_node, linkage_data)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.LINK
                and not inside_bracket
            ):
                linkage_data.word = clean_node(wxr, None, child_node)
            elif (
                isinstance(child_node, WikiNode)
                and child_node.kind == NodeKind.ITALIC
            ):
                current_sense = clean_node(wxr, None, child_node).strip("()")
                if (
                    len(list(template_or_list_node.filter_empty_str_child()))
                    == 1
                ):
                    # The italic node is the only content: it is the word.
                    linkage_data.word = current_sense
                elif current_sense.isdecimal():
                    linkage_data.sense_index = int(current_sense)
                else:
                    linkage_data.sense = current_sense
            elif (
                isinstance(child_node, TemplateNode)
                and child_node.template_name == "réf"
            ):
                continue
            else:
                tag_text = (
                    child_node
                    if isinstance(child_node, str)
                    else clean_node(wxr, page_data[-1], child_node)
                )
                stripped = tag_text.strip()
                if stripped in {",", "/", "(ou"} and linkage_data.word != "":
                    # list item has more than one word
                    add_linkage_data(page_data[-1], linkage_type, linkage_data)
                    linkage_data = Linkage(word="", tags=list(section_tags))
                    continue
                if stripped.startswith("(") and not stripped.endswith(")"):
                    pending_tag = tag_text
                    inside_bracket = True
                    continue
                elif not stripped.startswith("(") and stripped.endswith(")"):
                    tag_text = pending_tag + tag_text
                    pending_tag = ""
                    inside_bracket = False
                elif len(pending_tag) > 0:
                    pending_tag += tag_text
                    continue

                if tag_text.strip().startswith("—"):
                    # Everything from the dash onwards is the translation.
                    linkage_data.translation = clean_node(
                        wxr, None, child_nodes[index:]
                    ).strip("— ")
                    break
                elif tag_text.strip().startswith(":"):
                    # Also updates the running sense used by later entries.
                    sense_text = tag_text.strip().removeprefix(":").strip()
                    linkage_data.sense = sense_text
                else:
                    tags, _ = capture_text_in_parentheses(tag_text)
                    for tag in tags:
                        if tag.isdecimal():
                            linkage_data.sense_index = int(tag)
                        else:
                            linkage_data.raw_tags.append(tag)

        if len(linkage_data.word) > 0:
            add_linkage_data(page_data[-1], linkage_type, linkage_data)

209

210

def add_linkage_data(
    word_entry: WordEntry, l_type: str, l_data: Linkage
) -> None:
    """Store one linkage entry on `word_entry`.

    Entries with an empty word are ignored. Raw tags are translated first;
    then the entry is appended to the list attribute named `l_type`, except
    for "forms", where it is converted to a Form before being appended to
    `word_entry.forms`.
    """
    if l_data.word == "":
        return

    translate_raw_tags(l_data)

    if l_type != "forms":
        getattr(word_entry, l_type).append(l_data)
        return

    # "forms" holds Form objects, so copy over the fields they share.
    form_kwargs = {
        "form": l_data.word,
        "tags": l_data.tags,
        "raw_tags": l_data.raw_tags,
        "roman": l_data.roman,
        "sense": l_data.sense,
        "sense_index": l_data.sense_index,
    }
    word_entry.forms.append(Form(**form_kwargs))

230

231

def process_linkage_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Fill `linkage_data` from a link template.

    "lien" and "l" go to `process_lien_template`; any template whose name
    starts with "zh-lien" goes to `process_zh_lien_template`. Other
    templates leave `linkage_data` untouched.
    """
    name = node.template_name
    if name in ("lien", "l"):
        process_lien_template(wxr, node, linkage_data)
        return
    if name.startswith("zh-lien"):
        process_zh_lien_template(wxr, node, linkage_data)

241

242

def process_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Read a "lien" link template into `linkage_data`.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:lien

    The word comes from the "dif" parameter when present, otherwise from
    the first positional parameter. "tr" is stored as the romanization and
    "sens" as the translation, each only when the parameter is given.
    """
    params = node.template_parameters
    shown_text = params.get("dif", params.get(1))
    linkage_data.word = clean_node(wxr, None, shown_text)

    # Optional template parameter -> Linkage field it populates.
    optional_fields = (("tr", "roman"), ("sens", "translation"))
    for param_name, field_name in optional_fields:
        if param_name in params:
            cleaned = clean_node(wxr, None, params.get(param_name))
            setattr(linkage_data, field_name, cleaned)

263

264

def process_zh_lien_template(
    wxr: WiktextractContext,
    node: TemplateNode,
    linkage_data: Linkage,
) -> None:
    """Read a "zh-lien" link template into `linkage_data`.

    Template documentation: https://fr.wiktionary.org/wiki/Modèle:zh-lien

    Positional parameter 1 is the word, 2 the pinyin (stored as the
    romanization) and 3 the traditional form, stored in `alt` only when it
    is not empty.
    """
    params = node.template_parameters
    linkage_data.word = clean_node(wxr, None, params.get(1))
    pinyin = clean_node(wxr, None, params.get(2, ""))
    linkage_data.roman = pinyin
    traditional = clean_node(wxr, None, params.get(3, ""))
    if traditional != "":
        linkage_data.alt = traditional

280

281

def process_voir_anagrammes_template(
    wxr: WiktextractContext, node: TemplateNode
) -> list[Linkage]:
    """Expand a "voir anagrammes" template and collect its linked words.

    Template documentation:
    https://fr.wiktionary.org/wiki/Modèle:voir_anagrammes

    The template is converted back to wikitext, parsed with full expansion,
    and every non-empty link found directly in a list item of the result is
    returned as a Linkage.
    """
    wikitext = wxr.wtp.node_to_wikitext(node)
    expanded = wxr.wtp.parse(wikitext, expand_all=True)

    anagrams = []
    for item in expanded.find_child_recursively(NodeKind.LIST_ITEM):
        cleaned_words = (
            clean_node(wxr, None, link)
            for link in item.find_child(NodeKind.LINK)
        )
        anagrams.extend(
            Linkage(word=text) for text in cleaned_words if len(text) > 0
        )
    return anagrams