Coverage for src/wiktextract/extractor/zh/linkage.py: 87%

169 statements  

« prev     ^ index     » next       coverage.py v7.6.4, created at 2024-10-25 10:11 +0000

import itertools
from collections import defaultdict
from typing import Optional

from wikitextprocessor.parser import (
    LevelNode,
    NodeKind,
    TemplateNode,
    WikiNode,
)

from ...page import clean_node
from ...wxr_context import WiktextractContext
from ..ruby import extract_ruby
from .models import Linkage, WordEntry
from .tags import translate_raw_tags

16 

17 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
) -> None:
    """Collect linkage data (synonyms, antonyms, …) from a section node.

    Walks the lists and templates directly under ``level_node``,
    accumulates ``Linkage`` objects, then appends them to the
    ``WordEntry`` attribute named by ``linkage_type``.
    """
    sense = ""
    collected: list[Linkage] = []
    for child in level_node.find_child(NodeKind.TEMPLATE | NodeKind.LIST):
        if child.kind == NodeKind.LIST:
            for list_item in child.find_child(NodeKind.LIST_ITEM):
                # The sense text threads through: a sense set by one list
                # item applies to the items that follow it.
                sense, found = process_linkage_list_item(
                    wxr, list_item, sense
                )
                collected.extend(found)
            continue
        if not isinstance(child, TemplateNode):
            continue
        t_name = child.template_name
        if t_name in ["s", "sense"]:
            sense = clean_node(wxr, None, child).strip("(): ")
        elif t_name == "zh-dial":
            collected.extend(extract_zh_dial_template(wxr, child, sense))
        elif t_name.endswith("-saurus"):
            collected.extend(
                extract_saurus_template(
                    wxr, page_data, child, linkage_type, sense
                )
            )
        elif t_name.startswith("col"):
            collected.extend(
                process_linkage_col_template(wxr, child, sense)
            )

    if level_node.kind == NodeKind.LEVEL3:
        # A level-3 linkage section applies to every entry that shares the
        # language of the most recent entry.
        for entry in page_data:
            if entry.lang_code == page_data[-1].lang_code:
                getattr(entry, linkage_type).extend(collected)
    elif len(page_data) > 0:
        # Deeper sections attach only to the entry being built.
        getattr(page_data[-1], linkage_type).extend(collected)

57 

58 

def process_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> tuple[str, list[Linkage]]:
    """Parse one linkage list item.

    Returns the (possibly updated) sense string along with the
    ``Linkage`` objects found in the item. Qualifier templates queue up
    raw tags that are attached to the next linkage word and then reset.
    """
    pending_raw_tags: list[str] = []
    results: list[Linkage] = []
    for child in list_item.children:
        if isinstance(child, TemplateNode):
            name = child.template_name
            if name in ["s", "sense"]:
                sense = clean_node(wxr, None, child).strip("(): ")
            elif name in ["qualifier", "qual"]:
                pending_raw_tags.append(
                    clean_node(wxr, None, child).strip("()")
                )
            elif name == "zh-l":
                results.extend(
                    process_zh_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif name == "ja-r":
                results.append(
                    process_ja_r_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
            elif name in ["l", "link", "alter"]:
                results.extend(
                    process_l_template(wxr, child, sense, pending_raw_tags)
                )
                pending_raw_tags.clear()
        elif isinstance(child, WikiNode) and child.kind == NodeKind.LINK:
            # A bare wiki link is a linkage word with no template markup.
            word = clean_node(wxr, None, child)
            if len(word) > 0:
                link_data = Linkage(
                    word=word, sense=sense, raw_tags=pending_raw_tags
                )
                translate_raw_tags(link_data)
                results.append(link_data)
                pending_raw_tags.clear()
    return sense, results

98 

99 

def extract_saurus_template(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> list[Linkage]:
    """
    Extract data from template names end with "-saurus", like "zh-syn-saurus"
    and "zh-ant-saurus". These templates get data from thesaurus pages, search
    the thesaurus database to avoid parse these pages again.

    https://zh.wiktionary.org/wiki/Template:Syn-saurus
    """
    from wiktextract.thesaurus import search_thesaurus

    # The obsolete zh-specific templates take the thesaurus page title as
    # the first argument; the current templates take it as the second.
    if node.template_name in ("zh-syn-saurus", "zh-ant-saurus"):
        thesaurus_page_title = node.template_parameters.get(1)
    else:
        thesaurus_page_title = node.template_parameters.get(2)

    results: list[Linkage] = []
    lang_code = page_data[-1].lang_code
    pos = page_data[-1].pos
    for term_data in search_thesaurus(
        wxr.thesaurus_db_conn,
        thesaurus_page_title,
        lang_code,
        pos,
        linkage_type,
    ):
        # Don't list the page's own headword among its linkages.
        if term_data.term == wxr.wtp.title:
            continue
        results.append(
            Linkage(
                word=term_data.term,
                roman=term_data.roman,
                tags=term_data.tags,
                raw_tags=term_data.raw_tags,
                sense=sense,
            )
        )
    return results

143 

144 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Parse the expanded "zh-dial" dialect-synonym table.

    The template expands to a table mapping dialect groups/regions to
    word forms. Each Chinese-language span becomes a Linkage word; the
    header cell (dialect group) and the region link of its cell are
    attached as raw tags, plus any small-font annotation spans.
    """
    linkage_list = []
    # word -> set of dialect/region labels the word appeared under
    dial_data = defaultdict(set)
    # word -> set of small-font annotation texts shown next to the word
    tag_data = defaultdict(set)
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        lang_tag = ""
        region_tag = ""
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            # Row header cell names the dialect group for this row.
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                lang_tag = clean_node(wxr, None, header_node)
            if lang_tag == "註解":  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                # Wiki link in the cell names the region; the last link
                # seen applies to the spans that follow it.
                for link_node in cell_node.find_child(NodeKind.LINK):
                    region_tag = clean_node(wxr, None, link_node)
                word = ""
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # A Chinese span (other than the page title itself)
                        # is a dialect word; record its labels.
                        word = span_text
                        if lang_tag != "":
                            dial_data[span_text].add(lang_tag)
                        if region_tag != "":
                            dial_data[span_text].add(region_tag)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and word != ""
                    ):
                        # Small-font span annotates the word just seen.
                        tag_data[word].add(span_text)

    for term, lang_tags in dial_data.items():
        linkage_data = Linkage(word=term, sense=sense, raw_tags=list(lang_tags))
        linkage_data.raw_tags.extend(list(tag_data.get(term, {})))
        translate_raw_tags(linkage_data)
        linkage_list.append(linkage_data)
    return linkage_list

193 

194 

def process_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> list[Linkage]:
    """Extract linkage words from a "zh-l" template.

    The expanded template contains one span per script form (traditional
    and simplified Chinese) plus a Latin span with the romanization.

    https://zh.wiktionary.org/wiki/Template:Zh-l
    """
    # Fix: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    # The Latn-class span holds the romanization shared by all forms.
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified Chinese")
        # "/" is the separator rendered between the two script forms.
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

229 

230 

def process_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> Linkage:
    """Extract a Japanese linkage word from a "ja-r" template.

    The expanded template contains a lang span (word with ruby), a "tr"
    span (romanization), and optionally a "mention-gloss" span.

    https://zh.wiktionary.org/wiki/Template:Ja-r
    """
    # Fix: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_data = Linkage(sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "")
        if "lang" in span_node.attrs:
            # Separate ruby (furigana) annotations from the word text.
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" == span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

255 

256 

def process_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: Optional[list[str]] = None,
) -> list[Linkage]:
    """Extract linkage words from "l", "link" or "alter" templates.

    https://zh.wiktionary.org/wiki/Template:l
    """
    # Fix 1: the return annotation was "-> None" although the function
    # returns a list of Linkage objects.
    # Fix 2: avoid a mutable default argument — a shared list would leak
    # state between calls.
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_list = []
    for span_tag in expanded_node.find_html("span", attr_name="lang"):
        linkage_data = Linkage(
            sense=sense, raw_tags=raw_tags, word=clean_node(wxr, None, span_tag)
        )
        if len(linkage_data.word) > 0:
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

276 

277 

def process_linkage_col_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Convert a "col*" column-list template into Linkage objects.

    Reuses the thesaurus extractor's column parser and re-wraps its
    results, attaching the current sense.
    """
    from .thesaurus import process_col_template

    results: list[Linkage] = []
    for col_item in process_col_template(
        wxr, "", "", "", "", "", template_node
    ):
        new_linkage = Linkage(
            word=col_item.term,
            roman=col_item.roman,
            tags=col_item.tags,
            raw_tags=col_item.raw_tags,
            sense=sense,
        )
        if len(new_linkage.word) > 0:
            translate_raw_tags(new_linkage)
            results.append(new_linkage)
    return results

296 

297 

def process_linkage_templates_in_gloss(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    template_node: TemplateNode,
    linkage_type: str,
    sense: str,
) -> None:
    """Extract linkage words from a template used inside a gloss line.

    Word arguments start at positional parameter 2 and run until the
    first missing index; each non-empty, non-Thesaurus word is appended
    to the current entry's ``linkage_type`` list with the given sense.
    """
    word_index = 2
    while word_index in template_node.template_parameters:
        word = clean_node(
            wxr, None, template_node.template_parameters[word_index]
        )
        word_index += 1
        if len(word) == 0:
            continue
        # Thesaurus page references are handled elsewhere, not as words.
        if word.startswith(wxr.wtp.NAMESPACE_DATA["Thesaurus"]["name"] + ":"):
            continue
        getattr(page_data[-1], linkage_type).append(
            Linkage(word=word, sense=sense)
        )