Coverage for src/wiktextract/extractor/vi/linkage.py: 60%

315 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from itertools import count 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..ruby import extract_ruby 

15from .models import Form, Linkage, WordEntry 

16from .tags import translate_raw_tags 

17 

# Maps a gloss-list linkage template name (as used on vi.wiktionary.org) to
# the WordEntry list-field that should receive the extracted terms.  Built
# from field -> template-name groups to keep synonym template spellings
# together; the resulting mapping is identical to a flat dict literal.
GLOSS_LIST_LINKAGE_TEMPLATES = {
    t_name: field
    for field, t_names in {
        "antonyms": ("antonyms", "def-ant", "antonym"),
        "coordinate_terms": ("coordinate terms", "def-cot", "def-coo", "cot"),
        "holonyms": ("holonyms", "holonym", "holo"),
        "hypernyms": ("hypernyms", "hyper"),
        "hyponyms": ("hyponyms", "hypo"),
        "alt_forms": ("inline alt forms", "alti"),
        "meronyms": ("meronyms", "mero"),
        "synonyms": ("synonyms", "synonym", "def-syn", "synsee"),
    }.items()
    for t_name in t_names
}

42 

# Template names whose expansion is qualifier text, e.g. "{{q|informal}}".
# NOTE: the historical name is misspelled ("TEMPALTES"); it is kept because
# other code references it.  New code should use the corrected alias.
QUALIFIER_TEMPALTES = ["qualifier", "qual", "q", "qf", "i"]
QUALIFIER_TEMPLATES = QUALIFIER_TEMPALTES  # corrected-spelling alias

44 

45 

def _save_gloss_list_linkage(
    word_entry: WordEntry, linkage_type: str, l_list: list[Linkage]
) -> None:
    """Move pending linkage items into *word_entry*.

    "alt_forms" items are converted to `Form` records tagged "alternative";
    any other linkage type is appended to the matching list attribute.
    """
    if linkage_type == "alt_forms":
        for l_data in l_list:
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    sense=l_data.sense,
                    tags=l_data.tags + ["alternative"],
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                )
            )
    else:
        getattr(word_entry, linkage_type).extend(l_list)


def extract_gloss_list_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
    """Parse an expanded gloss-list linkage template (e.g. "{{synonyms}}").

    The expanded HTML is scanned span by span: spans whose ``lang`` equals
    the first template argument create `Linkage` items; romanization
    ("-Latn"/"tr"), "mention-gloss" and "qualifier-content" spans annotate
    the items collected so far; a bare "," text node ends the current group
    and flushes it into *word_entry*.

    Fix over the previous version: the flush logic (alt_forms vs. other
    linkage types) was duplicated verbatim in the comma branch and at the
    end of the function; it now lives in ``_save_gloss_list_linkage``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "").split()
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    # "Hant"/"Hans" classes mark the Chinese script variant
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        translate_raw_tags(l_data)
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # romanization applies to every word in the group
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif "mention-gloss" in span_class:
                    # gloss applies to this and any following items
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == ",":
                # comma separates groups: flush collected items and reset
                _save_gloss_list_linkage(word_entry, linkage_type, l_list)
                l_list.clear()
                raw_tags.clear()

    _save_gloss_list_linkage(word_entry, linkage_type, l_list)

121 

122 

def extract_alt_form_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect alternative forms listed in an alternative-forms section.

    Each list item may contain "alter"/"def-alt" templates (the forms) and
    qualifier templates whose text becomes raw tags on those forms.
    """
    collected: list[Form] = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pending_raw_tags: list[str] = []
            for child in list_item.children:
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name in ("alter", "def-alt"):
                    collected.extend(
                        extract_alter_template(wxr, child, pending_raw_tags)
                    )
                elif child.template_name in QUALIFIER_TEMPALTES:
                    pending_raw_tags.extend(
                        extract_qualifier_template(wxr, child)
                    )

    # Attach to the last page entry of the same language, else to base_data.
    if page_data and page_data[-1].lang == base_data.lang:
        page_data[-1].forms.extend(collected)
    else:
        base_data.forms.extend(collected)

149 

150 

def extract_alter_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Form]:
    """Expand an "alter"/"def-alt" template into "alternative" Form records."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # the first template argument is the language code of the linked forms
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    results: list[Form] = []
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        if text == "":
            continue
        new_form = Form(form=text, tags=["alternative"], raw_tags=raw_tags)
        translate_raw_tags(new_form)
        results.append(new_form)
    return results

168 

169 

def extract_qualifier_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the non-empty, comma-separated qualifier strings of a
    qualifier template, with surrounding parentheses stripped."""
    text = clean_node(wxr, None, t_node).strip("()")
    return [part.strip() for part in text.split(",") if part.strip() != ""]

179 

180 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
):
    """Extract one linkage section (synonyms, derived terms, idioms, ...).

    Handles column templates ("col*", "der*", "rel*", "columns", "column"),
    "der-top" frames (whose first arg sets the sense for following lists),
    "zh-dial" dialect tables, and plain wiki lists.  Collected items are
    attached to `page_data` at the end.
    """
    l_list = []
    sense = ""
    for node in level_node.children:
        if isinstance(node, TemplateNode) and (
            re.fullmatch(r"(?:col|der|rel)(?:\d+)?", node.template_name)
            or node.template_name in ["columns", "column"]
        ):
            l_list.extend(extract_col_template(wxr, node))
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            "der-top"
        ):
            # "der-top" opens a collapsible frame; its first argument is a
            # sense string applying to the lists that follow
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
        elif isinstance(node, TemplateNode) and node.template_name in [
            "zh-dial",
            "zho-dial",
        ]:
            l_list.extend(extract_zh_dial_template(wxr, node, sense))
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                # idiom list items with a bold headword get special parsing;
                # everything else goes through the generic list-item parser
                l_list.extend(
                    extract_idiom_list_item(wxr, list_item)
                    if linkage_type == "idioms"
                    and list_item.contain_node(NodeKind.BOLD)
                    else extract_linkage_list_item(wxr, list_item, sense)
                )
    if linkage_type == "idioms":
        # idiom results are stored on the "related" field
        # NOTE(review): presumably WordEntry has no "idioms" attribute — confirm
        linkage_type = "related"
    if level_node.kind == NodeKind.LEVEL3:
        # a level-3 section applies to every entry of the same language
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                getattr(data, linkage_type).extend(l_list)
                for l_data in l_list:
                    data.categories.extend(l_data.categories)
    elif len(page_data) > 0:
        getattr(page_data[-1], linkage_type).extend(l_list)
        for l_data in l_list:
            page_data[-1].categories.extend(l_data.categories)

224 

225 

def extract_col_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Extract linkage items from an expanded column template.

    Each HTML <li> holds one term: word span(s) in the target language,
    optionally a romanization span ("*-Latn") and a quoted translation in
    the plain-text children.
    """
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first template argument is the language code of the linked terms
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for li_tag in expanded_template.find_html_recursively("li"):
        first_word = True  # first word span in this <li> is the headword
        translation = ""
        # plain-text children may carry a translation between curly quotes
        for node in li_tag.children:
            if isinstance(node, str):
                m = re.search(r"“(.+)”", node)
                if m is not None:
                    translation = m.group(1).strip()
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn") and len(l_list) > 0:
                # romanization annotates the most recently added word
                l_list[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                if lang_code == "zh":
                    # Chinese: traditional and simplified forms become
                    # separate linkage entries
                    l_data = Linkage(word=clean_node(wxr, None, span_tag))
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    l_list.append(l_data)
                elif not first_word:
                    # a second word span is an alternative written form
                    l_list[-1].other = clean_node(wxr, None, span_tag)
                else:
                    l_list.append(
                        Linkage(
                            word=clean_node(wxr, None, span_tag),
                            translation=translation,
                        )
                    )
                # NOTE(review): assumed the flag flips only after a word
                # span in the target language — confirm against upstream
                first_word = False

    return l_list

267 

268 

def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Linkage]:
    """Parse one generic linkage list item.

    Dispatches on the templates found in the item ("sense", "l", qualifier,
    "zh-l", "ja-r", "vi-l", anagram templates), plain wiki links, and a
    trailing "- gloss" text suffix that sets the sense of the whole line.
    """
    l_list = []
    raw_tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode):
            if node.template_name in ["sense", "s"]:
                # sense template applies to the items that follow it
                sense = clean_node(wxr, None, node).strip("(): ")
            elif node.template_name in ["l", "link"]:
                l_list.extend(extract_link_template(wxr, node, sense))
            elif node.template_name in ["qualifier", "qual"]:
                # qualifier text is held until the next word template
                raw_tags.append(clean_node(wxr, None, node).strip("()"))
            elif node.template_name in ["zh-l", "zho-l"]:
                l_list.extend(extract_zh_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["ja-r", "jpn-r"]:
                l_list.append(extract_ja_r_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["vi-l", "vie-l"]:
                l_list.append(extract_vi_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["anagrams", "Anagrams", "đảo chữ"]:
                l_list.extend(
                    extract_anagrams_template(wxr, node, sense, raw_tags)
                )
                raw_tags.clear()
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                l_list.append(l_data)
        elif (
            isinstance(node, str)
            and node.strip().startswith("-")
            and len(l_list) > 0
        ):
            # trailing "- gloss": the rest of the line is the sense of the
            # last item; stop scanning
            l_list[-1].sense = clean_node(
                wxr, None, list_item.children[index:]
            ).strip("- \n")
            break
    # qualifier templates that appeared after the last word attach to it
    if len(raw_tags) > 0 and len(l_list) > 0:
        l_list[-1].raw_tags.extend(raw_tags)
        translate_raw_tags(l_list[-1])
    return l_list

315 

316 

def extract_link_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Expand an "{{l}}"/"{{link}}" template into linkage records, one per
    word span whose "lang" attribute matches the first template argument."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    return [
        Linkage(word=clean_node(wxr, None, span_tag), sense=sense)
        for span_tag in expanded.find_html("span")
        if span_tag.attrs.get("lang", "") == lang_code
    ]

333 

334 

def extract_idiom_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Parse an idiom list item: bold word(s) followed by gloss text or a
    nested list of senses."""
    idioms: list[Linkage] = []
    last_bold_at = 0
    trailing_nodes = []
    for position, child in enumerate(list_item.children):
        is_wikinode = isinstance(child, WikiNode)
        if is_wikinode and child.kind == NodeKind.BOLD:
            bold_text = clean_node(wxr, None, child)
            if bold_text != "":
                last_bold_at = position
                idioms.append(Linkage(word=bold_text, tags=["idiomatic"]))
        elif is_wikinode and child.kind == NodeKind.LIST:
            # nested list: each item is one sense of the last idiom
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                sub_sense = clean_node(wxr, None, sub_item.children)
                if sub_sense != "" and idioms:
                    idioms[-1].senses.append(sub_sense)
        elif position > last_bold_at:
            # plain text after the bold word is an inline gloss
            trailing_nodes.append(child)

    trailing_sense = clean_node(wxr, None, trailing_nodes).strip(": ")
    if trailing_sense != "" and idioms:
        idioms[-1].sense = trailing_sense

    return idioms

360 

361 

def extract_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Parse an expanded "{{zh-l}}" template.

    Produces one `Linkage` per Chinese word span ("zh-Hant"/"zh-Hans" lang
    attrs select the script tag); a "Latn" span supplies the romanization
    shared by all results.  Separator-only spans ("/") are dropped.

    Fix: the old signature used a mutable default (``raw_tags: list[str]
    = []``), which is a single list shared across calls; a ``None``
    sentinel avoids that pitfall with identical behavior for callers.
    """
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        # skip empty spans and the "/" separator between script forms
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

395 

396 

def extract_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Parse an expanded "{{ja-r}}" template into a single linkage item.

    The word span (any span with a "lang" attribute) may contain ruby
    annotations, which are split out via `extract_ruby`; a "tr" span gives
    the romanization and a "mention-gloss" span overrides the sense.

    Fix: the old signature used a mutable default (``raw_tags: list[str]
    = []``), a single list shared across calls; a ``None`` sentinel avoids
    that pitfall with identical behavior for callers.
    """
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_data = Linkage(word="", sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "").split()
        if "lang" in span_node.attrs:
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" in span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

420 

421 

def extract_vi_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> Linkage:
    """Expand a "{{vi-l}}" template: the Vietnamese word span plus an
    optional "vi-Latn" romanization span; category links feed categories."""
    linkage = Linkage(word="", sense=sense, raw_tags=raw_tags)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded.find_html("span"):
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "vi":
            linkage.word = clean_node(wxr, None, span_tag)
        elif lang_attr == "vi-Latn":
            linkage.roman = clean_node(wxr, None, span_tag)
    # category links produced by the template expansion
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, linkage, link_node)
    return linkage

442 

443 

def extract_anagrams_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
    """Read anagram words from positional template args 2, 3, ... stopping
    at the first missing index; empty values are skipped."""
    results: list[Linkage] = []
    arg_index = 2
    while arg_index in t_node.template_parameters:
        word = clean_node(wxr, None, t_node.template_parameters[arg_index])
        if word != "":
            entry = Linkage(word=word, sense=sense, raw_tags=raw_tags)
            translate_raw_tags(entry)
            results.append(entry)
        arg_index += 1

    return results

461 

462 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialect synonyms from an expanded "zh-dial" table.

    Two passes over each table: the first collects the footnote legend
    (rows under a "Ghi chú" header, "symbol - note" pairs separated by
    ";"); the second walks data rows, turning header cells into dialect-
    group tags, link cells into region tags, and "zh" word spans into
    `Linkage` items.  Small-print spans after a word reference the legend
    symbols and attach the corresponding notes as raw tags.
    """
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        is_note_row = False
        note_tags = {}
        # first pass: build the "symbol -> note" legend ("Ghi chú" = "Note")
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    is_note_row = clean_node(wxr, None, cell_node) == "Ghi chú"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        lang_tags = []
        region_tags = []
        # second pass: extract the dialect words themselves
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # row header names the dialect group for this row
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["Ghi chú"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    # linked location name gives the region tags
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (skip the page's own title)
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # small-print symbols reference the note legend;
                        # unknown symbols are kept verbatim as raw tags
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    return linkage_list