Coverage for src/wiktextract/extractor/vi/linkage.py: 60%

315 statements  

« prev     ^ index     » next       coverage.py v7.10.7, created at 2025-10-13 10:14 +0000

1import re 

2from itertools import count 

3 

4from wikitextprocessor import ( 

5 HTMLNode, 

6 LevelNode, 

7 NodeKind, 

8 TemplateNode, 

9 WikiNode, 

10) 

11 

12from ...page import clean_node 

13from ...wxr_context import WiktextractContext 

14from ..ruby import extract_ruby 

15from .models import Form, Linkage, WordEntry 

16from .tags import translate_raw_tags 

17 

# Maps a gloss-list linkage template name (as used on vi.wiktionary.org) to
# the WordEntry list-field that should receive the extracted terms.  Built
# from field -> template-name groups to keep synonym template spellings
# together; the resulting mapping is identical to a flat dict literal.
GLOSS_LIST_LINKAGE_TEMPLATES = {
    t_name: field
    for field, t_names in {
        "antonyms": ("antonyms", "def-ant", "antonym"),
        "coordinate_terms": ("coordinate terms", "def-cot", "def-coo", "cot"),
        "holonyms": ("holonyms", "holonym", "holo"),
        "hypernyms": ("hypernyms", "hyper"),
        "hyponyms": ("hyponyms", "hypo"),
        "alt_forms": ("inline alt forms", "alti"),
        "meronyms": ("meronyms", "mero"),
        "synonyms": ("synonyms", "synonym", "def-syn", "synsee"),
    }.items()
    for t_name in t_names
}

42 

# Template names whose expansion is qualifier text, e.g. "{{q|informal}}".
# NOTE: the historical name is misspelled ("TEMPALTES"); it is kept because
# other code references it.  New code should use the corrected alias.
QUALIFIER_TEMPALTES = ["qualifier", "qual", "q", "qf", "i"]
QUALIFIER_TEMPLATES = QUALIFIER_TEMPALTES  # corrected-spelling alias

44 

45 

def _save_gloss_list_linkage(
    word_entry: WordEntry, linkage_type: str, l_list: list[Linkage]
) -> None:
    """Move pending linkage items into *word_entry*.

    "alt_forms" items are converted to `Form` records tagged "alternative";
    any other linkage type is appended to the matching list attribute.
    """
    if linkage_type == "alt_forms":
        for l_data in l_list:
            word_entry.forms.append(
                Form(
                    form=l_data.word,
                    sense=l_data.sense,
                    tags=l_data.tags + ["alternative"],
                    raw_tags=l_data.raw_tags,
                    roman=l_data.roman,
                )
            )
    else:
        getattr(word_entry, linkage_type).extend(l_list)


def extract_gloss_list_linkage_template(
    wxr: WiktextractContext,
    word_entry: WordEntry,
    t_node: TemplateNode,
    linkage_type: str,
    sense: str,
):
    """Parse an expanded gloss-list linkage template (e.g. "{{synonyms}}").

    The expanded HTML is scanned span by span: spans whose ``lang`` equals
    the first template argument create `Linkage` items; romanization
    ("-Latn"/"tr"), "mention-gloss" and "qualifier-content" spans annotate
    the items collected so far; a bare "," text node ends the current group
    and flushes it into *word_entry*.

    Fix over the previous version: the flush logic (alt_forms vs. other
    linkage types) was duplicated verbatim in the comma branch and at the
    end of the function; it now lives in ``_save_gloss_list_linkage``.
    """
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    l_list = []
    raw_tags = []
    for top_span_tag in expanded_node.find_html("span"):
        for node in top_span_tag.children:
            if isinstance(node, HTMLNode) and node.tag == "span":
                span_lang = node.attrs.get("lang", "")
                span_class = node.attrs.get("class", "").split()
                if span_lang == lang_code:
                    l_data = Linkage(
                        word=clean_node(wxr, None, node),
                        sense=sense,
                        raw_tags=raw_tags,
                    )
                    # "Hant"/"Hans" classes mark the Chinese script variant
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    if l_data.word != "":
                        translate_raw_tags(l_data)
                        l_list.append(l_data)
                elif span_lang == f"{lang_code}-Latn" or "tr" in span_class:
                    # romanization applies to every word in the group
                    roman = clean_node(wxr, None, node)
                    for d in l_list:
                        d.roman = roman
                elif "mention-gloss" in span_class:
                    # gloss applies to this and any following items
                    sense = clean_node(wxr, None, node)
                    for d in l_list:
                        d.sense = sense
                elif "qualifier-content" in span_class:
                    raw_tag_str = clean_node(wxr, None, node)
                    for raw_tag in raw_tag_str.split(","):
                        raw_tag = raw_tag.strip()
                        if raw_tag != "":
                            raw_tags.append(raw_tag)
            elif isinstance(node, str) and node.strip() == ",":
                # comma separates groups: flush collected items and reset
                _save_gloss_list_linkage(word_entry, linkage_type, l_list)
                l_list.clear()
                raw_tags.clear()

    _save_gloss_list_linkage(word_entry, linkage_type, l_list)

121 

122 

def extract_alt_form_section(
    wxr: WiktextractContext,
    base_data: WordEntry,
    page_data: list[WordEntry],
    level_node: LevelNode,
):
    """Collect alternative forms listed in an alternative-forms section.

    Each list item may contain "alter"/"def-alt" templates (the forms) and
    qualifier templates whose text becomes raw tags on those forms.
    """
    collected: list[Form] = []
    for list_node in level_node.find_child(NodeKind.LIST):
        for list_item in list_node.find_child(NodeKind.LIST_ITEM):
            pending_raw_tags: list[str] = []
            for child in list_item.children:
                if not isinstance(child, TemplateNode):
                    continue
                if child.template_name in ("alter", "def-alt"):
                    collected.extend(
                        extract_alter_template(wxr, child, pending_raw_tags)
                    )
                elif child.template_name in QUALIFIER_TEMPALTES:
                    pending_raw_tags.extend(
                        extract_qualifier_template(wxr, child)
                    )

    # Attach to the last page entry of the same language, else to base_data.
    if page_data and page_data[-1].lang == base_data.lang:
        page_data[-1].forms.extend(collected)
    else:
        base_data.forms.extend(collected)

149 

150 

def extract_alter_template(
    wxr: WiktextractContext, t_node: TemplateNode, raw_tags: list[str]
) -> list[Form]:
    """Expand an "alter"/"def-alt" template into "alternative" Form records."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # the first template argument is the language code of the linked forms
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    results: list[Form] = []
    for span_tag in expanded.find_html(
        "span", attr_name="lang", attr_value=lang_code
    ):
        text = clean_node(wxr, None, span_tag)
        if text == "":
            continue
        new_form = Form(form=text, tags=["alternative"], raw_tags=raw_tags)
        translate_raw_tags(new_form)
        results.append(new_form)
    return results

168 

169 

def extract_qualifier_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[str]:
    """Return the non-empty, comma-separated qualifier strings of a
    qualifier template, with surrounding parentheses stripped."""
    text = clean_node(wxr, None, t_node).strip("()")
    return [part.strip() for part in text.split(",") if part.strip() != ""]

179 

180 

def extract_linkage_section(
    wxr: WiktextractContext,
    page_data: list[WordEntry],
    level_node: LevelNode,
    linkage_type: str,
):
    """Extract one linkage section (synonyms, derived terms, idioms, ...).

    Handles column templates ("col*", "der*", "rel*", "columns", "column"),
    "der-top" frames (whose first arg sets the sense for following lists),
    "zh-dial" dialect tables, and plain wiki lists.  Collected items are
    attached to `page_data` at the end.
    """
    l_list = []
    sense = ""
    for node in level_node.children:
        if isinstance(node, TemplateNode) and (
            re.fullmatch(r"(?:col|der|rel)(?:\d+)?", node.template_name)
            or node.template_name in ["columns", "column"]
        ):
            l_list.extend(extract_col_template(wxr, node))
        elif isinstance(node, TemplateNode) and node.template_name.startswith(
            "der-top"
        ):
            # "der-top" opens a collapsible frame; its first argument is a
            # sense string applying to the lists that follow
            sense = clean_node(wxr, None, node.template_parameters.get(1, ""))
        elif isinstance(node, TemplateNode) and node.template_name in [
            "zh-dial",
            "zho-dial",
        ]:
            l_list.extend(extract_zh_dial_template(wxr, node, sense))
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LIST:
            for list_item in node.find_child(NodeKind.LIST_ITEM):
                # idiom list items with a bold headword get special parsing;
                # everything else goes through the generic list-item parser
                l_list.extend(
                    extract_idiom_list_item(wxr, list_item)
                    if linkage_type == "idioms"
                    and list_item.contain_node(NodeKind.BOLD)
                    else extract_linkage_list_item(wxr, list_item, sense)
                )
    if linkage_type == "idioms":
        # idiom results are stored on the "related" field
        # NOTE(review): presumably WordEntry has no "idioms" attribute — confirm
        linkage_type = "related"
    if level_node.kind == NodeKind.LEVEL3:
        # a level-3 section applies to every entry of the same language
        for data in page_data:
            if data.lang_code == page_data[-1].lang_code:
                getattr(data, linkage_type).extend(l_list)
                for l_data in l_list:
                    data.categories.extend(l_data.categories)
    elif len(page_data) > 0:
        getattr(page_data[-1], linkage_type).extend(l_list)
        for l_data in l_list:
            page_data[-1].categories.extend(l_data.categories)

224 

225 

def extract_col_template(
    wxr: WiktextractContext, t_node: TemplateNode
) -> list[Linkage]:
    """Extract linkage items from an expanded column template.

    Each HTML <li> holds one term: word span(s) in the target language,
    optionally a romanization span ("*-Latn") and a quoted translation in
    the plain-text children.
    """
    l_list = []
    expanded_template = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    # first template argument is the language code of the linked terms
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    for li_tag in expanded_template.find_html_recursively("li"):
        first_word = True  # first word span in this <li> is the headword
        translation = ""
        # plain-text children may carry a translation between curly quotes
        for node in li_tag.children:
            if isinstance(node, str):
                m = re.search(r"“(.+)”", node)
                if m is not None:
                    translation = m.group(1).strip()
        for span_tag in li_tag.find_html("span"):
            span_lang = span_tag.attrs.get("lang", "")
            span_class = span_tag.attrs.get("class", "")
            if span_lang.endswith("-Latn") and len(l_list) > 0:
                # romanization annotates the most recently added word
                l_list[-1].roman = clean_node(wxr, None, span_tag)
            elif span_lang == lang_code:
                if lang_code == "zh":
                    # Chinese: traditional and simplified forms become
                    # separate linkage entries
                    l_data = Linkage(word=clean_node(wxr, None, span_tag))
                    if "Hant" in span_class:
                        l_data.tags.append("Traditional-Chinese")
                    elif "Hans" in span_class:
                        l_data.tags.append("Simplified-Chinese")
                    l_list.append(l_data)
                elif not first_word:
                    # a second word span is an alternative written form
                    l_list[-1].other = clean_node(wxr, None, span_tag)
                else:
                    l_list.append(
                        Linkage(
                            word=clean_node(wxr, None, span_tag),
                            translation=translation,
                        )
                    )
                # NOTE(review): assumed the flag flips only after a word
                # span in the target language — confirm against upstream
                first_word = False

    return l_list

267 

268 

def extract_linkage_list_item(
    wxr: WiktextractContext, list_item: WikiNode, sense: str
) -> list[Linkage]:
    """Parse one generic linkage list item.

    Dispatches on the templates found in the item ("sense", "l", qualifier,
    "zh-l", "ja-r", "vi-l", anagram templates), plain wiki links, and a
    trailing "- gloss" text suffix that sets the sense of the whole line.
    """
    l_list = []
    raw_tags = []
    for index, node in enumerate(list_item.children):
        if isinstance(node, TemplateNode):
            if node.template_name in ["sense", "s"]:
                # sense template applies to the items that follow it
                sense = clean_node(wxr, None, node).strip("(): ")
            elif node.template_name in ["l", "link"]:
                l_list.extend(extract_link_template(wxr, node, sense))
            elif node.template_name in ["qualifier", "qual"]:
                # qualifier text is held until the next word template
                raw_tags.append(clean_node(wxr, None, node).strip("()"))
            elif node.template_name in ["zh-l", "zho-l"]:
                l_list.extend(extract_zh_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["ja-r", "jpn-r"]:
                l_list.append(extract_ja_r_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["vi-l", "vie-l"]:
                l_list.append(extract_vi_l_template(wxr, node, sense, raw_tags))
                raw_tags.clear()
            elif node.template_name in ["anagrams", "Anagrams", "đảo chữ"]:
                l_list.extend(
                    extract_anagrams_template(wxr, node, sense, raw_tags)
                )
                raw_tags.clear()
        elif isinstance(node, WikiNode) and node.kind == NodeKind.LINK:
            word = clean_node(wxr, None, node)
            if word != "":
                l_data = Linkage(word=word, sense=sense, raw_tags=raw_tags)
                translate_raw_tags(l_data)
                l_list.append(l_data)
        elif (
            isinstance(node, str)
            and node.strip().startswith("-")
            and len(l_list) > 0
        ):
            # trailing "- gloss": the rest of the line is the sense of the
            # last item; stop scanning
            l_list[-1].sense = clean_node(
                wxr, None, list_item.children[index:]
            ).strip("- \n")
            break
    # qualifier templates that appeared after the last word attach to it
    if len(raw_tags) > 0 and len(l_list) > 0:
        l_list[-1].raw_tags.extend(raw_tags)
        translate_raw_tags(l_list[-1])
    return l_list

315 

316 

def extract_link_template(
    wxr: WiktextractContext, t_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Expand an "{{l}}"/"{{link}}" template into linkage records, one per
    word span whose "lang" attribute matches the first template argument."""
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    lang_code = clean_node(wxr, None, t_node.template_parameters.get(1, ""))
    return [
        Linkage(word=clean_node(wxr, None, span_tag), sense=sense)
        for span_tag in expanded.find_html("span")
        if span_tag.attrs.get("lang", "") == lang_code
    ]

333 

334 

def extract_idiom_list_item(
    wxr: WiktextractContext, list_item: WikiNode
) -> list[Linkage]:
    """Parse an idiom list item: bold word(s) followed by gloss text or a
    nested list of senses."""
    idioms: list[Linkage] = []
    last_bold_at = 0
    trailing_nodes = []
    for position, child in enumerate(list_item.children):
        is_wikinode = isinstance(child, WikiNode)
        if is_wikinode and child.kind == NodeKind.BOLD:
            bold_text = clean_node(wxr, None, child)
            if bold_text != "":
                last_bold_at = position
                idioms.append(Linkage(word=bold_text, tags=["idiomatic"]))
        elif is_wikinode and child.kind == NodeKind.LIST:
            # nested list: each item is one sense of the last idiom
            for sub_item in child.find_child(NodeKind.LIST_ITEM):
                sub_sense = clean_node(wxr, None, sub_item.children)
                if sub_sense != "" and idioms:
                    idioms[-1].senses.append(sub_sense)
        elif position > last_bold_at:
            # plain text after the bold word is an inline gloss
            trailing_nodes.append(child)

    trailing_sense = clean_node(wxr, None, trailing_nodes).strip(": ")
    if trailing_sense != "" and idioms:
        idioms[-1].sense = trailing_sense

    return idioms

360 

361 

def extract_zh_l_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> list[Linkage]:
    """Parse an expanded "{{zh-l}}" template.

    Produces one `Linkage` per Chinese word span ("zh-Hant"/"zh-Hans" lang
    attrs select the script tag); a "Latn" span supplies the romanization
    shared by all results.  Separator-only spans ("/") are dropped.

    Fix: the old signature used a mutable default (``raw_tags: list[str]
    = []``), which is a single list shared across calls; a ``None``
    sentinel avoids that pitfall with identical behavior for callers.
    """
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    roman = ""
    linkage_list = []
    for i_tag in expanded_node.find_html_recursively(
        "span", attr_name="class", attr_value="Latn"
    ):
        roman = clean_node(wxr, None, i_tag)
    for span_tag in expanded_node.find_html(
        "span", attr_name="lang", attr_value="zh"
    ):
        linkage_data = Linkage(
            sense=sense,
            raw_tags=raw_tags,
            roman=roman,
            word=clean_node(wxr, None, span_tag),
        )
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "zh-Hant":
            linkage_data.tags.append("Traditional-Chinese")
        elif lang_attr == "zh-Hans":
            linkage_data.tags.append("Simplified-Chinese")
        # skip empty spans and the "/" separator between script forms
        if len(linkage_data.word) > 0 and linkage_data.word != "/":
            translate_raw_tags(linkage_data)
            linkage_list.append(linkage_data)
    return linkage_list

395 

396 

def extract_ja_r_template(
    wxr: WiktextractContext,
    template_node: TemplateNode,
    sense: str,
    raw_tags: list[str] | None = None,
) -> Linkage:
    """Parse an expanded "{{ja-r}}" template into a single linkage item.

    The word span (any span with a "lang" attribute) may contain ruby
    annotations, which are split out via `extract_ruby`; a "tr" span gives
    the romanization and a "mention-gloss" span overrides the sense.

    Fix: the old signature used a mutable default (``raw_tags: list[str]
    = []``), a single list shared across calls; a ``None`` sentinel avoids
    that pitfall with identical behavior for callers.
    """
    if raw_tags is None:
        raw_tags = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    linkage_data = Linkage(word="", sense=sense, raw_tags=raw_tags)
    for span_node in expanded_node.find_html("span"):
        span_class = span_node.attrs.get("class", "").split()
        if "lang" in span_node.attrs:
            ruby_data, no_ruby_nodes = extract_ruby(wxr, span_node)
            linkage_data.word = clean_node(wxr, None, no_ruby_nodes)
            linkage_data.ruby = ruby_data
        elif "tr" in span_class:
            linkage_data.roman = clean_node(wxr, None, span_node)
        elif "mention-gloss" in span_class:
            linkage_data.sense = clean_node(wxr, None, span_node)

    translate_raw_tags(linkage_data)
    return linkage_data

420 

421 

def extract_vi_l_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> Linkage:
    """Expand a "{{vi-l}}" template: the Vietnamese word span plus an
    optional "vi-Latn" romanization span; category links feed categories."""
    linkage = Linkage(word="", sense=sense, raw_tags=raw_tags)
    expanded = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(t_node), expand_all=True
    )
    for span_tag in expanded.find_html("span"):
        lang_attr = span_tag.attrs.get("lang", "")
        if lang_attr == "vi":
            linkage.word = clean_node(wxr, None, span_tag)
        elif lang_attr == "vi-Latn":
            linkage.roman = clean_node(wxr, None, span_tag)
    # category links produced by the template expansion
    for link_node in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, linkage, link_node)
    return linkage

442 

443 

def extract_anagrams_template(
    wxr: WiktextractContext,
    t_node: TemplateNode,
    sense: str,
    raw_tags: list[str],
) -> list[Linkage]:
    """Read anagram words from positional template args 2, 3, ... stopping
    at the first missing index; empty values are skipped."""
    results: list[Linkage] = []
    arg_index = 2
    while arg_index in t_node.template_parameters:
        word = clean_node(wxr, None, t_node.template_parameters[arg_index])
        if word != "":
            entry = Linkage(word=word, sense=sense, raw_tags=raw_tags)
            translate_raw_tags(entry)
            results.append(entry)
        arg_index += 1

    return results

461 

462 

def extract_zh_dial_template(
    wxr: WiktextractContext, template_node: TemplateNode, sense: str
) -> list[Linkage]:
    """Extract dialect synonyms from an expanded "zh-dial" table.

    Two passes over each table: the first collects the footnote legend
    (rows under a "Ghi chú" header, "symbol - note" pairs separated by
    ";"); the second walks data rows, turning header cells into dialect-
    group tags, link cells into region tags, and "zh" word spans into
    `Linkage` items.  Small-print spans after a word reference the legend
    symbols and attach the corresponding notes as raw tags.
    """
    from .sound import split_zh_pron_raw_tag

    linkage_list = []
    expanded_node = wxr.wtp.parse(
        wxr.wtp.node_to_wikitext(template_node), expand_all=True
    )
    for table_node in expanded_node.find_child_recursively(NodeKind.TABLE):
        is_note_row = False
        note_tags = {}
        # first pass: build the "symbol -> note" legend ("Ghi chú" = "Note")
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            for cell_node in row_node.find_child(
                NodeKind.TABLE_HEADER_CELL | NodeKind.TABLE_CELL
            ):
                if cell_node.kind == NodeKind.TABLE_HEADER_CELL:
                    is_note_row = clean_node(wxr, None, cell_node) == "Ghi chú"
                elif is_note_row:
                    for note_str in clean_node(wxr, None, cell_node).split(";"):
                        if "-" in note_str:
                            note_symbol, note = note_str.split("-", maxsplit=1)
                            note_symbol = note_symbol.strip()
                            note = note.strip()
                            if note_symbol != "" and note != "":
                                note_tags[note_symbol] = note
        lang_tags = []
        region_tags = []
        # second pass: extract the dialect words themselves
        for row_node in table_node.find_child(NodeKind.TABLE_ROW):
            if not row_node.contain_node(NodeKind.TABLE_CELL):
                continue  # skip header row
            for header_node in row_node.find_child(NodeKind.TABLE_HEADER_CELL):
                # row header names the dialect group for this row
                lang_tags = split_zh_pron_raw_tag(
                    clean_node(wxr, None, header_node)
                )
            if lang_tags == ["Ghi chú"]:  # skip last note row
                continue
            for cell_node in row_node.find_child(NodeKind.TABLE_CELL):
                for link_node in cell_node.find_child(NodeKind.LINK):
                    # linked location name gives the region tags
                    region_tags = split_zh_pron_raw_tag(
                        clean_node(wxr, None, link_node)
                    )
                for span_tag in cell_node.find_html("span"):
                    span_text = clean_node(wxr, None, span_tag)
                    if span_text == "":
                        continue
                    if (
                        span_tag.attrs.get("lang", "") == "zh"
                        and span_text != wxr.wtp.title
                    ):
                        # a dialect word (skip the page's own title)
                        l_data = Linkage(word=span_text, sense=sense)
                        if len(lang_tags) > 0:
                            l_data.raw_tags.extend(lang_tags)
                        if len(region_tags) > 0:
                            l_data.raw_tags.extend(region_tags)
                        translate_raw_tags(l_data)
                        linkage_list.append(l_data)
                    elif (
                        span_tag.attrs.get("style", "") == "font-size:60%"
                        and len(linkage_list) > 0
                    ):
                        # small-print symbols reference the note legend;
                        # unknown symbols are kept verbatim as raw tags
                        for note_symbol in span_text.split(","):
                            note_symbol = note_symbol.strip()
                            raw_tag = note_symbol
                            if note_symbol in note_tags:
                                raw_tag = note_tags[note_symbol]
                            if raw_tag != "":
                                linkage_list[-1].raw_tags.append(raw_tag)
                                translate_raw_tags(linkage_list[-1])

    return linkage_list