Coverage for src / wiktextract / extractor / vi / etymology.py: 33%

50 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-12-05 07:46 +0000

1from wikitextprocessor import ( 

2 HTMLNode, 

3 LevelNode, 

4 NodeKind, 

5 TemplateNode, 

6 WikiNode, 

7) 

8 

9from ...page import clean_node 

10from ...wxr_context import WiktextractContext 

11from .models import Form, WordEntry 

12from .tags import translate_raw_tags 

13 

14 

def extract_etymology_section(
    wxr: WiktextractContext, base_data: WordEntry, level_node: LevelNode
):
    """Fill ``base_data.etymology_texts`` from an etymology section.

    The direct children of ``level_node`` are visited in order:

    * a template named ``ja-kt`` or ending in ``-kanjitab`` is passed to
      ``extract_ja_kanjitab_template``;
    * a list node contributes one text per list item, skipping items whose
      cleaned text is empty;
    * a nested ``LevelNode`` ends the scan;
    * any other node is set aside, and all such nodes are cleaned together
      as one text after the scan.

    Because the set-aside nodes are handled last, their text is appended
    after any list-item texts regardless of where they sat in the section.
    """
    pending_nodes = []
    for child in level_node.children:
        if isinstance(child, TemplateNode) and (
            child.template_name == "ja-kt"
            or child.template_name.endswith("-kanjitab")
        ):
            extract_ja_kanjitab_template(wxr, child, base_data)
            continue

        if isinstance(child, WikiNode) and child.kind == NodeKind.LIST:
            for item in child.find_child(NodeKind.LIST_ITEM):
                item_text = clean_node(wxr, base_data, item.children)
                if item_text == "":
                    continue
                base_data.etymology_texts.append(item_text)
            continue

        if isinstance(child, LevelNode):
            # A nested section starts here; nothing after it is scanned.
            break

        pending_nodes.append(child)

    if not pending_nodes:
        return
    section_text = clean_node(wxr, base_data, pending_nodes)
    if section_text != "":
        base_data.etymology_texts.append(section_text)

39 

40 

def extract_ja_kanjitab_template(
    wxr: WiktextractContext, t_node: TemplateNode, base_data: WordEntry
):
    """Extract alternative kanji forms from an expanded kanjitab template.

    ``t_node`` is converted back to wikitext and re-parsed with
    ``expand_all=True``. Among the tables that are direct children of the
    result, only those with a header cell whose cleaned text equals
    "Cách viết khác" (Vietnamese; roughly "alternative spelling") are read.
    In such a table, every ``<span>`` that is a direct child of a data cell
    and has non-empty text becomes a ``Form`` tagged
    ``["alternative", "kanji"]``. Every ``<small>`` child adds its text,
    with surrounding parentheses stripped, as a raw tag on the most
    recently collected form. The forms are appended to ``base_data.forms``.
    """
    template_wikitext = wxr.wtp.node_to_wikitext(t_node)
    expanded = wxr.wtp.parse(template_wikitext, expand_all=True)

    for table in expanded.find_child(NodeKind.TABLE):
        # Every header cell is cleaned, even after a match was found.
        has_alt_header = False
        for header_row in table.find_child(NodeKind.TABLE_ROW):
            for header_cell in header_row.find_child(
                NodeKind.TABLE_HEADER_CELL
            ):
                if clean_node(wxr, None, header_cell) == "Cách viết khác":
                    has_alt_header = True
        if not has_alt_header:
            continue

        alt_forms = []
        for data_row in table.find_child(NodeKind.TABLE_ROW):
            for cell in data_row.find_child(NodeKind.TABLE_CELL):
                for child in cell.children:
                    if not isinstance(child, HTMLNode):
                        continue
                    if child.tag == "span":
                        word = clean_node(wxr, None, child)
                        if word != "":
                            alt_form = Form(
                                form=word, tags=["alternative", "kanji"]
                            )
                            alt_forms.append(alt_form)
                    elif child.tag == "small":
                        cleaned_small = clean_node(wxr, None, child)
                        raw_tag = cleaned_small.strip("()")
                        # A note with no preceding form is dropped.
                        if raw_tag == "" or not alt_forms:
                            continue
                        latest_form = alt_forms[-1]
                        latest_form.raw_tags.append(raw_tag)
                        translate_raw_tags(latest_form)
        base_data.forms.extend(alt_forms)

    # The cleaned text is discarded; presumably the call is made so that
    # link side effects (e.g. categories) are recorded on base_data.
    # TODO confirm against clean_node.
    for link in expanded.find_child(NodeKind.LINK):
        clean_node(wxr, base_data, link)